tuulos / disco
- Source
- Commits
- Network (33)
- Issues (24)
- Downloads (8)
- Wiki (2)
- Graphs
-
Branch:
master
Ville Tuulos (author)
Sun Oct 04 22:10:52 -0700 2009
commit 4ebdbf4ab738dfdeba8e0a1245aed709f0eec227
tree d1aaf28bd7550dea44f7b12360dec2893a292fb0
parent 54eaa9ffeed3b3bd2690acb69d1fbcaa92646f52
tree d1aaf28bd7550dea44f7b12360dec2893a292fb0
parent 54eaa9ffeed3b3bd2690acb69d1fbcaa92646f52
| b2f159bc » | Taneli Mielikäinen | 2008-09-29 | 1 | import disco | |
| 2 | |||||
def estimate_map(e, params):
    """Emit the count records for one input entry.

    ``e[1]`` is split on ``params.splitter`` and de-duplicated.  Tokens that
    are members of ``params.ys`` are treated as class labels, every other
    token as an item.

    Returns a list of ``(key, 1)`` pairs made of, in this order:

    * one ``label + splitter + item`` pair per label/item combination,
    * one pair per distinct token (labels and items alike),
    * a single ``('', 1)`` pair.
    """
    sep = params.splitter
    # A dict keeps exactly one entry per distinct token.
    tokens = dict.fromkeys(e[1].split(sep), 1)

    labels = []
    features = []
    for tok in tokens:
        if tok in params.ys:
            labels.append(tok)
        else:
            features.append(tok)

    records = []
    for feat in features:
        for lab in labels:
            records.append((lab + sep + feat, 1))
    for tok in tokens:
        records.append((tok, 1))
    records.append(('', 1))
    return records
    # Alternative emission that the author left commented out:
    #[(b+params.splitter,1) for b in y] + [(params.splitter+a,1) for a in x] + [(params.splitter,1)]
| 10 | |||||
| 11 | |||||
def estimate_combiner(k, v, counts, done, params):
    """Accumulate ``v`` under key ``k`` in the ``counts`` dict.

    While ``done`` is false the function adds ``v`` to ``counts[k]``
    (creating the entry when it is missing) and returns ``None``.  When
    ``done`` is true, ``k`` and ``v`` are ignored and the accumulated
    ``(key, total)`` pairs are returned via ``counts.items()``.

    ``params`` is accepted but not used.
    """
    if done:
        return counts.items()

    # ``in`` replaces the deprecated dict.has_key(), which Python 3 removed.
    if k in counts:
        counts[k] += v
    else:
        counts[k] = v
| 17 | |||||
| 18 | |||||
def estimate_reduce(iter, out, params):
    """Sum the integer values of every key and write the totals to ``out``.

    ``iter`` yields ``(key, value)`` pairs whose values are converted with
    ``int()``.  After the input is exhausted, ``out.add(key, repr(total))``
    is called once per distinct key.

    ``params`` is accepted but not used.
    """
    counts = {}

    for k, v in iter:
        # dict.get() replaces the has_key() test, which Python 3 removed.
        counts[k] = counts.get(k, 0) + int(v)

    # items() is available on both Python 2 and 3, unlike iteritems().
    for k, v in counts.items():
        out.add(k, repr(v))
| 28 | |||||
| 29 | |||||
def predict_map(e, params):
    """Score one input entry against every class label.

    Each label in ``params.ys`` starts from ``params.loglikelihoods[label]``.
    ``e[1]`` is then split on ``params.splitter``; tokens that are themselves
    labels are skipped, and for every other token (repeated tokens are
    counted each time they occur) the value stored under
    ``label + splitter + token`` is added to that label's score.

    Returns one ``(e[0], label + ' ' + repr(score))`` pair per label.

    Raises ``KeyError`` if a label, or a label/token combination met in the
    entry, is missing from ``params.loglikelihoods``.
    """
    sep = params.splitter
    table = params.loglikelihoods
    scores = dict((label, table[label]) for label in params.ys)

    for elem in e[1].split(sep):
        # ``in`` replaces the deprecated dict.has_key(), which Python 3 removed.
        if elem in params.ys:
            continue

        for label in params.ys:
            scores[label] += table[label + sep + elem]

    return [(e[0], label + ' ' + repr(scores[label])) for label in params.ys]
| 40 | |||||
| 41 | |||||
def estimate(input, ys, splitter=' ', host="disco://localhost", map_reader=disco.chain_reader):
    """Run the counting job and derive log-likelihood terms from its output.

    A ``naive_bayes_estimate`` job is submitted through ``disco.job`` using
    ``estimate_map``, ``estimate_combiner`` and ``estimate_reduce``.  Its
    results are read back with ``disco.result_iterator`` and sorted into:

    * ``total``   -- value of the ``''`` key,
    * ``classes`` -- counts of keys that are members of ``ys``,
    * ``items``   -- counts of all other single-token keys,
    * ``pairs``   -- counts of keys that contain ``splitter``.

    For every label/item combination a 2x2 table
    ``[label & item, item only, label only, neither]`` is built and 1 is
    added to each cell as a pseudocount.

    Returns a dict with two kinds of keys, where ``c0`` and ``c2`` are the
    smoothed "label & item" and "label only" cells:

    * ``label + splitter + item`` -> ``log(c0) - log(c2)``
    * ``label`` -> sum over all items of ``log(c2) - log(c0 + c2)``

    The label is recovered as the first ``splitter``-separated field of the
    pair key, so labels are assumed not to contain ``splitter``.
    """
    import math

    # Membership dict for the class labels (same shape the workers receive).
    ys = dict.fromkeys(ys, 1)

    results = disco.job(host, name='naive_bayes_estimate',
                        input_files=input,
                        map_reader=map_reader,
                        fun_map=estimate_map,
                        combiner=estimate_combiner,
                        reduce=estimate_reduce,
                        params=disco.Params(ys=ys, splitter=splitter),
                        sort=False, clean=False)

    total = 0
    items = {}
    classes = {}
    pairs = {}
    for key, value in disco.result_iterator(results):
        fields = key.split(splitter)
        value = int(value)
        if len(fields) == 1:
            name = fields[0]
            if name == '':
                total = value
            elif name in ys:
                classes[name] = value
            else:
                items[name] = value
        else:
            pairs[key] = value

    # counts[key] = [[c, i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            both = pairs.get(key, 0)
            item_only = items[i] - both
            # A label that never occurred contributes an empty "label only" cell.
            if y in classes:
                class_only = classes[y] - both
            else:
                class_only = 0
            neither = total - (both + item_only + class_only)

            # Add pseudocounts.  A list (not a lazy map object) is required
            # because the cells are indexed below.
            counts[key] = [cell + 1 for cell in (both, item_only, class_only, neither)]
    # NOTE(review): neither ``total`` nor cells 1 and 3 are read below, so
    # this adjustment does not influence the returned value.
    total += 4

    loglikelihoods = {}
    for key, value in counts.items():
        log_c = math.log(value[0] + value[2])
        label = key.split(splitter)[0]
        if label not in loglikelihoods:
            loglikelihoods[label] = 0.0
        loglikelihoods[label] += math.log(value[2]) - log_c
        loglikelihoods[key] = math.log(value[0]) - math.log(value[2])

    return loglikelihoods
| 94 | |||||
| 95 | |||||
def predict(input, loglikelihoods, ys, splitter, host="disco://localhost", map_reader=disco.chain_reader):
    """Submit the ``naive_bayes_predict`` job and return its result.

    ``ys`` is turned into a membership dict and shipped, together with
    ``loglikelihoods`` and ``splitter``, to ``predict_map`` through
    ``disco.Params``.  The job has a map phase only.

    Returns whatever ``disco.job`` returns for the submitted job.
    """
    labels = dict.fromkeys(ys, 1)
    job_params = disco.Params(loglikelihoods=loglikelihoods,
                              ys=labels,
                              splitter=splitter)
    return disco.job(host, name='naive_bayes_predict',
                     input_files=input,
                     map_reader=map_reader,
                     fun_map=predict_map,
                     params=job_params,
                     sort=False, clean=False)
