tuulos / disco

a Map/Reduce framework for distributed computing

This URL has Read+Write access

Ville Tuulos (author)
Sun Oct 04 22:10:52 -0700 2009
commit  4ebdbf4ab738dfdeba8e0a1245aed709f0eec227
tree    d1aaf28bd7550dea44f7b12360dec2893a292fb0
parent  54eaa9ffeed3b3bd2690acb69d1fbcaa92646f52
disco / examples / datamining / naive_bayes.py
b2f159bc » Taneli Mielikäinen 2008-09-29 perceptron linear classifie... 1 import disco
2
def estimate_map(e, params):
    """Map one record to the count keys used for naive Bayes estimation.

    e is an indexable record whose second element, e[1], is a string of
    tokens separated by params.splitter.  params.ys is the collection of
    class labels; it is only membership-tested here.

    Each distinct token of the record is considered once, so every emitted
    count is a per-record presence count.  Returns a list of (key, 1) pairs:

    * label + splitter + feature, for every label/feature combination
      present in the record,
    * every distinct token on its own (labels and features alike),
    * the empty key '' exactly once, which the estimate() driver reads as
      the record total.
    """
    # Drop empty tokens (two consecutive splitters, or an empty record).
    # The key '' is reserved for the per-record total emitted below, so an
    # empty token would otherwise be added to that total and would also
    # generate meaningless "label + splitter" pair keys.
    # dict.fromkeys de-duplicates the remaining tokens.
    tokens = list(dict.fromkeys(
        [tok for tok in e[1].split(params.splitter) if tok]))

    features = [tok for tok in tokens if tok not in params.ys]
    labels = [tok for tok in tokens if tok in params.ys]

    pair_keys = [(label + params.splitter + feat, 1)
                 for feat in features for label in labels]
    return pair_keys + [(tok, 1) for tok in tokens] + [('', 1)]
9 #[(b+params.splitter,1) for b in y] + [(params.splitter+a,1) for a in x] + [(params.splitter,1)]
10
11
def estimate_combiner(k, v, counts, done, params):
    """Accumulate map output into running per-key totals.

    counts is the dict that carries state between calls.

    * While done is false, v is added to the total stored under k (the
      entry is created on first sight) and nothing is returned.
    * When done is true, k and v are ignored and the accumulated
      (key, total) pairs are returned as a list.

    params is accepted for interface compatibility and is not used.
    """
    if done:
        # list(...) gives a concrete list on both Python 2 and Python 3.
        return list(counts.items())

    # "in" replaces dict.has_key(), which no longer exists in Python 3.
    if k in counts:
        counts[k] += v
    else:
        counts[k] = v
17
18
def estimate_reduce(iter, out, params):
    """Sum the counts of every key and write the totals to out.

    iter yields (key, value) pairs; each value is converted with int()
    before being added.  For every distinct key, out.add(key, repr(total))
    is called once.  params is accepted for interface compatibility and is
    not used.
    """
    totals = {}

    for key, value in iter:
        # "in"/get replace dict.has_key(), which is gone in Python 3.
        totals[key] = totals.get(key, 0) + int(value)

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for key, total in totals.items():
        out.add(key, repr(total))
28
29
def predict_map(e, params):
    """Score one record against every class label.

    e is an indexable record: e[0] is passed through as the output key and
    e[1] is a string of tokens separated by params.splitter.
    params.loglikelihoods maps each label to its base score and each
    label + splitter + token key to the amount added when that token is
    present.  params.ys is the collection of class labels.

    Returns one (e[0], "<label> <score>") pair per label, where the score
    is formatted with repr().
    """
    splitter = params.splitter
    model = params.loglikelihoods

    # Every label starts from its base score.
    scores = dict([(label, model[label]) for label in params.ys])

    for token in e[1].split(splitter):
        # Class labels occurring in the record are not features.
        if token in params.ys:
            continue

        for label in params.ys:
            # A token without a trained entry (unseen feature, or an empty
            # token from consecutive splitters) contributes nothing instead
            # of aborting the whole map call with a KeyError.
            scores[label] += model.get(label + splitter + token, 0.0)

    # NOTE(review): a token repeated in the record is added once per
    # occurrence, whereas estimate_map counts each distinct token once per
    # record - confirm that this asymmetry is intended.
    return [(e[0], label + ' ' + repr(scores[label])) for label in params.ys]
40
41
def _tally_estimate_results(results_iter, ys, splitter):
    """Sort (key, count) job output into total, item, class and pair counts."""
    total = 0
    items = {}
    classes = {}
    pairs = {}
    for key, value in results_iter:
        value = int(value)
        if len(key.split(splitter)) > 1:
            # label + splitter + item co-occurrence count
            pairs[key] = value
        elif key == '':
            # the empty key carries the number of records
            total = value
        elif key in ys:
            classes[key] = value
        else:
            items[key] = value
    return total, items, classes, pairs


def _loglikelihoods_from_counts(total, items, classes, pairs, ys, splitter):
    """Turn the tallied counts into the table consumed by predict_map."""
    import math

    loglikelihoods = {}
    for item in items:
        for y in ys:
            key = y + splitter + item
            both = pairs.get(key, 0)
            if y in classes:
                class_only = classes[y] - both
            else:
                class_only = 0

            # table = [c & i, not c & i, c & not i, not c & not i]
            table = [both, items[item] - both, class_only, 0]
            table[3] = total - sum(table[:3])

            # Add a pseudocount of 1 to every cell.  A list comprehension is
            # used because map() returns a non-indexable iterator on
            # Python 3.
            table = [cell + 1 for cell in table]

            # Only the two "in class" cells enter the returned values.
            log_class = math.log(table[0] + table[2])
            # Base score of the label: accumulates, over all items,
            # log(c & not i) - log(c).
            loglikelihoods[y] = (loglikelihoods.get(y, 0.0)
                                 + (math.log(table[2]) - log_class))
            # Amount added for a present item: log(c & i) - log(c & not i).
            loglikelihoods[key] = math.log(table[0]) - math.log(table[2])

    return loglikelihoods


def estimate(input, ys, splitter=' ', host="disco://localhost", map_reader=disco.chain_reader):
    """Estimate naive Bayes log-likelihoods by running a counting job.

    input, host and map_reader are handed to disco.job unchanged.  ys is an
    iterable of class labels and splitter is the token separator; both are
    passed to the map function through disco.Params.

    The job 'naive_bayes_estimate' is run with estimate_map,
    estimate_combiner and estimate_reduce, and its results are read back
    with disco.result_iterator.

    Returns a dict with two kinds of keys:

    * label -> sum over all items of log(c & not i) - log(c),
    * label + splitter + item -> log(c & i) - log(c & not i),

    where the counts come from the per-(label, item) contingency table with
    a pseudocount of 1 added to every cell.
    """
    # Labels are kept as dict keys, as the map functions expect.
    ys = dict.fromkeys(ys, 1)

    results = disco.job(host, name='naive_bayes_estimate',
                        input_files=input,
                        map_reader=map_reader,
                        fun_map=estimate_map,
                        combiner=estimate_combiner,
                        reduce=estimate_reduce,
                        params=disco.Params(ys=ys, splitter=splitter),
                        sort=False, clean=False)

    total, items, classes, pairs = _tally_estimate_results(
        disco.result_iterator(results), ys, splitter)
    return _loglikelihoods_from_counts(
        total, items, classes, pairs, ys, splitter)
94
95
def predict(input, loglikelihoods, ys, splitter, host="disco://localhost", map_reader=disco.chain_reader):
    """Launch the 'naive_bayes_predict' job and return what disco.job returns.

    input, host and map_reader are handed to disco.job unchanged.
    loglikelihoods, the class labels ys (converted to a dict keyed by label)
    and splitter are delivered to predict_map through disco.Params.
    """
    label_index = dict.fromkeys(ys, 1)
    job_params = disco.Params(loglikelihoods=loglikelihoods,
                              ys=label_index,
                              splitter=splitter)
    return disco.job(host, name='naive_bayes_predict',
                     input_files=input,
                     map_reader=map_reader,
                     fun_map=predict_map,
                     params=job_params,
                     sort=False, clean=False)