public
Description: a Map/Reduce framework for distributed computing
Homepage: http://discoproject.org
Clone URL: git://github.com/tuulos/disco.git
disco / examples / datamining / kmeans.py
b2f159bc » Taneli Mielikäinen 2008-09-29 perceptron linear classifie... 1 import disco
2
def init_map(e, params):
    """Assign an input entry to a randomly chosen cluster.

    Returns a one-element list holding ``(cluster, e[1])``, where ``cluster``
    is drawn by ``random.randint`` from ``0`` to ``params.k - 1`` inclusive.
    """
    # Kept as a function-local import, as in the original; presumably so the
    # function is self-contained when run on a worker -- confirm.
    import random

    cluster = random.randint(0, params.k - 1)
    value = e[1]
    return [(cluster, value)]
6
7
def estimate_map(e, params):
    """Label a point with the index of its nearest center.

    ``e[1]`` is a string of space-separated numbers. It is compared against
    every entry of ``params.centers`` using ``params.dist(center, point)``.

    Returns a one-element list ``[(nearest_index, e[1])]``; the value is the
    original, unparsed string. Equal distances resolve to the lowest index
    because ``(distance, index)`` tuples are compared.
    """
    # Parse the point once (the original re-parsed it for every center) and
    # keep it as a real list so index-based distance functions work on both
    # Python 2 and Python 3.
    point = [float(field) for field in e[1].split(' ')]
    nearest = min([(params.dist(c, point), i)
                   for (i, c) in enumerate(params.centers)])[1]
    return [(nearest, e[1])]
10
11
def estimate_combiner(k, v, centers, done, params):
    """Accumulate per-key coordinate sums and point counts in ``centers``.

    While ``done`` is false, ``v`` (space-separated numbers) is parsed and
    added component-wise into ``centers[k]``. That list has one slot per
    coordinate followed by a final slot counting the points added so far.
    Nothing is returned in this case.

    When ``done`` is true, ``k`` and ``v`` are ignored and the accumulated
    state is returned as a list of ``(key, 'sum_1 ... sum_n count')`` pairs,
    each number rendered with ``repr``.
    """
    if done:
        return [(i, ' '.join(map(repr, c))) for (i, c) in centers.items()]

    # A real list (not a lazy map object) is required for len() on Python 3.
    point = [float(field) for field in v.split(' ')]
    if k not in centers:
        # Coordinate sums start at 0.0; the trailing integer is the count.
        centers[k] = [0.0] * len(point) + [0]

    sums = centers[k]
    for i, coord in enumerate(point):
        sums[i] += coord
    sums[len(point)] += 1
21
22
def estimate_reduce(iter, out, params):
    """Merge partial sums per key and emit the resulting means.

    ``iter`` yields ``(key, value)`` pairs where ``value`` is a string of
    space-separated numbers whose last field is a point count (the layout
    produced by ``estimate_combiner``). Values sharing a key are summed
    component-wise, counts included.

    For every key, each field except the last is divided by the last field
    and the result is written with ``out.add(key, 'mean_1 ... mean_n count')``,
    each number rendered with ``repr``.
    """
    totals = {}
    for k, v in iter:
        partial = [float(field) for field in v.split(' ')]
        if k not in totals:
            totals[k] = partial
        else:
            # Iterate over positions. The original looped over the float
            # values themselves and used them as list indices, which fails
            # as soon as a key receives a second partial sum.
            acc = totals[k]
            for i, amount in enumerate(partial):
                acc[i] += amount

    for k, v in totals.items():
        # The last slot is the count; leave it untouched, normalise the rest.
        for i in range(len(v) - 1):
            v[i] /= v[-1]
        out.add(k, ' '.join(map(repr, v)))
35
36
def predict_map(e, params):
    """Return the index of the center nearest to a point.

    ``e[1]`` is a string of space-separated numbers. It is compared against
    every entry of ``params.centers`` using ``params.dist(center, point)``.

    Returns a one-element list ``[(e[0], nearest_index)]``. Equal distances
    resolve to the lowest index because ``(distance, index)`` tuples are
    compared.
    """
    # Parse the point once; the original converted to float twice and
    # repeated the parse for every center. A real list also keeps
    # index-based distance functions working on Python 3.
    point = [float(field) for field in e[1].split(' ')]
    nearest = min([(params.dist(c, point), i)
                   for (i, c) in enumerate(params.centers)])[1]
    return [(e[0], nearest)]
39
40
def d2(x, y):
    """Return the squared Euclidean distance between ``x`` and ``y``.

    Only the first ``len(x)`` positions of ``y`` are read.
    """
    total = 0
    for idx in range(len(x)):
        total += (x[idx] - y[idx]) ** 2
    return total
42
43
def _read_centers(results, k):
    """Parse job results into a list of ``k`` centers indexed by cluster key.

    Each value is a string of space-separated floats; its last field (the
    point count emitted by the reduce step) is dropped. Keys that do not
    appear in ``results`` leave ``None`` at their position.
    """
    centers = [None] * k
    for key, value in disco.result_iterator(results):
        fields = [float(field) for field in value.split(' ')]
        centers[int(key)] = fields[:-1]
    return centers


def estimate(input, centers, k, iterations=10, host="disco://localhost",
             map_reader=disco.chain_reader, nr_reduces=None):
    """Run k-means on Disco and return the estimated centers.

    Parameters:
        input: passed to ``disco.job`` as ``input_files``.
        centers: initial centers, or ``None`` to derive them from a random
            assignment job (``kmeans_init``). When given, ``k`` is replaced
            by ``len(centers)``.
        k: number of clusters; only used when ``centers`` is ``None``.
        iterations: number of ``kmeans_iterate_<i>`` jobs to run.
        host: Disco master address passed to ``disco.job``.
        map_reader: passed to ``disco.job`` as ``map_reader``.
        nr_reduces: number of reduce tasks; defaults to ``k``.

    Returns:
        A list of ``k`` centers, each a list of floats, parsed from the output
        of the last job that ran. An entry is ``None`` for any cluster key
        missing from that output.
    """
    if centers is not None:
        k = len(centers)
    if nr_reduces is None:
        nr_reduces = k

    if centers is None:
        results = disco.job(host, name='kmeans_init',
                            input_files=input,
                            map_reader=map_reader,
                            fun_map=init_map,
                            combiner=estimate_combiner,
                            reduce=estimate_reduce,
                            nr_reduces=nr_reduces,
                            params=disco.Params(k=k),
                            sort=False, clean=True)
        centers = _read_centers(results, k)

    for i in range(iterations):
        results = disco.job(host, name='kmeans_iterate_' + str(i),
                            input_files=input,
                            map_reader=map_reader,
                            fun_map=estimate_map,
                            combiner=estimate_combiner,
                            reduce=estimate_reduce,
                            nr_reduces=nr_reduces,
                            params=disco.Params(centers=centers, dist=d2),
                            sort=False, clean=True)
        # Read the output right after every job so the centers returned
        # reflect the final iteration. The original only parsed results at
        # the start of the next pass and so discarded the last job's output.
        centers = _read_centers(results, k)

    return centers
80
81
def predict(input, centers, host="disco://localhost",
            map_reader=disco.chain_reader, nr_reduces=None):
    """Run a Disco job that labels each input entry with its nearest center.

    Parameters:
        input: passed to ``disco.job`` as ``input_files``.
        centers: list of centers handed to ``predict_map`` through
            ``disco.Params`` together with the ``d2`` distance function.
        host: Disco master address passed to ``disco.job``.
        map_reader: passed to ``disco.job`` as ``map_reader``.
        nr_reduces: number of reduce tasks; defaults to ``len(centers)``.

    Returns:
        Whatever ``disco.job`` returns for the ``kmeans_output`` job.
    """
    if nr_reduces is None:
        nr_reduces = len(centers)

    # NOTE(review): no reduce function is supplied although nr_reduces is
    # set; this matches the original call -- confirm it is intended.
    results = disco.job(host, name='kmeans_output',
                        input_files=input,
                        map_reader=map_reader,
                        fun_map=predict_map,
                        nr_reduces=nr_reduces,
                        params=disco.Params(centers=centers, dist=d2),
                        sort=False, clean=True)

    return results