public
Description: a Map/Reduce framework for distributed computing
Homepage: http://discoproject.org
Clone URL: git://github.com/tuulos/disco.git
Ville Tuulos (author)
Thu Jun 18 22:00:52 -0700 2009
commit  dfa6e76ffd729063ba3c221d2db655686fd59328
tree    c7b84b52e704ef95d5223353592a241282b590af
parent  c254f05ebf8a27358562e3822c38780f30e35815
disco / examples / datamining / naive_linreg.py
100644 83 lines (62 sloc) 1.845 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import disco
 
def estimate_map(e, params):
  """Split one input record into its feature values and its target value.

  e      -- a (key, value) pair; e[1] is a string of numbers separated by
            single spaces.
  params -- object with a ``y_id`` attribute: the index, within the parsed
            numbers, of the target (y) column.

  Returns a one-element list ``[(key, (x, y))]`` where ``x`` is the list of
  remaining floats (target column removed) and ``y`` is the target float.

  Raises ValueError if a field is not a valid float and IndexError if
  ``y_id`` is out of range for the record.
  """
  # Build a real list: the values are indexed and mutated below, which a
  # lazy map() object (Python 3) does not support.
  x = [float(field) for field in e[1].split(' ')]
  y = x[params.y_id]
  del x[params.y_id]

  return [(e[0], (x, y))]
 
 
def estimate_combiner(k, v, vals, done, params):
  """Accumulate per-feature sufficient statistics for linear regression.

  k      -- key of the current record (used as the key of the emitted pair).
  v      -- an ``(x, y)`` pair as produced by estimate_map: ``x`` is the list
            of feature floats, ``y`` the target float.
  vals   -- dict used as the running accumulator; it is initialised on the
            first call (when empty) and mutated in place afterwards.
  done   -- when true, nothing is accumulated; the current sums are
            serialised and returned instead.
  params -- unused.

  While ``done`` is false the function returns None. When ``done`` is true it
  returns ``[(k, s)]`` where ``s`` is a space-separated string holding, in
  order: sum(x_i) for every feature, sum(x_i^2) for every feature,
  sum(x_i*y) for every feature, sum(y), and the record count.
  """
  if not vals:
    # NOTE(review): this reads v[0] even when done is true; it assumes the
    # accumulator has already been filled by then -- confirm with the caller.
    width = len(v[0])
    vals['x'] = [0.0] * width
    vals['x2'] = [0.0] * width
    vals['xy'] = [0.0] * width
    vals['y'] = 0.0
    vals['c'] = 0

  if done:
    flat = vals['x'] + vals['x2'] + vals['xy'] + [vals['y'], vals['c']]
    return [(k, ' '.join(map(repr, flat)))]

  features = v[0]
  target = v[1]
  # Iterate over the features themselves. v is the (x, y) pair, so len(v)
  # is always 2 and must not be used as the feature count.
  for i, xi in enumerate(features):
    vals['x'][i] += xi
    vals['x2'][i] += xi * xi
    vals['xy'][i] += xi * target
  vals['y'] += target
  vals['c'] += 1
 
 
 
def predict_map(e, params):
  """Apply a per-feature linear model to one input record.

  e      -- a (key, value) pair; e[1] is a string of numbers separated by
            single spaces.
  params -- sequence of ``(a, b)`` pairs, one per feature column, in the
            shape returned by estimate().

  Returns ``[(key, s)]`` where ``s`` is the space-separated ``repr`` of
  ``a_i + b_i * x_i`` for every model entry. Values in the record beyond
  the length of the model are ignored.

  Raises IndexError if the record has fewer values than the model has
  entries, and ValueError if a field is not a valid float.
  """
  # A real list is required because the values are indexed below; map()
  # would return a non-indexable iterator on Python 3.
  x = [float(field) for field in e[1].split(' ')]
  predictions = [a + b * x[i] for i, (a, b) in enumerate(params)]
  return [(e[0], ' '.join(map(repr, predictions)))]
 
 
def estimate(input, y_id, host="disco://localhost", map_reader=disco.chain_reader):
  """Fit an independent least-squares line ``y = a + b*x`` for every feature.

  Runs a Disco job with estimate_map / estimate_combiner and merges the
  partial sums found in the job results into global sums, from which the
  intercept and slope of each feature column are computed.

  input      -- passed to disco.job as ``input_files``.
  y_id       -- index of the target (y) column in each input record.
  host       -- Disco master address handed to disco.job.
  map_reader -- reader function handed to disco.job.

  Returns a list of ``(a, b)`` tuples (intercept, slope), one per feature
  column in column order. Returns an empty list if the job yields no
  results.

  Raises ZeroDivisionError if the merged record count is zero or if the
  slope denominator ``n*sum(x^2) - sum(x)^2`` is zero for some feature
  (for example a constant column).
  """
  results = disco.job(host, name = 'naive_linear_regression_estimate',
       input_files = input,
       map_reader = map_reader,
       fun_map = estimate_map,
       combiner = estimate_combiner,
       params=disco.Params(y_id=y_id),
       sort = False, clean = False)

  c = 0
  y = 0.0
  l = None
  x = None
  x2 = None
  xy = None

  for key, value in disco.result_iterator(results):
    v = [float(field) for field in value.split(' ')]

    if l is None:
      # Each value holds three per-feature vectors followed by sum(y) and
      # the count, so the feature count is (len - 2) / 3. Floor division
      # keeps it an int on both Python 2 and Python 3.
      l = (len(v) - 2) // 3
      x = [0.0] * l
      x2 = [0.0] * l
      xy = [0.0] * l

    c += v[-1]
    y += v[-2]
    for i in range(l):
      x[i] += v[i]
      x2[i] += v[l + i]
      xy[i] += v[2 * l + i]

  if l is None:
    # No partial results at all: there is nothing to fit.
    return []

  # Simple linear regression:
  #   b = (n*sum(xy) - sum(x)*sum(y)) / (n*sum(x^2) - sum(x)^2)
  #   a = (sum(y) - b*sum(x)) / n
  b = [ (c*xy[i] - x[i]*y) / (c*x2[i] - x[i]*x[i]) for i in range(l) ]
  a = [ (y - b[i]*x[i]) / c for i in range(l) ]

  # Return a concrete list: predict_map indexes the model and takes its len().
  return list(zip(a, b))
 
 
def predict(input, model, host="disco://localhost", map_reader=disco.chain_reader):
  """Start the Disco prediction job and return what disco.job returns.

  input      -- passed to disco.job as ``input_files``.
  model      -- passed to disco.job as ``params``; predict_map reads it as a
                sequence of ``(a, b)`` pairs.
  host       -- Disco master address handed to disco.job.
  map_reader -- reader function handed to disco.job.
  """
  job_options = dict(
    input_files=input,
    map_reader=map_reader,
    fun_map=predict_map,
    params=model,
    sort=False,
    clean=False,
  )
  return disco.job(host, name='naive_linear_regression_predict', **job_options)