import disco
def estimate_map(e, params):
x=map(float,e[1].split(' '))
y=x[params.y_id]
del x[params.y_id]
return [(e[0],(x,y))]
def estimate_combiner(k, v, vals, done, params):
if vals=={}:
vals['x']=[0.0]*len(v[0])
vals['x2']=[0.0]*len(v[0])
vals['xy']=[0.0]*len(v[0])
vals['y']=0.0
vals['c']=0
if done:
return [(k, ' '. join(map(repr,vals['x'] + vals['x2'] + vals['xy'] + [ vals['y'], vals['c'] ])))]
for i in range(len(v)):
vals['x'][i]+=v[0][i]
vals['x2'][i]+=v[0][i]*v[0][i]
vals['xy'][i]+=v[0][i]*v[1]
vals['y']+=v[1]
vals['c']+=1
def predict_map(e, params):
x=map(float,e[1].split(' '))
return [(e[0],' '.join(map(repr,[params[i][0]+params[i][1]*x[i] for i in range(len(params))])))]
def estimate(input, y_id, host="disco://localhost", map_reader=disco.chain_reader):
results = disco.job(host, name = 'naive_linear_regression_estimate',
input_files = input,
map_reader = map_reader,
fun_map = estimate_map,
combiner = estimate_combiner,
params=disco.Params(y_id=y_id),
sort = False, clean = False)
c=0
y=0.0
l=None
x=None
x2=None
xy=None
for key,value in disco.result_iterator(results):
v=map(float,value.split(' '))
if l==None:
l=(len(v)-2)/3
x=[0.0]*l
x2=[0.0]*l
xy=[0.0]*l
c+=v[-1]
y+=v[-2]
for i in range(l):
x[i]+=v[i]
x2[i]+=v[l+i]
xy[i]+=v[2*l+i]
b = [ (c*xy[i] - x[i]*y)/(c*x2[i]+x[i]*x[i]) for i in range(l) ]
a = [ (y-b[i]*x[i])/c for i in range(l) ]
return zip(*(a,b))
def predict(input, model, host="disco://localhost", map_reader=disco.chain_reader):
results = disco.job(host, name = 'naive_linear_regression_predict',
input_files = input,
map_reader = map_reader,
fun_map = predict_map,
params=model,
sort = False, clean = False)
return results