In [24]:
%reset -f
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.svm import SVR
from math import sqrt


In [26]:

# Read the listings table, peel the target column off into y, and keep the
# remaining columns as predictors.
rentdata = pd.read_csv('version3.csv')
y = rentdata.pop('price').values
# 'Unnamed: 0' is presumably a row index saved by an earlier to_csv -- TODO confirm.
rentdata = rentdata.drop('Unnamed: 0', axis=1)
rentdata.head()

Unnamed: 0,rooms,bathrooms,exterior,hasLift,size,floor,longitude,latitude,hasParkingSpace,isParkingSpaceIncludedInPrice,chalet,duplex,flat,penthouse,studio,good,newdevelopment,renew
0,4.0,2.0,1.0,1.0,125.0,6.0,2.216528,41.408302,1.0,1.0,0,0,1,0,0,1,0,0
1,3.0,2.0,1.0,1.0,107.0,6.0,2.211622,41.406239,0.0,0.0,0,0,1,0,0,1,0,0
2,4.0,2.0,1.0,1.0,130.0,1.0,2.177979,41.384283,0.0,0.0,0,0,1,0,0,1,0,0
3,2.0,1.0,0.0,1.0,70.0,2.0,2.165862,41.397875,0.0,0.0,0,0,1,0,0,1,0,0
4,3.0,1.0,1.0,1.0,75.0,2.0,2.13782,41.35947,0.0,0.0,0,0,1,0,0,1,0,0


We define the target we want to predict (price) and split the data into training and testing sets.

In [27]:
# Predictor matrix as a plain ndarray; its rows come from the same frame as y.
X = rentdata.values

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=44
)



In [28]:
# Learn the standardisation statistics from the training rows only, then apply
# those same statistics to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
print X_train.shape
print X_test.shape


(4601L, 18L)
(512L, 18L)


In [30]:
#Builds the polynomial feature block (x, x^2, x^3, ..., x^n), column-wise
def extend(x,n):
    """Append element-wise powers of ``x`` up to degree ``n`` as extra columns.

    Parameters
    ----------
    x : numpy array
        Values to expand. A 2-D array gets its powers appended along the last
        axis; a 1-D array is turned into one column per power.
    n : int
        Highest power to include.

    Returns
    -------
    ``x`` itself (the same object, not a copy) when ``n`` is below 2, otherwise
    a new array laid out as ``[x, x**2, ..., x**n]``.
    """
    if n < 2:
        # Nothing to append: hand back the input untouched.
        return x
    # Gather every power first and concatenate them in a single np.c_ call.
    powers = [x] + [x**degree for degree in range(2, n + 1)]
    return np.c_[tuple(powers)]

In [31]:
#Root mean squared error
def rmse(y,yhat):
    """Return the root mean squared error between targets and predictions.

    Both inputs are converted to flat float arrays before subtracting, so
    lists, 1-D arrays and column vectors of shape (N, 1) can be mixed freely.
    Without the flattening, a (N,) target against a (N, 1) prediction would
    broadcast to an N x N difference matrix. The float cast also keeps the
    sum of squares from overflowing a narrow integer dtype.

    Parameters
    ----------
    y : array-like
        Observed values.
    yhat : array-like
        Predicted values; must hold the same number of elements as ``y``.

    Returns
    -------
    float
        sqrt(mean((y - yhat)**2)).

    Raises
    ------
    ValueError
        If ``y`` and ``yhat`` do not contain the same number of elements.
    ZeroDivisionError
        If the inputs are empty (unchanged from the previous behaviour).
    """
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(yhat, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError('rmse: y has %d elements but yhat has %d'
                         % (actual.size, predicted.size))
    dif = actual - predicted
    N = dif.size
    return sqrt((1./N)*np.dot(dif,dif))



In [32]:
# Display names of the regression methods, in the column order used by Err.
reg_meth = [
    'Linear Regression',
    'Ridge regressor',
    'Random Forest Regressor',
    'Gradient Boosting Regressor',
]

# Highest polynomial degree tried by the complexity sweeps.
CMAX = 20

# RMSE table filled in by the cells below:
#   axis 0 -> complexity index (polynomial degree - 1)
#   axis 1 -> regression method, same order as reg_meth
#   axis 2 -> 0 = in-sample (train) error, 1 = out-of-sample (test) error
Err = np.zeros((CMAX, len(reg_meth), 2))


In [33]:
#Linear Regressor
# Ordinary least squares refit for every polynomial degree 1..CMAX; the train
# and test RMSE of each fit go into column 0 of Err.
lreg = LinearRegression(normalize=False, fit_intercept=True)
for i in range(CMAX):
    degree = i + 1
    # Build the expanded features once per degree and reuse them below.
    train_poly = extend(X_train, degree)
    test_poly = extend(X_test, degree)
    lreg.fit(train_poly, y_train)
    yhat_in = lreg.predict(train_poly)
    yhat_out = lreg.predict(test_poly)
    Err[i, 0, 0] = rmse(y_train, yhat_in)
    Err[i, 0, 1] = rmse(y_test, yhat_out)

In [35]:
#Ridge Regressor
# L2-regularised least squares refit for every polynomial degree 1..CMAX; the
# train and test RMSE of each fit go into column 1 of Err.
rid = Ridge(alpha=0.7, fit_intercept=True, normalize=False, tol=0.1)
for i in range(CMAX):
    degree = i + 1
    # Build the expanded features once per degree and reuse them below.
    train_poly = extend(X_train, degree)
    test_poly = extend(X_test, degree)
    rid.fit(train_poly, y_train)
    yhat_in = rid.predict(train_poly)
    yhat_out = rid.predict(test_poly)
    Err[i, 1, 0] = rmse(y_train, yhat_in)
    Err[i, 1, 1] = rmse(y_test, yhat_out)

In [36]:
#Random Forest Regressor
# random_state pins the bootstrap sampling and feature selection so the
# reported errors are reproducible from run to run (same seed value as the
# train/test split).
rfr = RandomForestRegressor(n_estimators=500, bootstrap=True, random_state=44)
rfr.fit(X_train, y_train)
yhat_in = rfr.predict(X_train)
yhat_out = rfr.predict(X_test)
# The forest is fit once on the un-expanded features, so it has no polynomial
# complexity axis: compute each error once and copy it into every row of
# column 2 to keep Err rectangular.
Err[:, 2, 0] = rmse(y_train, yhat_in)
Err[:, 2, 1] = rmse(y_test, yhat_out)

In [37]:
#Gradient Boosting Regressor
# random_state pins the per-tree seeds and the feature permutation at each
# split so the reported errors are reproducible from run to run (same seed
# value as the train/test split).
gbr = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=44)
gbr.fit(X_train, y_train)
yhat_in = gbr.predict(X_train)
yhat_out = gbr.predict(X_test)
# The booster is fit once on the un-expanded features, so it has no polynomial
# complexity axis: compute each error once and copy it into every row of
# column 3 to keep Err rectangular.
Err[:, 3, 0] = rmse(y_train, yhat_in)
Err[:, 3, 1] = rmse(y_test, yhat_out)
    
    

In [38]:
Ein_min=Err[:,:,0].min()
Eout_min=Err[:,:,1].min()
idx_inmin=np.asarray([np.where(Err[:,:,0]==Ein_min)[0][0],np.where(Err[:,:,0]==Ein_min)[1][0]])
idx_outmin=np.asarray([np.where(Err[:,:,1]==Eout_min)[0][0],np.where(Err[:,:,1]==Eout_min)[1][0]])
print Ein_min,Eout_min
print idx_inmin
print idx_outmin

168.975267613 439.784296716
[0 2]
[0 2]


In [41]:
print '     \t Linear Regressor \t\t         Ridge Regressor '
print 'C    \t  Ein            Eout     \t       Ein           Eout '
for i in range(CMAX):
    print  i+1,'  ',Err[i,0,0],' ',Err[i,0,1],'\t ',Err[i,1,0],' ',Err[i,1,1]
print '\n\n     \t  Random Forest Regressor \t Gradient Boosting Regressor'
print '     \t  Ein         Eout            \t  Ein           Eout   '
print '   ',Err[i,2,0],' ',Err[i,2,1],' ',Err[i,3,0],' ',Err[i,3,1]

print '\n\nThe minimum Ein is',Ein_min,'for',reg_meth[idx_inmin[1]],',complexity',idx_inmin[0]+1
print '\n\nThe minimum Eout is',Eout_min,'for',reg_meth[idx_outmin[1]],',complexity',idx_outmin[0]+1  

     	 Linear Regressor 		         Ridge Regressor 
C    	  Ein            Eout     	       Ein           Eout 
1    628.298128808   618.733277508 	  628.298134883   618.730411672
2    548.5669234   554.686122787 	  548.567712622   554.729736094
3    544.013404279   548.650328058 	  544.015076935   548.701035419
4    536.780239295   544.69165192 	  536.781530353   544.750452429
5    528.677779334   536.241980801 	  528.692091924   536.277080132
6    527.017227449   536.803116002 	  527.068618198   536.766380773
7    524.878698252   536.590965465 	  524.959804778   536.653274905
8    525.334325323   536.348963999 	  524.163208768   537.702922709
9    542.090265887   551.654342095 	  556.286114249   564.325665582
10    905.460752849   838.256690042 	  3041.37586193   2489.22407104
11    1943.79601778   1617.50545004 	  849.982050866   848.616815403
12    830.259071488   768.436822217 	  840.823510335   777.242999803
13    781.697522187   776.026894746 	  899.705105853   777.732936126
14 

In [42]:
X_scaled=scaler.fit_transform(X)
if idx_outmin[1]==0: freg = lreg
if idx_outmin[1]==1: freg = rid 
if idx_outmin[1]==2: freg = rfr 
if idx_outmin[1]==3: freg = gbr
i=idx_outmin[0]
freg.fit(extend(X_scaled,i+1),y)
yhat_in=freg.predict(extend(X_scaled,i+1))
Ein=rmse(y,yhat_in)
#yhat_out=freg.predict()

print 'Ein=',Ein#,'\t Eout=',Eout

Ein= 167.009556663
