In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

%matplotlib inline 

In [90]:
# Load the cleaned, already-encoded dataset from disk and preview its first five rows.
encodedf = pd.read_csv(filepath_or_buffer='df_clean.csv')
encodedf.head(n=5)

Unnamed: 0,price,long,lat,year,sqft_above,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,...,zipcode_98199,HouseAgeGroup_1,HouseAgeGroup_2,HouseAgeGroup_3,HouseAgeGroup_4,view_0.0,view_1.0,view_2.0,view_3.0,view_4.0
0,221900.0,-122.257,47.5112,2014,1180,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
1,538000.0,-122.319,47.721,2014,2170,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
2,180000.0,-122.233,47.7379,2015,770,0,1,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,604000.0,-122.393,47.5208,2014,1050,0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
4,510000.0,-122.045,47.6168,2015,1680,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [91]:
# Separate the regression target (`price`) from the predictor columns.
y = encodedf['price']
X = encodedf.drop(columns=['price'])

In [92]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
# Show the shapes of the four pieces as a quick sanity check.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((17276, 140), (17276,), (4320, 140), (4320,))

In [93]:
# Learn each feature's min/max from the training rows only, so the test rows
# do not influence the scaling. `fit` returns the scaler itself.
scaler = MinMaxScaler().fit(X_train)
scaler

MinMaxScaler()

In [94]:
# Apply the scaling learned on the training set to both splits.
scaled_X_train, scaled_X_test = map(scaler.transform, (X_train, X_test))

In [95]:
# Base network: L-BFGS solver with a fixed seed; the architecture and the
# activation function are left to the grid search.
model = MLPRegressor(
    solver='lbfgs',
    alpha=0.0001,
    max_iter=10000,
    max_fun=15000,
    random_state=0,
)
# Candidate grid: one hidden layer of 1, 3, 5 or 7 units, crossed with three activations.
hyperparameters = {
    'hidden_layer_sizes': [(width,) for width in (1, 3, 5, 7)],
    'activation': ['logistic', 'tanh', 'relu'],
}

In [96]:
# Exhaustive 4-fold cross-validated search over `hyperparameters`.
# `refit=True` retrains the best configuration on the full training set, and
# `return_train_score=True` keeps the per-fold training scores in `cv_results_`.
gridSearchCV = GridSearchCV(
    estimator=model,
    param_grid=hyperparameters,
    cv=4,
    refit=True,
    return_train_score=True,
    verbose=1,
)

gridSearchCV.fit(scaled_X_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


GridSearchCV(cv=4,
             estimator=MLPRegressor(max_iter=10000, random_state=0,
                                    solver='lbfgs'),
             param_grid={'activation': ['logistic', 'tanh', 'relu'],
                         'hidden_layer_sizes': [(1,), (3,), (5,), (7,)]},
             return_train_score=True, verbose=1)

In [97]:
# Hyperparameter combination with the best mean cross-validated score.
gridSearchCV.best_params_

{'activation': 'relu', 'hidden_layer_sizes': (3,)}

In [98]:
# Tabulate the cross-validation results and show the five best-ranked candidates.
(
    pd.DataFrame(gridSearchCV.cv_results_)
    .sort_values(by='rank_test_score')
    .head(5)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
9,7.428738,2.656435,0.002545,0.000556,relu,"(3,)","{'activation': 'relu', 'hidden_layer_sizes': (...",0.827368,0.833357,0.782279,0.850564,0.823392,0.025217,1,0.8709018,0.881304,0.8841168,0.8784075,0.878683,0.004925
11,9.736032,0.869277,0.002485,0.000507,relu,"(7,)","{'activation': 'relu', 'hidden_layer_sizes': (...",0.812372,0.838904,0.761004,0.84939,0.815417,0.03419,2,0.887833,0.891329,0.8948919,0.8947827,0.892209,0.002905
10,8.175372,3.458519,0.005017,0.003315,relu,"(5,)","{'activation': 'relu', 'hidden_layer_sizes': (...",0.793565,0.791858,0.763978,0.807612,0.789253,0.015821,3,0.8186533,0.830391,0.8341754,0.8255963,0.827204,0.005798
6,2.080823,2.113424,0.002991,0.000705,tanh,"(5,)","{'activation': 'tanh', 'hidden_layer_sizes': (...",0.086045,0.008056,0.083432,0.238957,0.104123,0.083911,4,0.09239123,0.007573,0.08454302,0.2097404,0.098562,0.07224
2,12.538935,20.818177,0.003739,0.000827,logistic,"(5,)","{'activation': 'logistic', 'hidden_layer_sizes...",-0.000181,0.41255,-0.000203,-3.7e-05,0.103032,0.1787,5,-1.503242e-13,0.419649,-2.220446e-16,-4.03011e-12,0.104912,0.181713


In [99]:
# Predict on the training set. Because the search was built with refit=True,
# GridSearchCV.predict delegates to the refitted best estimator.
y_predict_train = gridSearchCV.predict(scaled_X_train)


In [100]:
# Mean squared error of the model on the training set.
MSE_train = mean_squared_error(y_true=y_train, y_pred=y_predict_train)

In [101]:
# Report the training MSE rounded to two decimals.
print("L'erreur quadratique moyenne vaut :", np.round(MSE_train, 2))

L'erreur quadratique moyenne vaut : 16757445358.11


In [102]:
# Training RMSE: the error expressed in the same units as the target.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_train = np.sqrt(mean_squared_error(y_train, y_predict_train))

In [103]:
# Report the training RMSE rounded to two decimals.
print("La racine carrée de l'erreur quadratique moyenne vaut :", np.round(RMSE_train, 2))

La racine carrée de l'erreur quadratique moyenne vaut : 129450.55


In [104]:
# Coefficient of determination on the training set:
# R² = 1 - (residual sum of squares) / (total sum of squares around the mean).
R_squared_train = 1.0 - (
    ((y_train - y_predict_train) ** 2).sum()
    / ((y_train - y_train.mean()) ** 2).sum()
)

In [105]:
# Report the training R² rounded to four decimals.
print("Le coefficient de détermination vaut :", np.round(R_squared_train, 4))

Le coefficient de détermination vaut : 0.8767


## Evaluation des performances du modèle sur le jeu de test

In [116]:
# Predict on the held-out test set. Because the search was built with refit=True,
# GridSearchCV.predict delegates to the refitted best estimator.
y_predict_test = gridSearchCV.predict(scaled_X_test)

In [117]:
# Mean squared error of the model on the held-out test set.
MSE_test = mean_squared_error(y_true=y_test, y_pred=y_predict_test)

In [118]:
# Report the test MSE rounded to two decimals.
print("L'erreur quadratique moyenne vaut :", np.round(MSE_test, 2))

L'erreur quadratique moyenne vaut : 20422019814.04


In [119]:
# Test RMSE: the error expressed in the same units as the target.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_test = np.sqrt(mean_squared_error(y_test, y_predict_test))

In [120]:
# Report the test RMSE rounded to two decimals.
print("La racine carrée de l'erreur quadratique moyenne vaut :", np.round(RMSE_test, 2))

La racine carrée de l'erreur quadratique moyenne vaut : 142905.63


In [121]:
# Coefficient of determination on the test set:
# R² = 1 - (residual sum of squares) / (total sum of squares around the mean).
R_squared_test = 1.0 - (
    ((y_test - y_predict_test) ** 2).sum()
    / ((y_test - y_test.mean()) ** 2).sum()
)

In [122]:
# Report the test R² rounded to four decimals.
print("Le coefficient de détermination vaut :", np.round(R_squared_test, 4))

Le coefficient de détermination vaut : 0.8441


# New data prediction 

# Choix du modèle pour déploiement

In [124]:
import pickle

In [126]:
# Persist the fitted best estimator so it can be reloaded for deployment.
# The original cell pickled the test-set predictions, not the model.
# The context manager guarantees the file is flushed and closed.
# NOTE: inputs must be transformed with the same fitted `scaler` before calling
# `predict` on the reloaded model, since it was trained on min-max scaled features.
with open('MLPregressorModel.pkl', 'wb') as model_file:
    pickle.dump(gridSearchCV.best_estimator_, model_file)

## D'autres modèles 

In [106]:

from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale
from sklearn.linear_model import LassoCV

In [107]:
# Linear baseline: Lasso with its regularisation strength chosen by 5-fold CV,
# then its score on the held-out test set (R² for a regressor).
reg = LassoCV(cv=5, random_state=0)
reg.fit(scaled_X_train, y_train)
print(reg.score(scaled_X_test, y_test))

0.7853162367026174


# -----------------------------------------------------------------------

In [108]:
from sklearn.linear_model import ElasticNetCV

In [109]:
# Elastic-net baseline with 5-fold CV; `fit` returns the estimator itself,
# so the trailing expression displays the same fitted object.
regr = ElasticNetCV(cv=5, random_state=0).fit(scaled_X_train, y_train)
regr

ElasticNetCV(cv=5, random_state=0)

In [110]:
# Score the fitted ElasticNetCV model on the held-out test set (for a regressor, `score` returns R²).
regr.score(scaled_X_test, y_test)

0.004779817871759295

# -----------------------------------------------------------------------

In [111]:
from sklearn.linear_model import PassiveAggressiveRegressor

In [112]:
# Passive-aggressive baseline; `fit` returns the estimator itself,
# so the trailing expression displays the same fitted object.
regr = PassiveAggressiveRegressor(
    max_iter=100,
    tol=1e-3,
    random_state=0,
).fit(scaled_X_train, y_train)
regr



PassiveAggressiveRegressor(max_iter=100, random_state=0)

In [113]:
# Score the fitted PassiveAggressiveRegressor on the held-out test set (for a regressor, `score` returns R²).
regr.score(scaled_X_test, y_test)

0.2807279736952193

# -----------------------------------------------------------------------

In [114]:
import xgboost as xgb

In [115]:
# Tune gradient-boosted trees over tree depth and number of estimators.
# No `cv` is passed, so GridSearchCV falls back to its default 5-fold split.
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid={
        'max_depth': [2, 4, 6],
        'n_estimators': [50, 100, 200],
    },
    n_jobs=2,
    verbose=1,
)
clf.fit(scaled_X_train, y_train)
# Best mean cross-validated score, then the parameters that achieved it.
print(clf.best_score_, clf.best_params_, sep='\n')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
0.860926083155135
{'max_depth': 6, 'n_estimators': 100}


## Evaluation des performances du modèle sur le jeu de test