In [125]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
import warnings
# Silence every warning for the whole session.
# NOTE(review): this also hides library deprecation warnings (for example about
# estimator parameters that are about to be removed) -- consider narrowing it.
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LinearRegression

from sklearn import linear_model
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

# 1ère itération / Baseline

In [126]:
# Load the pickled frame used for the baseline iteration and display it.
# This frame carries the target column `median_house_value`, which later cells
# reuse as `y` for the feature-only frames.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_fillna = pd.read_pickle("data/EDA_iter_1.pkl")
df_fillna

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...
16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


## Hold out

In [127]:
# Baseline features: every column except the target and the raw categorical
# `ocean_proximity` column.
# NOTE(review): the `id` column stays in Xb and is therefore used as a
# feature -- confirm that this is intended.
yb = df_fillna["median_house_value"]
Xb = df_fillna.drop(columns=["median_house_value", "ocean_proximity"])

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Xb, yb, test_size=0.3, random_state=1
)

# Instantiate the linear model and fit it on the training part only.
model = LinearRegression()
model.fit(X_train, y_train)

# Last expression of the cell: score (R^2) of the fitted model on the held-out rows.
model.score(X_test, y_test)

0.631187139333667

## Cross validation

In [128]:
# 5-fold cross-validation of the baseline linear model on the baseline features.
cv_results = cross_validate(
    model,
    Xb,
    yb,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Fold-averaged scores, kept as the reference every later iteration is compared to.
# The RMSE scorer is negated, so base_rmse is a negative number.
base_r2 = cv_results['test_r2'].mean()
base_rmse = cv_results['test_neg_root_mean_squared_error'].mean()
print(base_r2)
print(base_rmse)

0.6348249131619182
-69612.35723615656


# 2ème itération / imputed value

In [129]:
# Load the pickled frame for iteration 2 (imputed values) and display it.
# The displayed frame has no target column; the next cell takes `y` from df_fillna.
df_imputed = pd.read_pickle("data/EDA_iter_2_imputed.pkl")
df_imputed

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,2072.0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817
1,10600.0,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133
2,2494.0,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536
3,4284.0,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284
4,16541.0,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815
...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625
16508,18898.0,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133
16509,11798.0,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958
16510,6637.0,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750


In [130]:
# Features come from the imputed frame; the target is still read from the
# baseline frame, so both are assumed to share the same row order (TODO confirm).
X = df_imputed
y = df_fillna["median_house_value"]
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.6359321320105711
-69506.39456119007
difference entre base r2 et nouveau r2 : 0.0011072188486528356
difference entre base rmse et nouveau rmse :  105.96267496648943


# 3ème itération / Imputation iterative

In [131]:
# Load the pickled frame for iteration 3 (iterative imputation) and display it.
# The displayed frame has no target column; the next cell takes `y` from df_fillna.
df_iter_imputed = pd.read_pickle("data/EDA_iter_3_iter_imputed.pkl")
df_iter_imputed

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,2072.0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817
1,10600.0,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133
2,2494.0,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536
3,4284.0,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284
4,16541.0,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815
...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625
16508,18898.0,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133
16509,11798.0,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958
16510,6637.0,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750


In [132]:
# Features come from the iteratively imputed frame; the target is read from the
# baseline frame, so both are assumed to share the same row order (TODO confirm).
X = df_iter_imputed
y = df_fillna["median_house_value"]
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.6373855122947356
-69367.37001772346
difference entre base r2 et nouveau r2 : 0.002560599132817365
difference entre base rmse et nouveau rmse :  244.9872184331034


# 4ème itération / sans outliers

In [133]:
# Load the pickled frame for iteration 4 (outliers removed) and display it.
# Unlike the other iteration frames, this one includes the target column
# `median_house_value`, so the next cell takes both X and y from it.
df_without_outliers= pd.read_pickle("data/EDA_iter_4_without_outliers.pkl")
df_without_outliers 

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,2072.0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0
1,10600.0,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0
2,2494.0,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0
3,4284.0,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0
4,16541.0,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0
...,...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0
16508,18898.0,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0
16509,11798.0,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0
16510,6637.0,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0


In [134]:
# Features and target both come from the outlier-filtered frame.
# NOTE(review): if rows were removed, these scores are computed on a different
# sample than the baseline and are not strictly comparable -- confirm.
y = df_without_outliers["median_house_value"]
X = df_without_outliers.drop(columns='median_house_value')
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.6333230175263589
-66867.16507778953
difference entre base r2 et nouveau r2 : -0.0015018956355593582
difference entre base rmse et nouveau rmse :  2745.1921583670337


# 5ème itération / Normalisé

In [135]:
# Load the pickled frame for iteration 5 and display it.
# The displayed values are centred around zero (both signs appear in every column).
# The frame has no target column; the next cell takes `y` from df_fillna.
df_norm = pd.read_pickle("data/EDA_iter_5_norm.pkl")
df_norm

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-1.384010,-0.137635,0.534564,-1.795939,-0.357368,-0.156340,-0.032827,-0.218173,-1.258403
1,0.051247,0.879836,-0.909979,-1.637178,-0.276515,-0.448069,-0.494784,-0.417841,1.610623
2,-1.312987,-0.312201,0.455091,-0.287715,-0.799127,-0.765677,-0.435204,-0.744572,-1.220425
3,-1.011731,0.620480,-0.713633,0.188566,-0.913406,-0.730387,-0.723603,-0.710862,-1.233736
4,1.051114,-0.830911,1.011403,-0.605236,-0.326201,-0.391605,-0.263373,-0.334862,0.114837
...,...,...,...,...,...,...,...,...,...
16507,-1.547765,-1.165080,1.852884,-0.684616,-0.532626,-0.615107,-0.595810,-0.651220,-0.423409
16508,1.447795,-1.339646,1.161000,1.617410,-0.126100,-0.083408,-0.262510,-0.111856,-0.660681
16509,0.252870,-0.825923,1.539666,-0.763996,-0.051119,-0.184572,-0.197750,-0.187056,0.333675
16510,-0.615723,0.710257,-0.685584,0.823608,0.059547,0.707083,0.959301,0.704970,-0.680911


In [136]:
# Features come from the rescaled frame of iteration 5; the target is read from
# the baseline frame, so both are assumed to share the same row order (TODO confirm).
X = df_norm
y = df_fillna["median_house_value"]
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.6373855122947327
-69367.37001772373
difference entre base r2 et nouveau r2 : 0.0025605991328144784
difference entre base rmse et nouveau rmse :  244.98721843282692


# 6ème itération / Standardisé

In [137]:
# Load the pickled frame for iteration 6 and display it.
# NOTE(review): the rows shown in the preview are identical to the df_norm
# preview and contain negative values, which a [0, 1] min-max scaling would not
# produce -- verify that this pickle really holds min-max scaled data.
df_minmax = pd.read_pickle("data/EDA_iter_6_minmax.pkl")
df_minmax

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-1.384010,-0.137635,0.534564,-1.795939,-0.357368,-0.156340,-0.032827,-0.218173,-1.258403
1,0.051247,0.879836,-0.909979,-1.637178,-0.276515,-0.448069,-0.494784,-0.417841,1.610623
2,-1.312987,-0.312201,0.455091,-0.287715,-0.799127,-0.765677,-0.435204,-0.744572,-1.220425
3,-1.011731,0.620480,-0.713633,0.188566,-0.913406,-0.730387,-0.723603,-0.710862,-1.233736
4,1.051114,-0.830911,1.011403,-0.605236,-0.326201,-0.391605,-0.263373,-0.334862,0.114837
...,...,...,...,...,...,...,...,...,...
16507,-1.547765,-1.165080,1.852884,-0.684616,-0.532626,-0.615107,-0.595810,-0.651220,-0.423409
16508,1.447795,-1.339646,1.161000,1.617410,-0.126100,-0.083408,-0.262510,-0.111856,-0.660681
16509,0.252870,-0.825923,1.539666,-0.763996,-0.051119,-0.184572,-0.197750,-0.187056,0.333675
16510,-0.615723,0.710257,-0.685584,0.823608,0.059547,0.707083,0.959301,0.704970,-0.680911


In [138]:
# Features come from the rescaled frame of iteration 6; the target is read from
# the baseline frame, so both are assumed to share the same row order (TODO confirm).
X = df_minmax
y = df_fillna["median_house_value"]
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.6373855122947327
-69367.37001772373
difference entre base r2 et nouveau r2 : 0.0025605991328144784
difference entre base rmse et nouveau rmse :  244.98721843282692


# 7ème itération / dummy encoding

In [139]:
# Load the pickled frame for iteration 7 and display it.
# In this frame `ocean_proximity` is a single integer-coded column (the preview
# shows the codes 0, 1 and 3); there is no target column.
df_dummy_encoding = pd.read_pickle("data/EDA_iter_7_dummy_encoding.pkl")
df_dummy_encoding

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,2072.0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,1
1,10600.0,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,0
2,2494.0,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,1
3,4284.0,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,0
4,16541.0,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,1
...,...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,1
16508,18898.0,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,3
16509,11798.0,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,1
16510,6637.0,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,0


In [140]:
# Features include the integer-coded `ocean_proximity` column; the target is
# read from the baseline frame, so both frames are assumed to share the same
# row order (TODO confirm).
X = df_dummy_encoding
y = df_fillna["median_house_value"]
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.637451662777517
-69361.31419801382
difference entre base r2 et nouveau r2 : 0.0026267496155987224
difference entre base rmse et nouveau rmse :  251.04303814274317


# 8ème itération / OneHot encoding

In [141]:
# Load the pickled frame for iteration 8 and display it.
# `ocean_proximity` is expanded into five 0/1 indicator columns
# (`ocean_proximity_<category>`); there is no target column.
df_encoding = pd.read_pickle("data/EDA_iter_8_onehot_encoding.pkl")
df_encoding

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,2072.0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,0.0,1.0,0.0,0.0,0.0
1,10600.0,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,1.0,0.0,0.0,0.0,0.0
2,2494.0,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,0.0,1.0,0.0,0.0,0.0
3,4284.0,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,1.0,0.0,0.0,0.0,0.0
4,16541.0,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,0.0,1.0,0.0,0.0,0.0
16508,18898.0,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,0.0,0.0,0.0,1.0,0.0
16509,11798.0,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,0.0,1.0,0.0,0.0,0.0
16510,6637.0,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,1.0,0.0,0.0,0.0,0.0


In [142]:
# Features include all five one-hot `ocean_proximity_*` columns; the target is
# read from the baseline frame, so both frames are assumed to share the same
# row order (TODO confirm).
X = df_encoding
y = df_fillna["median_house_value"]
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.6476319279706088
-68378.66601750783
difference entre base r2 et nouveau r2 : 0.012807014808690509
difference entre base rmse et nouveau rmse :  1233.6912186487316


In [157]:
# Same one-hot frame, but three of the five indicator columns are dropped so
# that only the INLAND and ISLAND indicators remain alongside the numeric features.
dropped_indicators = [
    "ocean_proximity_NEAR BAY",
    "ocean_proximity_NEAR OCEAN",
    "ocean_proximity_<1H OCEAN",
]
X = df_encoding.drop(columns=dropped_indicators)
y = df_fillna["median_house_value"]
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.6478021284776391
-68361.99098762078
difference entre base r2 et nouveau r2 : 0.012977215315720825
difference entre base rmse et nouveau rmse :  1250.3662485357781


# 9ème itération / Scale part distribution

In [144]:
# Load the pickled frame for iteration 9 and display it.
# It has the same columns as the one-hot frame of iteration 8, with the numeric
# columns rescaled; there is no target column.
df_scale = pd.read_pickle("data/EDA_iter_9_scale.pkl")
df_scale

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.100392,0.449203,0.449522,-1.210526,7.525101,6.161207,7.242798,6.035481,0.908944,0.0,1.0,0.0,0.0,0.0
1,0.513591,0.652390,0.121148,-1.105263,7.617268,5.857933,6.760415,5.831882,2.068545,1.0,0.0,0.0,0.0,0.0
2,0.120839,0.414343,0.431456,-0.210526,6.775366,5.370638,6.837333,5.370638,0.937504,0.0,1.0,0.0,0.0,0.0
3,0.207568,0.600598,0.165781,0.105263,6.434547,5.438079,6.393591,5.429346,0.927587,1.0,0.0,0.0,0.0,0.0
4,0.801444,0.310757,0.557917,-0.421053,7.561642,5.924256,7.030857,5.921578,1.625606,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.053249,0.244024,0.749203,-0.473684,7.290293,5.631212,6.614726,5.525453,1.401799,0.0,1.0,0.0,0.0,0.0
16508,0.915645,0.209163,0.591923,1.052632,7.768956,6.224558,7.031741,6.129050,1.284621,0.0,0.0,0.0,1.0,0.0
16509,0.571636,0.311753,0.678002,-0.526316,7.836765,6.135565,7.095893,6.063785,1.703984,0.0,1.0,0.0,0.0,0.0
16510,0.321576,0.618526,0.172157,0.526316,7.929126,6.734592,7.842671,6.651572,1.273965,1.0,0.0,0.0,0.0,0.0


In [148]:
# Features come from the rescaled + one-hot frame of iteration 9; the target is
# read from the baseline frame, so both frames are assumed to share the same
# row order (TODO confirm).
X = df_scale
y = df_fillna["median_house_value"]
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this iteration beats the baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.6408023425931619
-69031.5301724872
difference entre base r2 et nouveau r2 : 0.005977429431243686
difference entre base rmse et nouveau rmse :  580.8270636693633


In [None]:
#%%time
#import itertools
  
# NOTE(review): disabled exhaustive feature-subset search, kept for reference only.
# As written it would cross-validate `model` on every non-empty combination of
# the 14 columns of df_scale (2**14 - 1 = 16383 subsets, 5 folds each) and print
# each subset that improves the best mean R2 seen so far.
# It also overwrites df_scale.columns with the integers 1..14, which would
# affect any later cell that reads df_scale by column name.
#df_scale.columns = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]
#best_model = 0
#for i in range(1,15):
#    for x in itertools.combinations(df_scale.columns,i):
#            X = df_scale[list(x)].to_numpy()
#            cv_results = cross_validate(model, X, y, cv=5, scoring=('r2', 'neg_root_mean_squared_error','neg_mean_absolute_error'))
#            result = cv_results['test_r2'].mean()
#            if result > best_model :
#                best_model = result
#                print(x)
#                print(best_model)

# 10ème itération / Random Forest

Après avoir essayé le scaling, l'imputation de valeurs, la suppression des outliers, l'encodage, etc., le score ne varie que très peu : on peut en conclure que nous n'utilisons pas le bon modèle.
Nous allons donc essayer ici un random forest.

In [199]:

# Random forest evaluated on the baseline features (Xb, yb) with the same
# 5-fold protocol as the linear model, so the scores are directly comparable.
#
# max_features=1.0 (consider every feature at each split) replaces the former
# 'auto' value, which was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a regressor both settings use all features, so results are unchanged.
rfr = RandomForestRegressor(n_estimators=100, random_state=0, max_features=1.0)
cv_results = cross_validate(
    rfr,
    Xb,
    yb,
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means the forest beats the linear baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.8258194341138548
-48058.22618744958
difference entre base r2 et nouveau r2 : 0.19099452095193659
difference entre base rmse et nouveau rmse :  21554.13104870698


# 11ème itération / Random Forest + feature eng

In [219]:
# Load the pickled frame for iteration 11 (feature engineering) and display it.
# This frame includes the target `median_house_value`, the one-hot
# `ocean_proximity_*` columns, a `ratio_total_rooms/population` column and
# one-hot `group_house_age_*` bins.
df_fe = pd.read_pickle("data/EDA_iter_11_feature_engi.pkl")
df_fe

Unnamed: 0,id,longitude,latitude,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,...,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,ratio_total_rooms/population,"group_house_age_0,10","group_house_age_10,20","group_house_age_20,30","group_house_age_30,40","group_house_age_40,50","group_house_age_50,60"
0,2072.0,-119.84,36.77,1853.0,473.0,1397.0,417.0,1.4817,72000.0,0.0,...,0.0,0.0,0.0,1.326414,1.0,0.0,0.0,0.0,0.0,0.0
1,10600.0,-117.80,33.68,2032.0,349.0,862.0,340.0,6.9133,274100.0,1.0,...,0.0,0.0,0.0,2.357309,1.0,0.0,0.0,0.0,0.0,0.0
2,2494.0,-120.19,36.60,875.0,214.0,931.0,214.0,1.5536,58300.0,0.0,...,0.0,0.0,0.0,0.939850,0.0,0.0,1.0,0.0,0.0,0.0
3,4284.0,-118.32,34.10,622.0,229.0,597.0,227.0,1.5284,200000.0,1.0,...,0.0,0.0,0.0,1.041876,0.0,0.0,0.0,1.0,0.0,0.0
4,16541.0,-121.23,37.79,1922.0,373.0,1130.0,372.0,4.0815,117900.0,0.0,...,0.0,0.0,0.0,1.700885,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,1465.0,278.0,745.0,250.0,3.0625,93800.0,0.0,...,0.0,0.0,0.0,1.966443,0.0,1.0,0.0,0.0,0.0,0.0
16508,18898.0,-122.25,38.11,2365.0,504.0,1131.0,458.0,2.6133,103100.0,0.0,...,0.0,1.0,0.0,2.091070,0.0,0.0,0.0,0.0,1.0,0.0
16509,11798.0,-121.22,38.92,2531.0,461.0,1206.0,429.0,4.4958,192600.0,0.0,...,0.0,0.0,0.0,2.098673,0.0,1.0,0.0,0.0,0.0,0.0
16510,6637.0,-118.14,34.16,2776.0,840.0,2546.0,773.0,2.5750,153500.0,1.0,...,0.0,0.0,0.0,1.090338,0.0,0.0,0.0,1.0,0.0,0.0


In [220]:
# Random forest (50 trees) cross-validated on the feature-engineered frame.
# Features and target both come from df_fe.
#
# max_features=1.0 (consider every feature at each split) replaces the former
# 'auto' value, which was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a regressor both settings use all features, so results are unchanged.
rfr = RandomForestRegressor(n_estimators=50, random_state=0, max_features=1.0)
cv_results = cross_validate(
    rfr,
    df_fe.drop(columns=['median_house_value']),
    df_fe["median_house_value"],
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this model beats the linear baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.830531640089899
-47406.79693746778
difference entre base r2 et nouveau r2 : 0.19570672692798075
difference entre base rmse et nouveau rmse :  22205.560298688782


In [222]:
# Fit the forest on the whole feature-engineered frame (no hold-out) to obtain
# the model that gets persisted. `fit` returns the estimator itself, so
# `final_model` and `rfr` refer to the same fitted object at this point.
final_model = rfr.fit(
    df_fe.drop(columns=['median_house_value']),
    df_fe["median_house_value"],
)

In [225]:
import pickle

# Persist the fitted model. The context manager closes the file even if
# pickle.dump raises, instead of relying on a manual close() call.
with open("data/final_model", 'wb') as outfile:
    pickle.dump(final_model, outfile)

# 12ème itération / vérification de la 1ère inférence

In [227]:
# Re-run the feature-engineered random forest without four of the house-age
# bins (only the "10,20" and "50,60" bins are kept) to check how much those
# columns contribute to the score.
#
# max_features=1.0 (consider every feature at each split) replaces the former
# 'auto' value, which was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a regressor both settings use all features, so results are unchanged.
rfr = RandomForestRegressor(n_estimators=50, random_state=0, max_features=1.0)
excluded_columns = [
    'median_house_value',
    'group_house_age_0,10',
    'group_house_age_20,30',
    'group_house_age_30,40',
    'group_house_age_40,50',
]
cv_results = cross_validate(
    rfr,
    df_fe.drop(columns=excluded_columns),
    df_fe["median_house_value"],
    cv=5,
    scoring=('r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'),
)

# Average each metric over the folds once, then report it next to the baseline.
mean_r2 = cv_results['test_r2'].mean()
mean_neg_rmse = cv_results['test_neg_root_mean_squared_error'].mean()

print('base r2 :', base_r2, '\n','base_rmse : ', base_rmse)
print(mean_r2)
print(mean_neg_rmse)
# Both scores are "higher is better" (the RMSE is negated), so a positive
# difference means this model beats the linear baseline.
print('difference entre base r2 et nouveau r2 :', mean_r2 - base_r2)
print('difference entre base rmse et nouveau rmse : ', mean_neg_rmse - base_rmse)

base r2 : 0.6348249131619182 
 base_rmse :  -69612.35723615656
0.8304093008945095
-47427.397543672974
difference entre base r2 et nouveau r2 : 0.19558438773259124
difference entre base rmse et nouveau rmse :  22184.959692483586
