In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LinearRegression

# 1ère itération / Baseline

In [26]:
# Iteration-1 dataset used for the baseline model.
# NOTE(review): unpickling can execute arbitrary code — only load pickles
# generated by this project.
df_fillna = pd.read_pickle(filepath_or_buffer="data/EDA_iter_1.pkl")
df_fillna

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...
16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


## Hold out

In [21]:
# Split the frame into features and target. `ocean_proximity` is dropped
# because it holds string categories that LinearRegression cannot consume.
# NOTE(review): `id` is kept in X and therefore used as a feature — confirm
# this is intended.
X = df_fillna.drop(["median_house_value", "ocean_proximity"], axis=1)
y = df_fillna.loc[:, "median_house_value"]

# Hold out 30 % of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Instantiate the baseline estimator and fit it on the training part only.
model = LinearRegression()
model.fit(X_train, y_train)

# Score of the fitted model on the held-out rows (R² for a regressor).
model.score(X_test, y_test)

0.631187139333667

## Cross validation

In [23]:
# 5-fold cross-validation of the baseline model on the full X / y
# defined in the hold-out cell.
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5)

# Average of the per-fold test scores.
np.mean(cv_results["test_score"])

0.6348249131619182

# 2ème itération / imputed value

In [29]:
# Iteration-2 dataset (file name suggests imputed missing values — TODO confirm
# against the EDA step that wrote it).
df_imputed = pd.read_pickle(filepath_or_buffer="data/EDA_iter_2_imputed.pkl")
df_imputed

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,2072.0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0
1,10600.0,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0
2,2494.0,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0
3,4284.0,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0
4,16541.0,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0
...,...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0
16508,18898.0,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0
16509,11798.0,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0
16510,6637.0,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0


In [43]:
# Features: every column except the target.
# NOTE(review): this keeps `id` as a feature — confirm this is intended.
X = df_imputed.drop("median_house_value", axis=1)
y = df_imputed.loc[:, "median_house_value"]

# 5-fold cross-validation with the shared `model` estimator.
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5)

# Average of the per-fold test scores.
np.mean(cv_results["test_score"])

0.6359336753375091

# 3ème itération / Imputation iterative

In [44]:
# Iteration-3 dataset (file name suggests iterative imputation — TODO confirm
# against the EDA step that wrote it).
df_iter_imputed = pd.read_pickle(
    filepath_or_buffer="data/EDA_iter_3_iter_imputed.pkl"
)
df_iter_imputed

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,2072.0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0
1,10600.0,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0
2,2494.0,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0
3,4284.0,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0
4,16541.0,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0
...,...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0
16508,18898.0,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0
16509,11798.0,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0
16510,6637.0,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0


In [47]:
# Features: every column except the target.
# NOTE(review): this keeps `id` as a feature — confirm this is intended.
X = df_iter_imputed.drop("median_house_value", axis=1)
y = df_iter_imputed.loc[:, "median_house_value"]

# 5-fold cross-validation with the shared `model` estimator.
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5)

# Average of the per-fold test scores.
np.mean(cv_results["test_score"])

0.6374647793620855

# 4ème itération / sans outliers

In [49]:
# Iteration-4 dataset (file name suggests outliers were removed — TODO confirm
# against the EDA step that wrote it).
df_without_outliers = pd.read_pickle(
    filepath_or_buffer="data/EDA_iter_4_without_outliers.pkl"
)
df_without_outliers

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,2072.0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0
1,10600.0,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0
2,2494.0,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0
3,4284.0,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0
4,16541.0,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0
...,...,...,...,...,...,...,...,...,...,...
16507,1099.0,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0
16508,18898.0,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0
16509,11798.0,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0
16510,6637.0,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0


In [50]:
# Features: every column except the target.
# NOTE(review): this keeps `id` as a feature — confirm this is intended.
X = df_without_outliers.drop("median_house_value", axis=1)
y = df_without_outliers.loc[:, "median_house_value"]

# 5-fold cross-validation with the shared `model` estimator.
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5)

# Average of the per-fold test scores.
np.mean(cv_results["test_score"])

0.633485278232603

# 5ème itération / Standardisé (z-score)

In [51]:
# Iteration-5 dataset. The displayed values are centred around 0 with negative
# entries, which looks like z-score scaling — TODO confirm against the EDA step.
df_norm = pd.read_pickle(filepath_or_buffer="data/EDA_iter_5_norm.pkl")
df_norm

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-1.384010,-0.137635,0.534564,-1.795939,-0.357368,-0.156346,-0.032827,-0.218173,-1.258403,-1.167387
1,0.051247,0.879836,-0.909979,-1.637178,-0.276515,-0.448073,-0.494784,-0.417841,1.610623,0.586611
2,-1.312987,-0.312201,0.455091,-0.287715,-0.799127,-0.765679,-0.435204,-0.744572,-1.220425,-1.286288
3,-1.011731,0.620480,-0.713633,0.188566,-0.913406,-0.730389,-0.723603,-0.710862,-1.233736,-0.056493
4,1.051114,-0.830911,1.011403,-0.605236,-0.326201,-0.391610,-0.263373,-0.334862,0.114837,-0.769028
...,...,...,...,...,...,...,...,...,...,...
16507,-1.547765,-1.165080,1.852884,-0.684616,-0.532626,-0.615110,-0.595810,-0.651220,-0.423409,-0.978188
16508,1.447795,-1.339646,1.161000,1.617410,-0.126100,-0.083414,-0.262510,-0.111856,-0.660681,-0.897475
16509,0.252870,-0.825923,1.539666,-0.763996,-0.051119,-0.184578,-0.197750,-0.187056,0.333675,-0.120716
16510,-0.615723,0.710257,-0.685584,0.823608,0.059547,0.707071,0.959301,0.704970,-0.680911,-0.460060


In [52]:
# Features: every column except the target.
# NOTE(review): the target column is scaled as well and `id` is kept as a
# feature — confirm both are intended.
X = df_norm.drop("median_house_value", axis=1)
y = df_norm.loc[:, "median_house_value"]

# 5-fold cross-validation with the shared `model` estimator.
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5)

# Average of the per-fold test scores.
# NOTE(review): the recorded value equals the iteration-3 score up to float
# noise; per-column linear rescaling is not expected to change R² for OLS.
np.mean(cv_results["test_score"])

0.6374647793620909

# 6ème itération / Normalisé (min-max)

In [53]:
# Iteration-6 dataset. The displayed values all lie in [0, 1], which looks like
# min-max scaling — TODO confirm against the EDA step.
df_minmax = pd.read_pickle(filepath_or_buffer="data/EDA_iter_6_minmax.pkl")
df_minmax

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,0.100392,0.449203,0.449522,0.098039,0.046981,0.074995,0.039071,0.068257,0.067709,0.117527
1,0.513591,0.652390,0.121148,0.137255,0.051534,0.055789,0.024076,0.055592,0.442297,0.534227
2,0.120839,0.414343,0.431456,0.470588,0.022104,0.034879,0.026010,0.034868,0.072668,0.089280
3,0.207568,0.600598,0.165781,0.588235,0.015669,0.037202,0.016648,0.037007,0.070930,0.381444
4,0.801444,0.310757,0.557917,0.392157,0.048736,0.059506,0.031587,0.060855,0.247003,0.212166
...,...,...,...,...,...,...,...,...,...,...
16507,0.053249,0.244024,0.749203,0.372549,0.037111,0.044792,0.020797,0.040789,0.176729,0.162476
16508,0.915645,0.209163,0.591923,0.941176,0.060004,0.079797,0.031615,0.075000,0.145750,0.181651
16509,0.571636,0.311753,0.678002,0.352941,0.064226,0.073137,0.033717,0.070230,0.275576,0.366186
16510,0.321576,0.618526,0.172157,0.745098,0.070458,0.131840,0.071274,0.126809,0.143108,0.285568


In [54]:
# Features: every column except the target.
# NOTE(review): the target column is scaled as well and `id` is kept as a
# feature — confirm both are intended.
X = df_minmax.drop("median_house_value", axis=1)
y = df_minmax.loc[:, "median_house_value"]

# 5-fold cross-validation with the shared `model` estimator.
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5)

# Average of the per-fold test scores.
# NOTE(review): the recorded value equals the iteration-3 score up to float
# noise; per-column linear rescaling is not expected to change R² for OLS.
np.mean(cv_results["test_score"])

0.6374647793620911

# 7ème itération / 