In [1]:
# import des librairies de bases
import pandas as pd
import numpy as np



# import des librairies de préprocessing
from sklearn.preprocessing import RobustScaler


# import des librairies machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Importation des librairies nécessaires
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import pickle

In [2]:
# Load the housing dataset from the CSV file (path relative to the working directory).
df = pd.read_csv(filepath_or_buffer="immo_df.csv")

In [3]:
# Count the missing values in every column.
df.isna().sum(axis=0)

Unnamed: 0              0
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        176
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [4]:
# Remove the "Unnamed: 0" column (presumably the index that was written out
# with the CSV); it is not a feature.
# errors="ignore" keeps this cell re-runnable: a second execution no longer
# raises KeyError once the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,-117.8,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,-120.19,36.6,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,-118.32,34.1,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND


In [5]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

In [6]:
# One-hot encode the non-numeric columns (with its default arguments pandas
# converts the object/string/category columns and leaves the others as is),
# then display the resulting frame.
df = pd.get_dummies(data=df)
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,0,1,0,0,0
1,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,1,0,0,0,0
2,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,0,1,0,0,0
3,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,1,0,0,0,0
4,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,0,1,0,0,0
16508,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,0,0,0,1,0
16509,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,0,1,0,0,0
16510,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,1,0,0,0,0


In [7]:
# Separate the target (y) from the explanatory features (X).
y = df["median_house_value"]
X = df.drop(columns=["median_house_value"])


In [8]:
# Show the feature names as a plain Python list.
X.columns.tolist()

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity_<1H OCEAN',
 'ocean_proximity_INLAND',
 'ocean_proximity_ISLAND',
 'ocean_proximity_NEAR BAY',
 'ocean_proximity_NEAR OCEAN']

In [9]:
# Log-transform the numeric features listed below.
# The values are always read from `df`, which still holds the untransformed
# columns (`X` is a separate frame produced by `df.drop`). Re-running this
# cell therefore gives the same result instead of taking the log of a log.
log_features = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
# .abs() is kept from the original code; note that np.log still yields -inf
# for a value of exactly 0.
X[log_features] = np.log(df[log_features].abs())

In [10]:
# Log-transform the target. Reading from `df` rather than from `y` makes the
# cell idempotent: re-running it no longer takes the log of an already
# log-transformed target. The result stays a one-column DataFrame, as before.
y = pd.DataFrame(np.log(df["median_house_value"].abs()))

In [11]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [12]:
# Single-step pipeline: a plain linear regression, with no scaling step.
pipeline = Pipeline(steps=[("regression", LinearRegression())])

In [13]:
# Fit the pipeline on the training split.
pipeline.fit(X_train, y=y_train)

In [14]:
# Predict the target for the held-out test split.
y_pred = pipeline.predict(X=X_test)

In [15]:
# Evaluate on the test split. The target was log-transformed earlier in the
# notebook, so these errors are expressed in log units of the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the scores, one per line.
for label, value in (
    ("MAE :", mae),
    ("R2 score :", r2),
    ("MSE :", mse),
    ("RMSE :", rmse),
):
    print(label, value)

MAE : 0.2382253559974057
R2 score : 0.7019796191119421
MSE : 0.09917442738047878
RMSE : 0.3149197157697161


In [16]:
# 10-fold cross-validation on the training split. This scorer returns the
# negated MAE, hence the sign flip before printing the mean.
scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring="neg_mean_absolute_error",
    cv=10,
)
print("MAE moyen pour la validation croisée :", -scores.mean())

MAE moyen pour la validation croisée : 0.23603539997678497


In [18]:
# Save the fitted pipeline to a pickle file. The context manager closes (and
# flushes) the file handle even if pickling fails; the previous bare open()
# call never closed it.
with open("model_log.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)

## Modèle 2 avec RobustScaler

In [19]:
# Rebuild the features and the target from `df` for the second model (no log
# transform is applied in this cell), then hold out 20% of the rows for
# testing with a fixed seed.
y = df["median_house_value"]
X = df.drop(columns=["median_house_value"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [20]:
# The previous cell already produced this exact split (same inputs, same
# random_state), so the duplicated train_test_split call was redundant.
# Check the split's invariants instead of recomputing it.
assert len(X_train) + len(X_test) == len(X)
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

In [21]:
# Two-step pipeline: robust scaling of the features, then a linear regression.
pipeline = Pipeline(
    steps=[
        ("scaler", RobustScaler()),
        ("regression", LinearRegression()),
    ]
)

In [22]:
# Fit the scaler + regression pipeline on the training split.
pipeline.fit(X_train, y=y_train)

In [23]:
# Predict the target for the held-out test split.
y_pred = pipeline.predict(X=X_test)

In [24]:
# Evaluate on the test split. For this model `y` was rebuilt from the raw
# `median_house_value` column, so the errors are in the target's own units.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the scores, one per line.
for label, value in (
    ("MAE :", mae),
    ("R2 score :", r2),
    ("MSE :", mse),
    ("RMSE :", rmse),
):
    print(label, value)

MAE : 50523.82321189527
R2 score : 0.6510124588022759
MSE : 4784663574.560258
RMSE : 69171.26263528994


In [25]:
# 10-fold cross-validation on the training split. This scorer returns the
# negated MAE, hence the sign flip before printing the mean.
scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring="neg_mean_absolute_error",
    cv=10,
)
print("MAE moyen pour la validation croisée :", -scores.mean())

MAE moyen pour la validation croisée : 49526.23555808291


In [26]:
# Save the fitted pipeline to a pickle file. The context manager closes (and
# flushes) the file handle even if pickling fails; the previous bare open()
# call never closed it.
with open("model_robust.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)