# Pipeline et modèle

In [161]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

import pickle

In [162]:
df = pd.read_csv("../data/AmesHousing2.csv")

In [163]:
df['Total SF'] = df['1st Flr SF'] + df['2nd Flr SF'] + df['Total Bsmt SF']

In [164]:
df['Bath'] = df['Full Bath'] + df['Bsmt Full Bath'] 

In [165]:
# Création des variables d'ancienneté de la maison et des rénovations
df['Age_house'] = df['Yr Sold'] - df['Year Built']


Il ne faut selectionner que 10 features, on supprime Neighborhood qui est trop complexe à traiter, ainsi que bsmt Qual et garage Finish qui sont trop proches d'autres variables

In [166]:
numeric_features = ["Age_house", "Total SF", "Gr Liv Area", "Garage Area", "Overall Qual", "Bath"]
ordinal_features = [ "Exter Qual",  "Kitchen Qual"]
cat_feature = ["Neighborhood"]
all_col = numeric_features.copy()
all_col.extend(ordinal_features)
all_col.extend(cat_feature)

X = df[all_col]
y = df[["SalePrice"]]

X["Total SF"].fillna(1052, inplace=True)
X["Garage Area"].fillna(472, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Total SF"].fillna(1052, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Garage Area"].fillna(472, inplace=True)


In [167]:
X.columns = ['Age_house', 'Total_SF', 'Gr_Liv_Area','Garage_Area', 'Overall_Qual', 'Bath', 'Exter_Qual',
       'Kitchen_Qual', 'Neighborhood']

In [168]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [169]:


exter_cat = [ 'Po', 'Fa','TA', 'Gd','Ex']
kitchen_cat = [ 'Po', 'Fa','TA', 'Gd',"Ex"]

ordinal_transformer = OrdinalEncoder(categories=[exter_cat, kitchen_cat])

categorical_transformer = OneHotEncoder()

In [170]:
from sklearn.preprocessing import StandardScaler
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())  # Adding StandardScaler() to scale numeric features
])

In [171]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
import numpy as np

# Définition des transformations pour les variables numériques, ordinales et catégorielles
numeric_features = ["Age_house", "Total_SF", "Gr_Liv_Area", "Garage_Area", "Overall_Qual", "Bath"]
ordinal_features = ["Exter_Qual", "Kitchen_Qual"]
cat_feature = ["Neighborhood"]



# Prétraitement des données avec ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, cat_feature),
    ]
)


In [172]:
reg = LinearRegression()

In [173]:
pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('reg', reg)
])

pipe.fit(X_train, y_train) 

In [174]:
pipe.score(X_test,y_test)

predict_train  = pipe.predict(X_train)
predict_test  = pipe.predict(X_test)

# Root Mean Squared Error on train and test date
print('MAE on train data: ', mean_absolute_error(y_train, predict_train))
print('MAE on test data: ',  mean_absolute_error(y_test, predict_test))

MAE on train data:  20076.616209297794
MAE on test data:  21144.830787762086


In [180]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso,Ridge
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import mean_absolute_error
import numpy as np



# Initialisation du modèle Lasso
Ridge =Ridge()

# Définition des hyperparamètres à tester pour le modèle Lasso
param_grid = {'ridge__alpha': [0.0001,0.0005,0.001,0.005]}

# Initialisation du pipeline avec prétraitement et modèle
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('ridge',Ridge)])

# Recherche du meilleur hyperparamètre alpha avec validation croisée (GridSearchCV)
Ridge_model = GridSearchCV(pipeline, param_grid, cv=5)
Ridge_model.fit(X_train, y_train)

# Affichage des meilleurs paramètres trouvés
print("Meilleurs paramètres trouvés:", Ridge_model.best_params_)

# Prédictions sur les données d'entraînement
y_pred_train = Ridge_model.predict(X_train)

# Calcul de la MAE sur les données d'entraînement
mae_train = mean_absolute_error(y_train, y_pred_train)
print("MAE sur les données d'entraînement:", mae_train)

# Prédictions sur les données de test
y_pred_test = Ridge_model.predict(X_test)

# Calcul de la MAE sur les données de test
mae_test = mean_absolute_error(y_test, y_pred_test)
print("MAE sur les données de test:", mae_test)


Traceback (most recent call last):
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/pipeline.py", line 749, in score
    Xt = transform.transform(Xt)
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 816, in transform
    Xs = self._fit_transform(
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 670, 

Meilleurs paramètres trouvés: {'ridge__alpha': 0.0001}
MAE sur les données d'entraînement: 20078.780810039287
MAE sur les données de test: 21151.758082100834




In [176]:
Lasso = Lasso(alpha=0.01)

In [177]:
pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('las', Lasso)
])

pipe.fit(X_train, y_train) 

  model = cd_fast.sparse_enet_coordinate_descent(


In [179]:
pipe.score(X_test,y_test)

predict_train  = pipe.predict(X_train)
predict_test  = pipe.predict(X_test)

# Root Mean Squared Error on train and test date
print('MAE on train data: ', mean_absolute_error(y_train, predict_train))
print('MAE on test data: ',  mean_absolute_error(y_test, predict_test))

MAE on train data:  20076.616013181065
MAE on test data:  21144.79731355683


In [75]:
filename = '../real_estate_app/main_app/static/models/finalized_model.pkl'
pickle.dump(pipe, open(filename, 'wb'))

In [76]:
dico = {'Age_house': 15, 'Total_SF': 1, 'Gr_Liv_Area': 1, 'Garage_Area': 1, 'Overall_Qual': 1, 'Bath': 1, 'Exter_Qual': 'Po', 'Kitchen_Qual': 'Po', 'Neighborhood': 'Gilbert'}
