# Pipeline et modèle

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

import pickle

In [2]:
df = pd.read_csv("../data/AmesHousing2.csv")

Il ne faut selectionner que 10 features, on supprime Neighborhood qui est trop complexe à traiter, ainsi que bsmt Qual et garage Finish qui sont trop proches d'autres variables

In [7]:
df['Total SF'] = df['1st Flr SF'] + df['2nd Flr SF'] + df['Total Bsmt SF']


In [19]:
numeric_features = ["Year Built", 'Year_Remod_Diff', "Total SF", "Gr Liv Area", "Garage Area", "Overall Qual", "Full Bath"]
ordinal_features = [ "Exter Qual",  "Kitchen Qual"]
cat_feature = ["Neighborhood"]
all_col = numeric_features.copy()
all_col.extend(ordinal_features)
all_col.extend(cat_feature)

X = df[all_col]
y = df[["SalePrice"]]

X["Total SF"].fillna(1052, inplace=True)
X["Garage Area"].fillna(472, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Total SF"].fillna(1052, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Garage Area"].fillna(472, inplace=True)


In [20]:
X.columns = ['Year_Built', 'Total SF', 'Gr_Liv_Area','Garage_Area', 'Overall_Qual', 'Full_Bath', 'Exter_Qual',
       'Kitchen_Qual', 'Neighborhood','Year_Remod_Diff']

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
numeric_transformer = SimpleImputer()

exter_cat = [ 'Po', 'Fa','TA', 'Gd','Ex']
kitchen_cat = [ 'Po', 'Fa','TA', 'Gd',"Ex"]

ordinal_transformer = OrdinalEncoder(categories=[exter_cat, kitchen_cat])

categorical_transformer = OneHotEncoder()

In [23]:
numeric_features = ["Year_Built", "Total SF", "Gr_Liv_Area", "Garage_Area", "Overall_Qual", "Full_Bath",'Year_Remod_Diff']
ordinal_features = [ "Exter_Qual",  "Kitchen_Qual"]
cat_feature = ["Neighborhood"]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, cat_feature)
    ]
)

In [24]:
reg = LinearRegression()

In [25]:
pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('reg', reg)
])

pipe.fit(X_train, y_train) 

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'NWAmes'

In [16]:
pipe.score(X_test,y_test)

predict_train  = pipe.predict(X_train)
predict_test  = pipe.predict(X_test)

# Root Mean Squared Error on train and test date
print('MAE on train data: ', mean_absolute_error(y_train, predict_train))
print('MAE on test data: ',  mean_absolute_error(y_test, predict_test))

MAE on train data:  20278.24764595024
MAE on test data:  21449.72410504371


In [83]:
filename = '../real_estate_app/main_app/static/models/finalized_model.pkl'
pickle.dump(pipe, open(filename, 'wb'))

In [84]:
dico = {'Year_Built': 2000, 'Total_Bsmt_SF': 1, '1st_Flr_SF': 1, 'Gr_Liv_Area': 1, 'Garage_Area': 1, 'Overall_Qual': 1, 'Full_Bath': 1, 'Exter_Qual': 'Po', 'Kitchen_Qual': 'Po', 'Neighborhood': 'Gilbert'}

In [None]:
help(pipe.predict)

In [86]:

# Assuming dico is a dictionary
df = pd.DataFrame(list(dico.values()))
predictions = pipe.predict(df)

KeyError: "None of [Index(['Year_Built', 'Total_Bsmt_SF', '1st_Flr_SF', 'Gr_Liv_Area',\n       'Garage_Area', 'Overall_Qual', 'Full_Bath'],\n      dtype='object')] are in the [columns]"

In [None]:
pipe.score({dico})

TypeError: unhashable type: 'dict'

In [None]:
# Création du pipeline incluant le préprocesseur et le modèle
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('lasso', Lasso())])  # Lasso est utilisé comme exemple, vous pouvez utiliser n'importe quel modèle

# Paramètres à rechercher pour Lasso
param_grid = {'lasso__alpha': [0.01, 0.1, 1, 10, 100]}  # Valeurs d'alpha pour Lasso

# Recherche des meilleurs paramètres avec GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Affichage des meilleurs paramètres et scores
print("Meilleurs paramètres:", grid_search.best_params_)
print("Meilleur score:", -grid_search.best_score_)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge

# Définition des modèles
lasso = Lasso()
ridge = Ridge()

# Paramètres à rechercher pour chaque modèle
param_grid_lasso = {'alpha': [0.01, 0.1, 1, 10, 100]}  # Valeurs d'alpha pour Lasso
param_grid_ridge = {'alpha': [0.01, 0.1, 1, 10, 100]}  # Valeurs d'alpha pour Ridge

# Recherche des meilleurs paramètres pour Lasso
grid_search_lasso = GridSearchCV(lasso, param_grid_lasso, cv=5, scoring='neg_mean_absolute_error')
grid_search_lasso.fit(X_train, y_train)

# Recherche des meilleurs paramètres pour Ridge
grid_search_ridge = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='neg_mean_absolute_error')
grid_search_ridge.fit(X_train, y_train)

# Affichage des meilleurs paramètres et scores pour Lasso
print("Meilleurs paramètres pour Lasso:", grid_search_lasso.best_params_)
print("Meilleur score pour Lasso:", -grid_search_lasso.best_score_)

# Affichage des meilleurs paramètres et scores pour Ridge
print("Meilleurs paramètres pour Ridge:", grid_search_ridge.best_params_)
print("Meilleur score pour Ridge:", -grid_search_ridge.best_score_)


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
Traceback (most recent call last):
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/home/apprenant/miniconda3/envs/e2_p/lib/python3.10/site-packages/sklearn/

Meilleurs paramètres: {'lasso__alpha': 0.01}
Meilleur score: nan


  model = cd_fast.sparse_enet_coordinate_descent(


In [None]:
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, ElasticNet,ElasticNetCV
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

alpha = np.linspace(0.01,1,100)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Assuming you have a preprocessor object called 'preprocessor'
pipeline = make_pipeline(preprocessor, RidgeCV(alphas=alpha, cv=7))
pipeline.fit(X_train, y_train)

best_alpha = pipeline.steps[-1][1].alpha_

In [None]:
best_alpha

0.22

In [None]:
from sklearn.linear_model import Ridge

# Create the final Ridge regression model with the best alpha
ridge_model = make_pipeline(preprocessor,Ridge(alpha=best_alpha))

# Fit the model on the full training data
ridge_model.fit(X_train, y_train)

In [None]:
ridge_model.score(X_test,y_test)

predict_train  = ridge_model.predict(X_train)
predict_test  = ridge_model.predict(X_test)

# Root Mean Squared Error on train and test date
print('MAE on train data: ', mean_absolute_error(y_train, predict_train))
print('MAE on test data: ',  mean_absolute_error(y_test, predict_test))

MAE on train data:  22546.72162314922
MAE on test data:  24119.364193839177


In [None]:
alpha = np.linspace(0.01,1,100)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Assuming you have a preprocessor object called 'preprocessor'
pipeline = make_pipeline(preprocessor, LassoCV(alphas=alpha, cv=7))
pipeline.fit(X_train, y_train)

best_alpha = pipeline.steps[-1][1].alpha_

  y = column_or_1d(y, warn=True)


In [None]:
# Create the final Ridge regression model with the best alpha
lasso_model = make_pipeline(preprocessor,Lasso(alpha=best_alpha))

# Fit the model on the full training data
lasso_model.fit(X_train, y_train)