# Pipeline et modèle

In [11]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

import pickle

In [12]:
df = pd.read_csv("../data/AmesHousing.csv")

Il ne faut selectionner que 10 features, on supprime Neighborhood qui est trop complexe à traiter, ainsi que bsmt Qual et garage Finish qui sont trop proches d'autres variables

In [13]:
numeric_features = ["Year Built", "Total Bsmt SF", "1st Flr SF", "Gr Liv Area", "Garage Area", "Overall Qual", "Full Bath"]
ordinal_features = [ "Exter Qual",  "Kitchen Qual"]
cat_feature = ["Neighborhood"]
all_col = numeric_features.copy()
all_col.extend(ordinal_features)
all_col.extend(cat_feature)

X = df[all_col]
y = df[["SalePrice"]]

X["Total Bsmt SF"].fillna(1052, inplace=True)
X["Garage Area"].fillna(472, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Total Bsmt SF"].fillna(1052, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Garage Area"].fillna(472, inplace=True)


In [14]:
X.columns = ['Year_Built', 'Total_Bsmt_SF', '1st_Flr_SF', 'Gr_Liv_Area','Garage_Area', 'Overall_Qual', 'Full_Bath', 'Exter_Qual',
       'Kitchen_Qual', 'Neighborhood']

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
numeric_transformer = SimpleImputer()

exter_cat = [ 'Po', 'Fa','TA', 'Gd','Ex']
kitchen_cat = [ 'Po', 'Fa','TA', 'Gd',"Ex"]

ordinal_transformer = OrdinalEncoder(categories=[exter_cat, kitchen_cat])

categorical_transformer = OneHotEncoder()

In [17]:
numeric_features = ["Year_Built", "Total_Bsmt_SF", "1st_Flr_SF", "Gr_Liv_Area", "Garage_Area", "Overall_Qual", "Full_Bath"]
ordinal_features = [ "Exter_Qual",  "Kitchen_Qual"]
cat_feature = ["Neighborhood"]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, cat_feature)
    ]
)

In [18]:
reg = LinearRegression()

In [19]:
pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('reg', reg)
])

pipe.fit(X_train, y_train) 

In [20]:
score = pipe.score(X_test,y_test)

predict_train  = pipe.predict(X_train)
predict_test  = pipe.predict(X_test)

mae_train = mean_absolute_error(y_train, predict_train)
mae_test = mean_absolute_error(y_test, predict_test)

print('R2: ', score)
print('MAE on train data: ', mae_train)
print('MAE on test data: ',  mae_test)

R2:  0.8464200968709472
MAE on train data:  20289.33639741511
MAE on test data:  21324.92012401977


In [21]:
modele_df = pd.DataFrame(columns=['model', 'R2', 'MAE_train', 'MAE_test'])

In [22]:
score

0.8464200968709472

In [23]:
col_dict = {
    'model': ["LinearRegression initial"],
    'R2': [score],
    'MAE_train': [mae_train],
    'MAE_test': [mae_test]
}

new_model = pd.DataFrame(col_dict)

name_model = new_model["model"].iloc[0]

modele_df = modele_df.append(new_model, ignore_index=True)

modele_df.to_csv(f"model-csv/{name_model}.csv", index=False)

  modele_df = modele_df.append(new_model, ignore_index=True)


In [24]:
modele_df

Unnamed: 0,model,R2,MAE_train,MAE_test
0,LinearRegression initial,0.84642,20289.336397,21324.920124


In [25]:
# filename = '../real_estate_app/main_app/static/models/finalized_model.pkl'
# pickle.dump(pipe, open(filename, 'wb'))