In [1]:
# =========================
# Import des bibliothèques
# =========================
# numpy : calcul numérique
# pandas : manipulation des données tabulaires
# matplotlib / seaborn : visualisation des données

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Never truncate the column list when a wide DataFrame is displayed
pd.options.display.max_columns = None

# Apply the seaborn theme with the "whitegrid" axes style to all plots
sns.set(style="whitegrid")

In [7]:
# -------------------------------------------------------------------
# Data loading
# -------------------------------------------------------------------
# train: explanatory variables plus the target column (SalePrice)
# test : explanatory variables only
train_path = "../src/data/train.csv"
test_path = "../src/data/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Quick look at the first rows of the training set
train.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# -------------------------------------------------------------------
# Preprocessing: impute missing values, one-hot encode, log-transform
# -------------------------------------------------------------------
# NOTE(review): the imputers and the encoder below are fitted on the whole
# training frame, i.e. before the train/validation split and the K-fold
# cross-validation performed later in the notebook. Validation rows therefore
# influence the fitted medians and categories, so the reported scores may be
# slightly optimistic.

# Target and features; 'Id' is an identifier, not a predictor
y = train['SalePrice']
X = train.drop(columns=['SalePrice', 'Id'])

# Keep the test identifiers aside and drop them from the test features
test_ids = test['Id']
X_test = test.drop(columns=['Id'])

# Column groups are derived from the dtypes of the training features only.
# Columns whose dtype is neither int64/float64 nor object end up in no group
# and are left out of the final matrices.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Missing values: median for numeric columns, the literal string 'None'
# for categorical columns. Fit on train, apply unchanged to test.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='constant', fill_value='None')

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

# One-hot encoding as a dense array; categories that only appear in the
# test set are ignored (encoded as all zeros) instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_cat = encoder.fit_transform(X[cat_cols])
X_test_cat = encoder.transform(X_test[cat_cols])

# Numeric part as plain arrays
X_num = X[num_cols].to_numpy()
X_test_num = X_test[num_cols].to_numpy()

# Final design matrices: numeric columns first, then the one-hot columns
X_final = np.concatenate([X_num, X_cat], axis=1)
X_test_final = np.concatenate([X_test_num, X_test_cat], axis=1)

# Models are trained on the natural log of the sale price
y_log = np.log(y)


In [12]:
# -------------------------------------------------------------------
# Train / validation split
# -------------------------------------------------------------------
# Hold out 20% of the rows so models can be scored on data they were not
# fitted on; the fixed seed makes the split reproducible.

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_final, y_log, test_size=0.2, random_state=42
)


In [14]:
# -------------------------------------------------------------------
# Evaluation metric
# -------------------------------------------------------------------
# RMSE (root mean squared error):
# - penalizes large errors heavily
# - standard metric for regression

from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, passed with ``y_true`` to
        ``mean_squared_error``.

    Returns
    -------
    float
        Square root of the mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


In [15]:
# Reference model (baseline): plain linear regression

from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)

# Validation RMSE, measured on the log-transformed target
val_pred_lr = lr.predict(X_val)
rmse_lr = rmse(y_val, val_pred_lr)


In [16]:
# Regularized linear models, used to limit overfitting

from sklearn.linear_model import Ridge, Lasso

# Ridge: fit, then score on the validation split
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Lasso: fit, then score on the validation split
lasso = Lasso(alpha=0.0005)
lasso.fit(X_train, y_train)

val_pred_ridge = ridge.predict(X_val)
val_pred_lasso = lasso.predict(X_val)

rmse_ridge = rmse(y_val, val_pred_ridge)
rmse_lasso = rmse(y_val, val_pred_lasso)


In [17]:
# Non-linear, tree-based model: random forest

from sklearn.ensemble import RandomForestRegressor

# 200 trees; the fixed seed keeps the fitted forest reproducible
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

val_pred_rf = rf.predict(X_val)
rmse_rf = rmse(y_val, val_pred_rf)


In [21]:
# Sequential boosting model: gradient-boosted regression trees

from sklearn.ensemble import GradientBoostingRegressor

gbr_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)
gbr = GradientBoostingRegressor(**gbr_params)
gbr.fit(X_train, y_train)

val_pred_gbr = gbr.predict(X_val)
rmse_gbr = rmse(y_val, val_pred_gbr)


In [22]:
# -------------------------------------------------------------------
# Cross-validation
# -------------------------------------------------------------------
# Scores the gradient boosting model on 5 shuffled folds to see how much its
# error varies from one split to another.
# NOTE(review): X_final was imputed and encoded on the full training set
# before this step, so the folds are not fully independent of preprocessing.

from sklearn.model_selection import KFold, cross_val_score

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# 'neg_root_mean_squared_error' yields one value per fold equal to -RMSE
scores = cross_val_score(gbr, X_final, y_log, cv=cv, scoring='neg_root_mean_squared_error')

# Flip the sign to report the mean RMSE as a positive number
rmse_moy = -scores.mean()
# Sample standard deviation across folds; negating the scores does not change
# their spread, so it is computed on the raw (negative) scores directly
rmse_std = scores.std(ddof=1)
# Alias kept for the report below; a standard deviation is already >= 0
rmse_std_pos = rmse_std

print(f"RMSE (CV=5) : {rmse_moy:.4f} ± {rmse_std_pos:.4f}")



RMSE (CV=5) : 0.1305 ± 0.0234


In [23]:
#4

In [24]:
# -------------------------------------------------------------------
# Final comparison
# -------------------------------------------------------------------
# Validation RMSE of each fitted model, keyed by display name
model_scores = {
    "Linear": rmse_lr,
    "Ridge": rmse_ridge,
    "Lasso": rmse_lasso,
    "Random Forest": rmse_rf,
    "Gradient Boosting": rmse_gbr,
}

results = pd.DataFrame({
    "Modèle": list(model_scores.keys()),
    "RMSE": list(model_scores.values()),
})

# Best (lowest RMSE) model first
results.sort_values("RMSE")


Unnamed: 0,Modèle,RMSE
2,Lasso,0.128315
1,Ridge,0.131144
0,Linear,0.13214
4,Gradient Boosting,0.135525
3,Random Forest,0.145935
