In [1]:
import os

import joblib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# ============================================
# Step 2: Load the data
# ============================================

# Location of the Kaggle train.csv file. It defaults to a path relative to the
# notebook's working directory so the notebook runs unchanged on any machine
# (local checkout, Colab, CI). To read a copy stored elsewhere, set the
# HOUSE_PRICE_TRAIN_CSV environment variable instead of hard-coding a
# user-specific absolute path in the notebook.
TRAIN_CSV_PATH = os.environ.get("HOUSE_PRICE_TRAIN_CSV", "train.csv")

df = pd.read_csv(TRAIN_CSV_PATH)

# Preview the first rows to confirm the file was parsed as expected.
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# ============================================
# Step 3: Select the numeric features
# ============================================

# Keep only the int64/float64 columns, then discard every row that still
# has a missing value in any of those columns.
numeric_dtypes = ["int64", "float64"]
df_num = df.select_dtypes(include=numeric_dtypes).dropna()

df_num.head()


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [4]:
# ============================================
# Step 4: Separate X (features) from y (SalePrice)
# + bin the target for stratification
# ============================================

target_col = "SalePrice"

# Features: every remaining numeric column except the target.
# NOTE(review): the Id column is still part of X at this point — confirm that
# a row identifier is really meant to be used as a model feature.
X = df_num.drop(columns=[target_col])

# Target: the sale price to predict.
y = df_num[target_col]

# Cut the continuous target into quantile bins (10 requested; non-unique bin
# edges are dropped) so the train/test split can be stratified on them.
y_binned = pd.qcut(y, q=10, duplicates="drop")

y_binned.head()


0    (202900.0, 239000.0]
1    (180000.0, 202900.0]
2    (202900.0, 239000.0]
3    (136500.0, 148000.0]
4    (239000.0, 290000.0]
Name: SalePrice, dtype: category
Categories (10, interval[float64, right]): [(35310.999, 109500.0] < (109500.0, 125000.0] < (125000.0, 136500.0] < (136500.0, 148000.0] ... (180000.0, 202900.0] < (202900.0, 239000.0] < (239000.0, 290000.0] < (290000.0, 755000.0]]

In [5]:
# ============================================
# Step 5: Train / test split
# ============================================

# Hold out 20% of the rows for testing. The fixed random_state makes the split
# repeatable, and stratifying on the binned target means each price bin is
# represented proportionally in both subsets.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_binned}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train.shape, X_test.shape


((896, 37), (225, 37))

In [6]:
# ============================================
# Step 6: Train the LinearRegression model
# ============================================

# Create the estimator with its default settings, then fit it on the
# training split only (the test split stays unseen until evaluation).
model = LinearRegression()
model.fit(X=X_train, y=y_train)


In [7]:
# ============================================
# Step 7: Evaluate the model
# ============================================

def evaluate(model, X, y, label=""):
    """Print the RMSE and R² of ``model`` on the given data.

    Args:
        model: Estimator exposing a ``predict(X)`` method.
        X: Feature data passed to ``model.predict``.
        y: True target values the predictions are compared against.
        label: Name shown in the header line of the report (e.g. "Train").

    Returns:
        None. The metrics are printed, not returned.
    """
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    # Build every report line before printing so that a failure while
    # computing a metric does not leave a half-printed report.
    report_lines = (
        f"--- {label} ---",
        f"RMSE : {np.sqrt(mse)}",
        f"R²   : {r2_score(y, predictions)}\n",
    )
    for line in report_lines:
        print(line)

# Report the metrics on the training split first, then on the held-out test split.
for split_label, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    evaluate(model, split_X, split_y, split_label)


--- Train ---
RMSE : 37943.04319399305
R²   : 0.7945353068120273

--- Test ---
RMSE : 28779.98779633049
R²   : 0.8703307991235971



In [8]:
# ============================================
# Step 8: Save the model
# ============================================

# Serialize the trained estimator to model.pkl, relative to the working directory.
joblib.dump(value=model, filename="model.pkl")

# Confirmation message (kept in French, as emitted by the original notebook).
print("✔ Modèle sauvegardé sous model.pkl")


✔ Modèle sauvegardé sous model.pkl


In [9]:
# Re-read train.csv from the working directory, replacing the `df` loaded earlier.
# NOTE(review): nothing visible after this cell uses the reloaded frame —
# confirm whether this cell is still needed.
df = pd.read_csv(filepath_or_buffer="train.csv")
