In [1]:
import pandas as pd

# Charger les données
data = pd.read_csv('../data/Housing.csv')

# Aperçu des données
print(data.head())
print(data.info())
print(data.describe())


      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 colu

In [4]:
import pandas as pd

# Charger les données
data = pd.read_csv('../data/Housing.csv')

# Liste des colonnes booléennes
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

# Encodage yes/no en 1/0
for col in binary_cols:
    data[col] = data[col].map({'yes': 1, 'no': 0})

# Vérifier les premières lignes après encodage
print(data.head())


      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

  furnishingstatus  
0        furnished  
1        furnished  
2   semi-furnished  
3        furnished  
4        furnished  


In [5]:
# Encodage one-hot de la colonne furnishingstatus
data = pd.get_dummies(data, columns=['furnishingstatus'], drop_first=True)

# Vérification des colonnes après encodage
print(data.head())


      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus_semi-furnished  furnishingstatus_unfurnished  
0                            False                         False  
1                       

In [6]:
from sklearn.preprocessing import StandardScaler

# Colonnes numériques à standardiser
numeric_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Standardisation
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Vérification des premières lignes après standardisation
print(data.head())


      price      area  bedrooms  bathrooms   stories  mainroad  guestroom  \
0  13300000  1.046726  1.403419   1.421812  1.378217         1          0   
1  12250000  1.757010  1.403419   5.405809  2.532024         1          0   
2  12250000  2.218232  0.047278   1.421812  0.224410         1          0   
3  12215000  1.083624  1.403419   1.421812  0.224410         1          0   
4  11410000  1.046726  1.403419  -0.570187  0.224410         1          1   

   basement  hotwaterheating  airconditioning   parking  prefarea  \
0         0                0                1  1.517692         1   
1         0                0                1  2.679409         0   
2         1                0                0  1.517692         1   
3         1                0                1  2.679409         1   
4         1                0                1  1.517692         0   

   furnishingstatus_semi-furnished  furnishingstatus_unfurnished  
0                            False                     

In [7]:
from sklearn.model_selection import train_test_split

# Séparation des caractéristiques (X) et de la cible (y)
X = data.drop(columns=['price'])
y = data['price']

# Division en ensembles d'entraînement, de validation et de test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)

# Vérification des tailles des ensembles
print(f"Taille Entraînement : {X_train.shape}, Validation : {X_val.shape}, Test : {X_test.shape}")


Taille Entraînement : (381, 13), Validation : (109, 13), Test : (55, 13)


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialisation du modèle
model = LinearRegression()

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédictions sur l'ensemble de validation
y_val_pred = model.predict(X_val)

# Évaluation des performances
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print(f"Mean Squared Error (Validation) : {mse}")
print(f"R2 Score (Validation) : {r2}")


Mean Squared Error (Validation) : 1682917323390.779
R2 Score (Validation) : 0.6411209053296594


In [9]:
# Prédictions sur l'ensemble de test
y_test_pred = model.predict(X_test)

# Évaluation des performances sur l'ensemble de test
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Mean Squared Error (Test) : {mse_test}")
print(f"R2 Score (Test) : {r2_test}")


Mean Squared Error (Test) : 1206130995429.3965
R2 Score (Test) : 0.6513313384781632


In [10]:
import joblib

# Sauvegarde du modèle dans le dossier models
joblib.dump(model, '../models/house_price_model.pkl')
print("Modèle sauvegardé dans '../models/house_price_model.pkl'.")


Modèle sauvegardé dans '../models/house_price_model.pkl'.
