In [2]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [14]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [15]:
# Suppression des colonnes avec beaucoup de valeurs manquantes et autres colonnes non nécessaires
df = df.drop(['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature', 'Id', 'MSSubClass', 'MSZoning'], axis=1)

In [16]:
def preprocess(df):
    # Mapping pour remplacer les valeurs textuelles par des nombres
    mappings = {
        "BsmtQual": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
        "BsmtCond": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
        "BsmtExposure": {"No": 1, "Mn": 2, "Av": 3, "Gd": 4},
        "BsmtFinType1": {"Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6},
        "BsmtFinType2": {"Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6},
        "FireplaceQu": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
        "GarageQual": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
        "GarageCond": {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    }
    for col, mapping in mappings.items():
        if col in df.columns:
            df[col] = df[col].fillna(0).replace(mapping)
            df[col] = df[col].astype(int)  # Assurer que le type est bien int
            
    # Remplir les valeurs manquantes pour les autres colonnes
    fill_values = {
        "LotFrontage": df["LotFrontage"].mean() if "LotFrontage" in df.columns else 0,
        "MasVnrArea": 0,
        "Electrical": df["Electrical"].mode()[0] if "Electrical" in df.columns else "Unknown",
        "GarageType": "No Garage",
        "GarageYrBlt": 0,
        "GarageFinish": "No Garage",
        "Utilities": "AllPub",
        "Exterior1st": "VinylSd",
        "Exterior2nd": "VinylSd",
        "BsmtFinSF2": 0,
        "BsmtUnfSF": 0,
        "TotalBsmtSF": 0,
        "BsmtFinSF1": 0,
        "BsmtFullBath": 0,
        "BsmtHalfBath": 0,
        "KitchenQual": "TA",
        "Functional": "Typ",
        "GarageCars": 2,
        "GarageArea": 0,
        "SaleType": "WD"
    }
    for col, val in fill_values.items():
        if col in df.columns:
            df[col] = df[col].fillna(val)
    return df

In [17]:
df = preprocess(df)

  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)


In [18]:
def get_categorical_columns(df, include_bool=True):
    # Initialize categories list
    categorical_columns = []
    
    for column in df.columns:
        # Skip ID columns
        if column.lower() in ['id', 'salesid', 'saleid', 'sale_id']:
            continue
            
        # Get column data type and number of unique values
        dtype = df[column].dtype
        nunique = df[column].nunique()
        
        # Check if column is categorical based on various criteria
        is_categorical = (
            # Explicit object or category dtype
            dtype == 'object' or 
            dtype == 'category' or
            # Boolean columns if include_bool is True
            (include_bool and dtype == 'bool') or
            # Integer columns with low cardinality (likely encoded categories)
            (dtype in ['int64', 'int32'] and nunique < 20) or
            # Check if column name suggests categorical
            any(hint in column.lower() for hint in ['type', 'category', 'code', 'quality', 'condition', 'class'])
        )
        
        if is_categorical:
            categorical_columns.append(column)
    
    return categorical_columns

In [19]:
def create_consistent_encoding_sklearn(train_df, test_df, categorical_columns):
    # Initialize encoder
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    
    # Fit on training data
    encoder.fit(train_df[categorical_columns])
    
    # Transform both datasets
    train_encoded = encoder.transform(train_df[categorical_columns])
    test_encoded = encoder.transform(test_df[categorical_columns])
    
    # Convert to DataFrames with proper column names
    feature_names = encoder.get_feature_names_out(categorical_columns)
    train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names)
    test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names)
    
    # Combine with non-categorical columns
    train_final = pd.concat([
        train_df.select_dtypes(exclude=['object']),
        train_encoded_df
    ], axis=1)
    
    test_final = pd.concat([
        test_df.select_dtypes(exclude=['object']),
        test_encoded_df
    ], axis=1)
    
    return train_final, test_final, encoder

In [20]:
# Préparation des données
X = df.drop("SalePrice", axis=1)
Y = df["SalePrice"]

In [21]:
# Identification des colonnes catégorielles
categorical_columns = get_categorical_columns(X)

In [22]:
# Chargement des données de test
subm_data = pd.read_csv("test.csv")
ids = subm_data["Id"]
subm_data = preprocess(subm_data)

  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)
  df[col] = df[col].fillna(0).replace(mapping)


In [23]:
# Encodage one-hot des variables catégorielles
X, subm_data, encoder = create_consistent_encoding_sklearn(X, subm_data, categorical_columns)

In [24]:
# Sauvegarde de l'encodeur pour une utilisation ultérieure
import joblib
joblib.dump(encoder, "one_hot_encoder.joblib")

['one_hot_encoder.joblib']

In [25]:
# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)

In [26]:
# Normalisation des données pour le deep learning
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
subm_data_scaled = scaler.transform(subm_data)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Id
- MSSubClass


In [27]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import pickle

In [28]:
# Construction du modèle Deep Learning
def build_deep_model(input_dim):
    model = Sequential()
    
    # Première couche cachée avec plus de neurones
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    
    # Couches intermédiaires
    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    
    model.add(Dense(32, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))
    
    # Couche de sortie (pas d'activation pour la régression)
    model.add(Dense(1))
    
    # Compilation du modèle
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    return model

In [29]:
# Création du modèle
input_dim = X_train_scaled.shape[1]
model = build_deep_model(input_dim)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [30]:
# Définition des callbacks pour améliorer l'entraînement
callbacks = [
    EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001),
    ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss')
]

In [32]:
# Entraînement du modèle
history = model.fit(
    X_train, y_train,
    epochs=200,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/200
[1m24/30[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 5ms/step - loss: 38232154112.0000



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 37895356416.0000 - val_loss: 41456693248.0000 - learning_rate: 0.0010
Epoch 2/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 37263888384.0000 - val_loss: 41502969856.0000 - learning_rate: 0.0010
Epoch 3/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 36462493696.0000 - val_loss: 41611149312.0000 - learning_rate: 0.0010
Epoch 4/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 36080087040.0000 - val_loss: 41686888448.0000 - learning_rate: 0.0010
Epoch 5/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 38257405952.0000 - val_loss: 41733402624.0000 - learning_rate: 0.0010
Epoch 6/200
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 36252778496.0000 - val_loss: 41781567488.0000 - learning_rate: 0.0010
Epoch 7/200
[1m30/30[0m [3

In [33]:
# Évaluation du modèle sur l'ensemble d'entraînement
y_pred_train = model.predict(X_train).flatten()
y_pred_train = np.expm1(y_pred_train)  
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)

print("\nPerformance du modèle sur l'ensemble d'entraînement:")
print(f"Mean Squared Error: {mse_train:.2f}")
print(f"R² Score: {r2_train:.3f}")

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


  y_pred_train = np.expm1(y_pred_train)


ValueError: Input contains infinity or a value too large for dtype('float32').