In [1]:
#  import main packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Data locations: every parquet file lives under the immo_vis data directory
immo_vis_dir = "../../data/immo_vis/"
parquet_extension = ".parquet"
metropole_appartement_file = f"ventes-metropole-appartement{parquet_extension}"
metropole_maison_file = f"ventes-metropole-maison{parquet_extension}"
# The cleaned file name is built on top of the raw file name, so it carries
# the parquet extension twice
metropole_appartement_file_cleaned = f"{metropole_appartement_file}_step1_clean_{parquet_extension}"

# Plot switches, all turned off
LargePlotActive = MediumPlotActive = SmallPlotActive = HeatMapDisplay = False



# # functions

In [2]:
# functions  
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

import time

def print_numerical_isna(df) :
    """Print, for every numeric column of ``df``, how many values are missing."""
    numeric_names = df.select_dtypes(include='number').columns.to_list()
    missing_counts = df[numeric_names].isna().sum()
    print (missing_counts)

def print_categorial_isna(df) :
    """Print, for every ``category`` column of ``df``, how many values are missing."""
    category_names = df.select_dtypes(include='category').columns.to_list()
    missing_counts = df[category_names].isna().sum()
    print (missing_counts)

def load_appartement_file (filename) :
    """Read the parquet file ``filename`` stored under ``immo_vis_dir`` and return it as a DataFrame."""
    parquet_path = Path(immo_vis_dir).joinpath(filename)
    return pd.read_parquet(parquet_path.as_posix())

def save_appartement_file (df, filename) :
    """Write ``df``, index included, to the parquet file ``filename`` under ``immo_vis_dir``."""
    target = Path(immo_vis_dir).joinpath(filename).as_posix()
    df.to_parquet(path=target, index=True)

def get_numerical_column_names (df) :
    """Return the names of the numeric columns of ``df``, in column order.

    Columns are selected with ``select_dtypes(include='number')``, the same
    rule ``print_numerical_isna`` uses. Excluding only ``object`` and
    ``category`` dtypes would also report datetime columns (such as ``date``)
    as numerical, and would fail on frames with duplicated column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Labels of the columns whose dtype is numeric (integers and floats).
    """
    return df.select_dtypes(include='number').columns.tolist()

def apply_scale_processing(operator,X_train,X_test):
    """Fit ``operator`` on the training features and transform both feature sets.

    Parameters
    ----------
    operator : object
        Transformer exposing ``fit`` and ``transform`` (for example a scaler).
        It is fitted in place on ``X_train`` only.
    X_train, X_test : pandas.DataFrame
        Training and test features sharing the same columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_treated, X_test_treated)`` carrying the columns of
        ``X_train`` and the row index of their respective input.
    """
    operator.fit(X_train)
    # Keep the original row labels: without them the transformed frames get a
    # fresh 0..n-1 index and stop aligning with y_train / y_test in pandas
    # operations (concat, join, boolean masks built from the targets).
    X_train_treated = pd.DataFrame(
        operator.transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_treated = pd.DataFrame(
        operator.transform(X_test), columns=X_train.columns, index=X_test.index
    )
    return X_train_treated, X_test_treated

def apply_preprocessing  (df) :
    """Split ``df`` into a feature frame and the ``prix_bien`` target series.

    The target itself, the price per square metre, both map coordinates and
    the date are removed from the features. ``df`` is left untouched.
    """
    excluded = ['prix_m2_vente','prix_bien','mapCoordonneesLatitude','mapCoordonneesLongitude','date']
    target = df['prix_bien']
    features = df.drop(columns=excluded)
    return features, target

def create_train_test_data (X ,y) :
    """Print the shape of ``X`` and return the train/test split of ``X`` and ``y``.

    20% of the rows go to the test set and the shuffle is seeded with 42; the
    value returned is whatever ``train_test_split`` returns.
    """
    from sklearn.model_selection import train_test_split

    print (X.shape)
    split_options = {"test_size": 0.2, "random_state": 42}
    return train_test_split(X, y, **split_options)

def create_train_test_data_subset (X ,y,subset_size = 1.0) :
    """Print the shape of ``X`` and split ``X``/``y`` with an optionally reduced training set.

    Parameters
    ----------
    X, y : array-like
        Features and target handed to ``train_test_split``.
    subset_size : int, float or None, default 1.0
        Forwarded as ``train_size``: an int is a number of rows, a float a
        fraction of the data. The default ``1.0`` (or ``None``) means "no
        sub-sampling": the training set is everything left after the 20% test
        split.

    Returns
    -------
    The value returned by ``train_test_split`` (test_size=0.2, random_state=42).
    """
    from sklearn.model_selection import train_test_split

    print (X.shape)
    # train_test_split only accepts a float train_size strictly between 0 and
    # 1, so forwarding the default 1.0 unchanged would raise; translate
    # "use everything" into train_size=None instead.
    keep_everything = subset_size is None or (
        isinstance(subset_size, float) and subset_size == 1.0
    )
    train_size = None if keep_everything else subset_size
    return train_test_split(X, y, test_size=0.2, train_size=train_size, random_state=42)

# # decode file and display infos

In [3]:
# Load the cleaned apartment file and keep its dimensions for reporting

df = load_appartement_file(metropole_appartement_file_cleaned)
nb_rows, nb_cols = df.shape

In [4]:
#  print main infos on apartment file

print ( f" {metropole_appartement_file_cleaned} rows {nb_rows} columns {nb_cols}")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line)
df.info()



 ventes-metropole-appartement.parquet_step1_clean_.parquet rows 2077476 columns 47
<class 'pandas.core.frame.DataFrame'>
Index: 2077476 entries, immo-facile-57743459 to 134327817
Data columns (total 47 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   etage                    int64         
 1   surface                  int64         
 2   nb_pieces                int64         
 3   prix_bien                int64         
 4   balcon                   int64         
 5   eau                      int64         
 6   bain                     int64         
 7   mapCoordonneesLatitude   float64       
 8   mapCoordonneesLongitude  float64       
 9   annonce_exclusive        int64         
 10  nb_etages                float64       
 11  places_parking           float64       
 12  cave                     int64         
 13  annee_construction       float64       
 14  nb_toilettes             int64         
 15  nb_terraces       

# 1st Model Encoding

In [5]:
# check dtypes and confirm the binary flags only hold 0/1
display(df.dtypes)
print("porte_digicode",df.porte_digicode.unique())
print("ascenceur",df.ascenseur.unique())
# The "cave" label must report the cave column, not ascenseur again
print("cave",df.cave.unique())

etage                               int64
surface                             int64
nb_pieces                           int64
prix_bien                           int64
balcon                              int64
eau                                 int64
bain                                int64
mapCoordonneesLatitude            float64
mapCoordonneesLongitude           float64
annonce_exclusive                   int64
nb_etages                         float64
places_parking                    float64
cave                                int64
annee_construction                float64
nb_toilettes                        int64
nb_terraces                       float64
videophone                          int64
porte_digicode                      int64
ascenseur                           int64
nb_logements_copro                float64
charges_copro                     float64
logement_neuf                       int64
date                       datetime64[ns]
CODE_IRIS                         

porte_digicode [0 1]
ascenceur [0 1]
cave [0 1]


# # Preprocessing

In [6]:
# Show the available columns, then separate the features from the target
print(df.columns)
X, y = apply_preprocessing(df)

Index(['etage', 'surface', 'nb_pieces', 'prix_bien', 'balcon', 'eau', 'bain',
       'mapCoordonneesLatitude', 'mapCoordonneesLongitude',
       'annonce_exclusive', 'nb_etages', 'places_parking', 'cave',
       'annee_construction', 'nb_toilettes', 'nb_terraces', 'videophone',
       'porte_digicode', 'ascenseur', 'nb_logements_copro', 'charges_copro',
       'logement_neuf', 'date', 'CODE_IRIS', 'REG', 'DEP', 'prix_m2_vente',
       'dpeL_num', 'ges_class_num', 'chauf_energy_gaz', 'chauf_energy_elec',
       'chauf_energy_bois', 'chauf_energy_fioul', 'chauf_sys_climatisation',
       'chauf_sys_pompe_chaleur', 'chauf_sys_convecteur',
       'chauf_sys_radiateur', 'chauf_sys_chaudiere', 'chauf_sys_sol',
       'chauf_sys_poele_bois', 'chauf_mode_individuel', 'chauf_mode_collectif',
       'chauf_mode_central', 'expo_has_nord', 'expo_has_sud', 'expo_has_est',
       'expo_has_ouest'],
      dtype='object')


#  # create train test dataset

In [7]:
# Split the features and the target into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = create_train_test_data(X, y)

(2077476, 42)


In [10]:
# Rescale the features with a min-max scaler (apply_scale_processing fits it
# on the training set, then transforms both sets)
from sklearn.preprocessing import MinMaxScaler

X_train_scaled, X_test_scaled = apply_scale_processing(
    MinMaxScaler(), X_train, X_test
)

# # train and predict models

In [8]:
def fit_train_models (models,X_train, y_train,X_test,y_test) :
    """Fit every model, predict on the test set and summarise quality and timing.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator exposing ``fit`` and
        ``predict``. Each estimator is fitted in place.
    X_train, y_train
        Training features and target passed to ``fit``.
    X_test, y_test
        Test features passed to ``predict`` and the target the predictions
        are scored against.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by ascending RMSE, with the columns
        ``Model``, ``RMSE``, ``R²``, ``Fit_Time``, ``Predict_Time`` and
        ``Total_Time``. Durations are strings formatted as ``"1.2345s"``.
        An empty ``models`` mapping yields an empty frame with these columns.
    """
    result_columns = ['Model', 'RMSE', 'R²', 'Fit_Time', 'Predict_Time', 'Total_Time']
    results = []
    for name, model in models.items():
        print(f"Entraînement de {name}...")

        # Time the training; perf_counter is the clock intended for measuring
        # short intervals and is not affected by system clock adjustments
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        fit_time = time.perf_counter() - start_time

        # Time the prediction
        start_time = time.perf_counter()
        y_pred = model.predict(X_test)
        predict_time = time.perf_counter() - start_time

        # Compute the metrics
        rmse = root_mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results.append({
            'Model': name,
            'RMSE': rmse,
            'R²': r2,
            'Fit_Time': f"{fit_time:.4f}s",
            'Predict_Time': f"{predict_time:.4f}s",
            'Total_Time': f"{fit_time + predict_time:.4f}s"
        })

    # Naming the columns explicitly keeps 'RMSE' present even when no model
    # was supplied, so the sort below cannot fail with a KeyError
    results_df = pd.DataFrame(results, columns=result_columns)
    return results_df.sort_values('RMSE')


In [11]:

import lightgbm as lgb
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression, Ridge
from sklearn.svm import SVR, LinearSVR

# Models to benchmark on the min-max scaled features
models = dict([
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge(alpha=1.0)),
    ('ElasticNet', ElasticNet(alpha=1.0, l1_ratio=0.5)),
    ('ElasticNetCV', ElasticNetCV()),
    ('LinearSVR', LinearSVR()),
    # ('SVR', SVR()),
    # ('Random Forest', RandomForestRegressor(n_estimators=20, random_state=42)),
    ('lgb', lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        num_leaves=31,
        learning_rate=0.05,
        force_row_wise=True,
        # feature_fraction=0.9,
    )),
])

# Fit and score every model, recording the elapsed time
results = fit_train_models(models, X_train_scaled, y_train, X_test_scaled, y_test)

# Show the comparison table
print(results)


Entraînement de LinearRegression...
Entraînement de Ridge...
Entraînement de ElasticNet...
Entraînement de ElasticNetCV...
Entraînement de LinearSVR...
Entraînement de lgb...
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 1661980, number of used features: 40
[LightGBM] [Info] Start training from score 305724.749736
              Model           RMSE        R²  Fit_Time Predict_Time Total_Time
5               lgb  127904.592827  0.790318   3.6342s      0.2355s    3.8697s
0  LinearRegression  205814.163764  0.457077   3.6157s      0.0534s    3.6690s
1             Ridge  206035.274962  0.455909   0.4553s      0.0226s    0.4778s
4         LinearSVR  262389.232828  0.117571   3.8611s      0.0350s    3.8961s
2        ElasticNet  262596.119190  0.116178   3.4031s      0.0159s    3.4189s
3      ElasticNetCV  278863.585536  0.003284  13.7551s      0.0500s   13.8051s


In [9]:



# Liste des modèles à tester
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


# Tree-based models, benchmarked on the unscaled features
models = dict([
    # ('KNeighborsRegressor', KNeighborsRegressor(n_neighbors=5)),
    ('XGBRegressor', XGBRegressor(n_estimators=100, random_state=42)),
    ('DecisionTreeRegressor', DecisionTreeRegressor(max_depth=10, random_state=42)),
    # ('GradientBoostingRegressor', GradientBoostingRegressor(n_estimators=20, random_state=42)),
    # ('MLPRegressor', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)),
])

# Fit and score every model, recording the elapsed time
results = fit_train_models(models, X_train, y_train, X_test, y_test)

# Show the comparison table
print(results)


Entraînement de XGBRegressor...
Entraînement de DecisionTreeRegressor...
                   Model           RMSE        R²  Fit_Time Predict_Time  \
0           XGBRegressor  112099.476562  0.838937  32.9283s      0.2059s   
1  DecisionTreeRegressor  131380.640179  0.778767   9.7468s      0.1335s   

  Total_Time  
0   33.1342s  
1    9.8802s  


#  # improve hyper parameters 

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Build a reduced training set (20000 passed as the subset size) for the
# hyper-parameter searches, then min-max scale it
X1, y1 = apply_preprocessing(df)
X_train1, X_test1, y_train1, y_test1 = create_train_test_data_subset(X1, y1, subset_size=20000)
X_train_scaled1, X_test_scaled1 = apply_scale_processing(
    MinMaxScaler(), X_train1, X_test1
)


(2077476, 42)


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  mean_absolute_error, mean_squared_error

def optimize_hyperparameters_grid_search (estimator_name, estimator, param_grid,X_train, y_train,X_test,y_test) :
    """Tune ``estimator`` with a 5-fold grid search and report the outcome.

    The search minimises the mean squared error. The function prints the best
    parameters and cross-validation score, the test-set metrics (MSE, RMSE,
    MAE, R²) of the best estimator, the ten best parameter combinations and,
    when the best estimator exposes ``feature_importances_``, draws a
    horizontal bar chart of them.

    Parameters
    ----------
    estimator_name : str
        Label used in the printed messages and in the plot title.
    estimator : estimator object
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Data used by the cross-validated search.
    X_test, y_test
        Hold-out data used to score the best estimator.

    Returns
    -------
    None
        Everything is reported through ``print`` and matplotlib.
    """
    # Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=5,  # 5-fold cross-validation
        scoring='neg_mean_squared_error',  # or 'r2', 'neg_mean_absolute_error'
        verbose=2,
        n_jobs=-1,
        return_train_score=True
    )

    # Fit the grid search
    print(f"Starting GridSearchCV {estimator_name} fit ...")
    grid_search.fit(X_train, y_train)
    print(f"Ending GridSearchCV {estimator_name} fit ...")

    # Best parameters and score
    print(f"Best estimator : {estimator_name} {grid_search.best_estimator_}")
    print(f"Best paramters : {estimator_name} {grid_search.best_params_}")
    # The score line reports best_score_ (not best_params_ a second time)
    print(f"Best score : {estimator_name} {grid_search.best_score_}")

    print("\n" + "="*60)
    print("BEST PARAMETERS:")
    print("="*60)
    for param, value in grid_search.best_params_.items():
        print(f"{param:20s}: {value}")

    # The scoring is the negated MSE, hence the sign flip before the root
    print(f"\nBest CV Score (neg MSE): {grid_search.best_score_:.4f}")
    print(f"Best CV RMSE: {np.sqrt(-grid_search.best_score_):.4f}")

    # Evaluate the best estimator on the hold-out set. With refit left at its
    # default, GridSearchCV has already refitted it on the whole X_train
    # (per the scikit-learn documentation), so no extra fit is done here.
    print(f"Starting GridSearchCV {estimator_name} predict ...")
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\n" + "="*60)
    print("TEST SET PERFORMANCE:")
    print("="*60)
    print(f"MSE:  {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE:  {mae:.4f}")
    print(f"R²:   {r2:.4f}")

    # Display top 10 parameter combinations
    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df['mean_rmse'] = np.sqrt(-results_df['mean_test_score'])
    top_10 = results_df.nsmallest(10, 'mean_rmse')[['params', 'mean_rmse', 'std_test_score']]
    print("\n" + "="*60)
    print("TOP 10 PARAMETER COMBINATIONS:")
    print("="*60)
    print(top_10.to_string(index=False))

    # Feature importance plot, only for estimators that expose importances
    feature_importance = getattr(best_model, "feature_importances_", None)
    if feature_importance is None:
        print(f"{estimator_name} exposes no feature_importances_; skipping the importance plot")
        return
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(feature_importance)), feature_importance)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Index')
    # The title names the estimator actually tuned instead of a fixed model
    plt.title(f'Feature Importance from Best {estimator_name} Model')
    plt.tight_layout()

    # Save the best model
    # import joblib
    # joblib.dump(best_model, 'best_xgb_model.pkl')
    # print("Best model saved as 'best_xgb_model.pkl'")

#  # hyper parameters XgbRegressor

In [None]:

from xgboost import XGBRegressor


# Define parameter grids
# NOTE: `param_grid` (2916 combinations) is not used by the search below,
# which runs on `param_grid1`.
param_grid = {
    'n_estimators': [100,200,300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}
# 8 parameters x 3 values = 6561 combinations, each of them cross-validated
# by optimize_hyperparameters_grid_search: expect a very long run.
param_grid1 = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'gamma': [0, 0.1, 0.3],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'reg_lambda': [0.1, 1.0, 10.0],
    'reg_alpha': [0, 0.1, 1.0]
}

# Initialize XGBRegressor.
# The grid search already runs its candidates in parallel (n_jobs=-1), so the
# estimator itself stays single-threaded: letting both layers use every core
# creates thread contention that slows the search down.
xgb = XGBRegressor(random_state=42, n_jobs=1)

optimize_hyperparameters_grid_search("XGBRegressor", xgb,param_grid1,X_train_scaled1, y_train1,X_test_scaled1,y_test1)


#  # hyper parameters DecisionTreeRegressor

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd


# Search space for the decision tree: 6 x 4 x 4 x 3 x 2 x 3 = 1728 candidates
param_grid = dict(
    max_depth=[3, 5, 7, 10, 15, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],
    splitter=['best', 'random'],
    criterion=['squared_error', 'friedman_mse', 'absolute_error'],
)

# Seeded estimator to tune
dt = DecisionTreeRegressor(random_state=42)

optimize_hyperparameters_grid_search(
    "DecisionTreeRegressor", dt, param_grid,
    X_train_scaled1, y_train1, X_test_scaled1, y_test1,
)



Starting GridSearchCV DecisionTreeRegressor fit ...
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


: 