In [20]:
#  import main packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# define variables
# Directory (relative to the notebook) that the load/save helpers read from and write to.
immo_vis_dir = "../../data/immo_vis/"
parquet_extension = ".parquet"
metropole_appartement_file = "ventes-metropole-appartement" + parquet_extension
metropole_maison_file = "ventes-metropole-maison" + parquet_extension
# NOTE: built from the full file name above, so the extension appears twice
# ("ventes-metropole-appartement.parquet_step1_clean_.parquet"). This is the
# name of the file actually loaded further down, so it is kept as-is.
metropole_appartement_file_cleaned = metropole_appartement_file + "_step1_clean_" + parquet_extension

# Parquet file with the IRIS-level principal components ("ACP" = PCA); loaded into df_iris below.
acp_appartement_file = "df_ACP_IRIS_immo_Processed"  + parquet_extension

#  Set plot options
# (these four flags are not referenced by any cell visible in this notebook excerpt)
LargePlotActive = False
MediumPlotActive = False
SmallPlotActive = False
HeatMapDisplay = False

# Switches for the expensive modelling cells; a cell's training only runs when its flag is True.
lgbRegressionON = False          # linear models + LinearSVR + LightGBM benchmark
XGBRegressorON = False           # XGBRegressor + DecisionTreeRegressor benchmark
GridSearchXGBRegressor =False    # grid search over XGBRegressor hyper-parameters
GridSearchDecisionTree = True    # grid search over DecisionTreeRegressor hyper-parameters

# # functions

In [21]:
# functions  
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

import time

def print_numerical_isna(df) :
    """Print, for every numeric column of ``df``, how many values are missing."""
    numeric_names = list(df.select_dtypes(include='number').columns)
    missing_counts = df.loc[:, numeric_names].isna().sum()
    print (missing_counts)

def print_categorial_isna(df) :
    """Print, for every ``category``-dtype column of ``df``, how many values are missing."""
    category_names = list(df.select_dtypes(include='category').columns)
    missing_counts = df.loc[:, category_names].isna().sum()
    print (missing_counts)

def load_appartement_file (filename) :
    """Read the parquet file ``filename`` located in ``immo_vis_dir`` and return it as a DataFrame."""
    parquet_path = Path(immo_vis_dir).joinpath(filename)
    return pd.read_parquet(parquet_path.as_posix())

def save_appartement_file (df, filename) :
    """Write ``df`` (index included) as a parquet file named ``filename`` inside ``immo_vis_dir``."""
    target_path = Path(immo_vis_dir).joinpath(filename).as_posix()
    df.to_parquet(path=target_path, index=True)

def get_numerical_column_names (df) :
    """Return the names of the columns whose dtype is neither ``object`` nor ``category``.

    Column order follows ``df.columns``. Any other dtype (for example a
    datetime column) passes the filter, because the test only excludes the
    two dtypes listed.
    """
    excluded_dtypes = ["object","category"]
    selected = []
    for column in df.columns:
        if df[column].dtype not in excluded_dtypes:
            selected.append(column)
    return selected

def apply_scale_processing(operator,X_train,X_test):
    """Fit ``operator`` on the training features and transform both splits.

    Parameters
    ----------
    operator : object exposing ``fit(X)`` and ``transform(X)``
        Typically a scaler. It is fitted on ``X_train`` only, so nothing is
        learned from the test split. It must return as many columns as
        ``X_train`` has, since the original column names are reused.
    X_train, X_test : pandas.DataFrame
        Feature frames to transform.

    Returns
    -------
    (X_train_treated, X_test_treated) : tuple of pandas.DataFrame
        Transformed frames carrying the column names of ``X_train`` and the
        row index of their respective input frame.
    """
    operator.fit(X_train)
    # Re-attach the original row index: a bare pd.DataFrame(array) would get a
    # fresh RangeIndex and no longer align with y_train / y_test, which keep the
    # shuffled index produced by train_test_split.
    X_train_treated = pd.DataFrame(
        operator.transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_treated = pd.DataFrame(
        operator.transform(X_test), columns=X_train.columns, index=X_test.index
    )
    return X_train_treated,X_test_treated

def apply_preprocessing  (df) :
    """Split ``df`` into a feature frame and the ``prix_bien`` target.

    The feature frame is ``df`` without the target, the price-per-m2 column,
    the two map coordinates, ``date`` and ``expo_double``. ``df`` itself is
    left untouched.

    Returns
    -------
    (X, y) : (pandas.DataFrame, pandas.Series)
    """
    excluded_columns = [
        'prix_m2_vente',
        'prix_bien',
        'mapCoordonneesLatitude',
        'mapCoordonneesLongitude',
        'date',
        "expo_double",
    ]
    target = df['prix_bien']
    features = df.drop(columns=excluded_columns)
    return features,target

def create_train_test_data (X ,y) :
    """Print the shape of ``X`` and return an 80/20 train/test split of ``X`` and ``y``.

    The split uses ``random_state=42``; the result is what ``train_test_split``
    returns, i.e. ``X_train, X_test, y_train, y_test``.
    """
    from sklearn.model_selection import train_test_split

    print (X.shape)
    split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
    return split_parts

def create_train_test_data_subset (X ,y,subset_size = 1.0) :
    """Split ``X`` / ``y`` with a fixed 20% test set and a reduced training set.

    Parameters
    ----------
    X, y : array-like
        Features and target, forwarded to ``train_test_split``.
    subset_size : float or int, default 1.0
        Forwarded as ``train_size``: a float is a fraction of the whole data
        set, an int an absolute number of rows. A float of ``0.8`` or more
        (including the default ``1.0``) means "all rows left after removing
        the test set".

    Returns
    -------
    list
        ``X_train, X_test, y_train, y_test`` as returned by ``train_test_split``.
    """
    from sklearn.model_selection import train_test_split

    test_size = 0.2
    print (X.shape)

    # train_test_split rejects a float train_size >= 1.0 and any
    # train_size + test_size above 1.0, so the original default (1.0) always
    # raised. Passing None selects the complement of the test set instead.
    train_size = subset_size
    if isinstance(subset_size, float) and subset_size >= 1.0 - test_size:
        train_size = None

    return train_test_split(X,y,test_size=test_size,train_size=train_size, random_state=42)

# # decode file and display infos

In [22]:
# Load the cleaned apartment listings and keep the frame's dimensions for reporting.

df_processed = load_appartement_file(metropole_appartement_file_cleaned)
nb_rows, nb_cols = df_processed.shape

In [23]:
#  print main infos on the apartment file

# Read the dimensions from the frame itself: nb_rows / nb_cols are reassigned
# for df_iris further down, so they may be stale if cells run out of order.
print ( f" {metropole_appartement_file_cleaned} rows {df_processed.shape[0]} columns {df_processed.shape[1]}")
# DataFrame.info() writes its report to stdout and returns None; wrapping it
# in print() only appended a stray "None" line to the output.
df_processed.info()



 ventes-metropole-appartement.parquet_step1_clean_.parquet rows 2077476 columns 48
<class 'pandas.core.frame.DataFrame'>
Index: 2077476 entries, immo-facile-57743459 to 134327817
Data columns (total 48 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   etage                    int64         
 1   surface                  int64         
 2   nb_pieces                int64         
 3   prix_bien                int64         
 4   balcon                   int64         
 5   eau                      int64         
 6   bain                     int64         
 7   mapCoordonneesLatitude   float64       
 8   mapCoordonneesLongitude  float64       
 9   annonce_exclusive        int64         
 10  nb_etages                float64       
 11  places_parking           float64       
 12  cave                     int64         
 13  annee_construction       float64       
 14  nb_toilettes             int64         
 15  nb_terraces       

# Decode ACP file 

In [24]:
# Load the IRIS-level PCA ("ACP") components.
df_iris = load_appartement_file(acp_appartement_file)
nb_rows= df_iris.shape[0]
nb_cols= df_iris.shape[1]

print ( f" {acp_appartement_file} rows {nb_rows} columns {nb_cols}")


#  chop data

# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which printed a stray "None").
df_iris.info()

 df_ACP_IRIS_immo_Processed.parquet rows 46806 columns 21
<class 'pandas.core.frame.DataFrame'>
Index: 46806 entries, 0 to 46805
Data columns (total 21 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CODE_IRIS  46806 non-null  int64  
 1   PC1        46806 non-null  float64
 2   PC2        46806 non-null  float64
 3   PC3        46806 non-null  float64
 4   PC4        46806 non-null  float64
 5   PC5        46806 non-null  float64
 6   PC6        46806 non-null  float64
 7   PC7        46806 non-null  float64
 8   PC8        46806 non-null  float64
 9   PC9        46806 non-null  float64
 10  PC10       46806 non-null  float64
 11  PC11       46806 non-null  float64
 12  PC12       46806 non-null  float64
 13  PC13       46806 non-null  float64
 14  PC14       46806 non-null  float64
 15  PC15       46806 non-null  float64
 16  PC16       46806 non-null  float64
 17  PC17       46806 non-null  float64
 18  PC18       46806 non-null  float6

# select the number of ACP to keep

In [25]:
# Number of principal components to keep.
keep_acp = 15
# Positional selection: column 0 is CODE_IRIS (the merge key), columns
# 1..keep_acp are PC1..PC<keep_acp>, as shown by df_iris.info() above.
acp_filter = list(range(0, keep_acp + 1))
df_iris = df_iris.iloc[:,acp_filter]
# info() prints its report itself and returns None; print(df_iris.info())
# produced the stray "None" line seen in the previous output.
df_iris.info()
display(df_iris)

<class 'pandas.core.frame.DataFrame'>
Index: 46806 entries, 0 to 46805
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CODE_IRIS  46806 non-null  int64  
 1   PC1        46806 non-null  float64
 2   PC2        46806 non-null  float64
 3   PC3        46806 non-null  float64
 4   PC4        46806 non-null  float64
 5   PC5        46806 non-null  float64
 6   PC6        46806 non-null  float64
 7   PC7        46806 non-null  float64
 8   PC8        46806 non-null  float64
 9   PC9        46806 non-null  float64
 10  PC10       46806 non-null  float64
 11  PC11       46806 non-null  float64
 12  PC12       46806 non-null  float64
 13  PC13       46806 non-null  float64
 14  PC14       46806 non-null  float64
 15  PC15       46806 non-null  float64
dtypes: float64(15), int64(1)
memory usage: 6.1 MB
None


Unnamed: 0,CODE_IRIS,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,10010000,-2.36,2.29,-0.27,0.52,-1.03,-0.27,0.03,0.14,-0.08,0.27,-0.61,-0.43,0.23,-0.36,1.04
1,10020000,-3.90,1.27,2.35,-0.27,0.56,0.14,0.57,1.23,1.24,-0.55,1.08,-0.61,0.47,-1.22,-0.56
2,10040101,6.94,0.55,-0.85,-0.53,1.25,0.61,-2.41,-1.14,-0.77,1.41,-0.92,-1.17,-0.72,0.75,0.50
3,10040102,6.95,-1.30,-3.22,-0.48,0.95,0.45,-0.73,-1.56,1.22,0.15,-1.03,-1.21,-0.06,-0.33,0.84
4,10040201,6.34,-0.10,-2.81,0.64,2.00,0.87,-1.52,-0.80,1.09,-0.22,-1.27,-1.67,-0.71,-0.31,0.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46801,956800112,8.56,-3.51,-4.11,-3.73,-1.08,-2.18,3.42,0.77,0.38,-0.13,0.90,1.08,1.23,-0.16,0.50
46802,956800113,8.99,-0.71,-4.90,-5.23,2.63,-1.20,2.73,-0.97,1.11,-1.16,1.65,0.31,1.21,-0.76,-0.02
46803,956800114,9.57,-4.39,-4.55,-4.01,-1.36,-2.74,4.11,0.91,1.06,-0.73,0.28,0.88,0.53,-0.20,0.31
46804,956820000,-0.89,2.98,0.71,-2.13,-0.34,-0.26,-0.35,2.11,-0.38,-0.62,-1.12,-0.58,0.05,0.31,-0.78


# Merge Data and ACP

In [26]:
# Attach the PCA components to every listing through its CODE_IRIS.
# NOTE: merging on a column resets the row index (the listing identifiers that
# index df_processed are replaced by a RangeIndex in df).
df=df_processed.merge(df_iris,how='left',on='CODE_IRIS')
# A left merge silently duplicates listings if a CODE_IRIS appears more than
# once in df_iris; fail loudly instead of training on duplicated rows.
assert len(df) == len(df_processed), "merge on CODE_IRIS changed the number of listings"
# show_counts=True reveals listings without a matching IRIS (null PC columns).
df.info(show_counts=True)
print(df.shape)
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077476 entries, 0 to 2077475
Data columns (total 63 columns):
 #   Column                   Non-Null Count    Dtype         
---  ------                   --------------    -----         
 0   etage                    2077476 non-null  int64         
 1   surface                  2077476 non-null  int64         
 2   nb_pieces                2077476 non-null  int64         
 3   prix_bien                2077476 non-null  int64         
 4   balcon                   2077476 non-null  int64         
 5   eau                      2077476 non-null  int64         
 6   bain                     2077476 non-null  int64         
 7   mapCoordonneesLatitude   2077476 non-null  float64       
 8   mapCoordonneesLongitude  2077476 non-null  float64       
 9   annonce_exclusive        2077476 non-null  int64         
 10  nb_etages                2077476 non-null  float64       
 11  places_parking           2077476 non-null  float64       
 12  

Unnamed: 0,etage,surface,nb_pieces,prix_bien,balcon,eau,bain,mapCoordonneesLatitude,mapCoordonneesLongitude,annonce_exclusive,...,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,0,65,2,136000,0,0,0,46.25701,5.22967,1,...,1.37,-0.04,0.03,1.43,0.64,-0.42,0.58,0.38,-0.29,0.02
1,1,72,3,369000,0,0,1,46.18916,5.91436,0,...,0.20,1.27,-0.32,2.05,-0.12,-0.64,-1.53,-0.38,-0.78,0.66
2,2,128,5,440000,1,1,1,46.19379,5.92232,0,...,0.20,1.27,-0.32,2.05,-0.12,-0.64,-1.53,-0.38,-0.78,0.66
3,0,76,4,170000,0,1,0,46.21553,5.20806,0,...,0.84,-0.38,-1.52,1.37,0.22,-0.13,-0.27,-0.31,-0.13,0.10
4,0,84,3,449000,0,0,0,46.26402,6.11517,1,...,-1.68,2.92,-1.12,0.04,0.92,0.24,-2.29,-0.61,0.18,0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077471,1,59,2,202419,0,1,0,43.95000,4.81667,2,...,1.62,-3.34,0.06,-0.37,0.23,0.33,-0.03,0.14,0.56,0.58
2077472,2,36,2,110000,0,1,0,43.95126,4.81444,2,...,1.62,-3.34,0.06,-0.37,0.23,0.33,-0.03,0.14,0.56,0.58
2077473,1,97,4,169500,0,0,1,43.93732,4.80531,2,...,1.54,-0.92,1.60,1.23,-0.49,-0.00,-0.03,-0.70,-0.09,0.25
2077474,1,132,5,315000,0,0,0,43.93946,4.80905,2,...,1.54,-0.92,1.60,1.23,-0.49,-0.00,-0.03,-0.70,-0.09,0.25


# 1st Model Encoding

In [27]:
# check dtypes and the distinct values of three flag columns
display(df.dtypes)
print("porte_digicode",df.porte_digicode.unique())
print("ascenceur",df.ascenseur.unique())
# Fixed copy/paste slip: this line used to print df.ascenseur under the "cave" label.
print("cave",df.cave.unique())

etage          int64
surface        int64
nb_pieces      int64
prix_bien      int64
balcon         int64
              ...   
PC11         float64
PC12         float64
PC13         float64
PC14         float64
PC15         float64
Length: 63, dtype: object

porte_digicode [0 1]
ascenceur [0 1]
cave [0 1]


# # Preprocessing

In [28]:
# Set target and data: list the columns of the merged frame, then split it into
# the feature frame X and the target y (prix_bien) with apply_preprocessing.
print (df.columns)
X,y = apply_preprocessing(df)

Index(['etage', 'surface', 'nb_pieces', 'prix_bien', 'balcon', 'eau', 'bain',
       'mapCoordonneesLatitude', 'mapCoordonneesLongitude',
       'annonce_exclusive', 'nb_etages', 'places_parking', 'cave',
       'annee_construction', 'nb_toilettes', 'nb_terraces', 'videophone',
       'porte_digicode', 'ascenseur', 'nb_logements_copro', 'charges_copro',
       'logement_neuf', 'date', 'CODE_IRIS', 'REG', 'DEP', 'prix_m2_vente',
       'dpeL_num', 'ges_class_num', 'chauf_energy_gaz', 'chauf_energy_elec',
       'chauf_energy_bois', 'chauf_energy_fioul', 'chauf_sys_climatisation',
       'chauf_sys_pompe_chaleur', 'chauf_sys_convecteur',
       'chauf_sys_radiateur', 'chauf_sys_chaudiere', 'chauf_sys_sol',
       'chauf_sys_poele_bois', 'chauf_mode_individuel', 'chauf_mode_collectif',
       'chauf_mode_central', 'expo_has_nord', 'expo_has_sud', 'expo_has_est',
       'expo_has_ouest', 'expo_double', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5',
       'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'P

#  # create train test dataset

In [29]:
#  train test split
# (create_train_test_data performs its own import of train_test_split)
from sklearn.model_selection import train_test_split
#  build the train and test data: 80/20 split with random_state=42

X_train, X_test, y_train, y_test = create_train_test_data(X,y)

(2077476, 57)


In [30]:
# apply min max scaler
from sklearn.preprocessing import MinMaxScaler

# X_train_scaled,X_test_scaled=apply_scale_processing(MinMaxScaler(),X_train,X_test)

# # train and predict models

In [31]:
def fit_train_models (models,X_train, y_train,X_test,y_test) :
    """Fit every model, predict on the test split and tabulate scores and timings.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        May be empty.
    X_train, y_train : training features and target passed to ``fit``.
    X_test, y_test : test features passed to ``predict`` and the reference
        target used for scoring.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by ascending RMSE, with the columns
        ``Model``, ``RMSE``, ``R²``, ``Fit_Time``, ``Predict_Time`` and
        ``Total_Time`` (the three timings are strings such as ``"1.2345s"``).
        Empty, but with the same columns, when ``models`` is empty.
    """
    result_columns = ['Model', 'RMSE', 'R²', 'Fit_Time', 'Predict_Time', 'Total_Time']
    results = []
    for name, model in models.items():
        print(f"Entraînement de {name}...")

        # Time the training step; perf_counter is monotonic, unlike time.time(),
        # so the measurement is not affected by system clock adjustments.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        fit_time = time.perf_counter() - start_time

        # Time the prediction step
        start_time = time.perf_counter()
        y_pred = model.predict(X_test)
        predict_time = time.perf_counter() - start_time

        # Compute the metrics on the test split
        rmse = root_mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results.append({
            'Model': name,
            'RMSE': rmse,
            'R²': r2,
            'Fit_Time': f"{fit_time:.4f}s",
            'Predict_Time': f"{predict_time:.4f}s",
            'Total_Time': f"{fit_time + predict_time:.4f}s"
        })

    # Naming the columns explicitly keeps the 'RMSE' column present even when
    # no model was given, so the sort below cannot fail with a KeyError.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df = results_df.sort_values('RMSE')

    return results_df


In [32]:

import lightgbm as lgb
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression, Ridge
from sklearn.svm import SVR, LinearSVR

# Liste des modèles à tester
# Models to benchmark: linear baselines, a linear SVR and LightGBM.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
    'ElasticNetCV': ElasticNetCV(),
    'LinearSVR': LinearSVR(),
    # 'SVR': SVR(),
    # 'Random Forest': RandomForestRegressor(n_estimators=20, random_state=42),
    'lgb' : lgb.LGBMRegressor(
                            objective='regression',
                            metric='rmse',
                            num_leaves=31,
                            learning_rate=0.05,
                            force_row_wise=True,
                            # feature_fraction=0.9,
                        )
}

# Fit and time every model on min-max scaled features
if lgbRegressionON :
    # The scaled frames are built here, inside the guard: the scaling call in
    # the earlier "apply min max scaler" cell is commented out, so
    # X_train_scaled / X_test_scaled were undefined and enabling this flag
    # raised a NameError. MinMaxScaler is imported in that earlier cell.
    X_train_scaled, X_test_scaled = apply_scale_processing(MinMaxScaler(), X_train, X_test)
    results = fit_train_models(models,X_train_scaled,y_train,X_test_scaled,y_test)

    # Show the results
    print(results)


In [33]:



# Liste des modèles à tester
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


# Tree-based models to benchmark on the unscaled features.
# NOTE: this rebinds the global `models` defined by the previous benchmark cell.
models = {
    # 'KNeighborsRegressor': KNeighborsRegressor(n_neighbors=5),
    'XGBRegressor' : XGBRegressor(n_estimators=100, random_state=42),
    'DecisionTreeRegressor': DecisionTreeRegressor(max_depth=10, random_state=42),
    # 'GradientBoostingRegressor': GradientBoostingRegressor(n_estimators=20, random_state=42),
    # 'MLPRegressor': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
}

# Fit and time every model (only when the XGBRegressorON flag is set)

if XGBRegressorON :
    results = fit_train_models(models,X_train,y_train,X_test,y_test)

    # Show the results
    print(results)


#  # improve hyper parameters 

In [34]:
from sklearn.preprocessing import MinMaxScaler

# X1,y1 = apply_preprocessing(df)
# Re-split for the hyper-parameter searches: train on 25% of the rows, test on 20%.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the 80/20 split above,
# so every cell executed after this one sees the reduced training set.
X_train, X_test, y_train,y_test = create_train_test_data_subset(X,y,0.25)


(2077476, 57)


In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  mean_absolute_error, mean_squared_error

def optimize_hyperparameters_grid_search (estimator_name, estimator, param_grid,X_train, y_train,X_test,y_test) :
    """Tune ``estimator`` with a 5-fold grid search and report CV and test metrics.

    Parameters
    ----------
    estimator_name : str
        Label used in the printed messages and in the plot title.
    estimator : scikit-learn compatible regressor
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyper-parameter grid forwarded to ``GridSearchCV``.
    X_train, y_train : data used for the cross-validated search.
    X_test, y_test : hold-out data used to score the best estimator.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_estimator_``, ``best_params_``,
        ``cv_results_`` ...). Callers that ignored the former ``None`` return
        value are unaffected.

    Side effects: prints progress, the best parameters, CV and test metrics and
    the ten best parameter combinations; shows a feature-importance bar chart
    when the best estimator exposes ``feature_importances_``.
    """
    # Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=5,  # 5-fold cross-validation
        scoring='neg_mean_squared_error',  # or 'r2', 'neg_mean_absolute_error'
        verbose=2,
        n_jobs=-1,
        return_train_score=True
    )

    # Fit the grid search
    print(f"Starting GridSearchCV {estimator_name} fit ...")
    grid_search.fit(X_train, y_train)
    print(f"Ending GridSearchCV {estimator_name} fit ...")

    # Best parameters and score
    print(f"Best estimator : {estimator_name} {grid_search.best_estimator_}")
    print(f"Best parameters : {estimator_name} {grid_search.best_params_}")
    # Fixed: this line used to print best_params_ a second time instead of the score.
    print(f"Best score : {estimator_name} {grid_search.best_score_}")

    print("\n" + "="*60)
    print("BEST PARAMETERS:")
    print("="*60)
    for param, value in grid_search.best_params_.items():
        print(f"{param:20s}: {value}")

    # The scorer is the negated MSE, hence the sign flip before the square root.
    print(f"\nBest CV Score (neg MSE): {grid_search.best_score_:.4f}")
    print(f"Best CV RMSE: {np.sqrt(-grid_search.best_score_):.4f}")

    # With GridSearchCV's default refit=True, best_estimator_ has already been
    # refitted on the whole of X_train; only the prediction is left to do.
    print(f"Starting GridSearchCV {estimator_name} predict ...")
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Calculate metrics on the hold-out set
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\n" + "="*60)
    print("TEST SET PERFORMANCE:")
    print("="*60)
    print(f"MSE:  {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE:  {mae:.4f}")
    print(f"R²:   {r2:.4f}")

    # Display top 10 parameter combinations (lowest mean CV RMSE first)
    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df['mean_rmse'] = np.sqrt(-results_df['mean_test_score'])
    top_10 = results_df.nsmallest(10, 'mean_rmse')[['params', 'mean_rmse', 'std_test_score']]
    print("\n" + "="*60)
    print("TOP 10 PARAMETER COMBINATIONS:")
    print("="*60)
    print(top_10.to_string(index=False))

    # Feature importance plot. Not every estimator exposes
    # feature_importances_, so the plot is skipped rather than crashing after
    # an expensive search.
    feature_importance = getattr(best_model, 'feature_importances_', None)
    if feature_importance is not None:
        positions = range(len(feature_importance))
        # Label bars with column names when X_train is a DataFrame, else with indices.
        if hasattr(X_train, 'columns'):
            labels = list(X_train.columns)
        else:
            labels = list(positions)
        plt.figure(figsize=(10, max(6, 0.25 * len(feature_importance))))
        plt.barh(positions, feature_importance)
        plt.yticks(positions, labels)
        plt.xlabel('Feature Importance')
        plt.ylabel('Feature')
        # Fixed: the title used to say "XGBRegressor" whatever estimator was tuned.
        plt.title(f'Feature Importance from Best {estimator_name} Model')
        plt.tight_layout()

        plt.show()

    return grid_search

#  # hyper parameters XgbRegressor

In [19]:

from xgboost import XGBRegressor


# Define parameter grid
# Candidate full grids (kept for reference; the search below does not use them).
param_grid = {
    'n_estimators': [100,200,300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}
param_grid1 = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'gamma': [0, 0.1, 0.3],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'reg_lambda': [0.1, 1.0, 10.0],
    'reg_alpha': [0, 0.1, 1.0]
}

param_grid2 = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# One-parameter-at-a-time grids: each search varies a single hyper-parameter
# while every other one stays at the XGBRegressor default.
param_grids = [{'max_depth': [3, 5, 7]},
               {'learning_rate': [0.01, 0.05, 0.1],},
               {'n_estimators': [100, 200, 300]},
               {'subsample': [0.6, 0.8, 1.0]},
               {'colsample_bytree': [0.6, 0.8, 1.0]},
               {'gamma': [0, 0.1, 0.3]}]

# Run the searches (only when the GridSearchXGBRegressor flag is set)

if GridSearchXGBRegressor :

    # A dedicated loop name is used so the full `param_grid` defined above is
    # not overwritten by the last single-parameter grid.
    for single_param_grid in param_grids :
        # GridSearchCV already runs with n_jobs=-1 inside
        # optimize_hyperparameters_grid_search; keeping the estimator
        # single-threaded avoids the two levels competing for the same cores.
        xgb = XGBRegressor(random_state=42, n_jobs=1)
        optimize_hyperparameters_grid_search("XGBRegressor", xgb,single_param_grid,X_train, y_train,X_test,y_test)


#  # hyper parameters DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd


# Define hyperparameter grid
# Full hyper-parameter grid (kept for reference; the search below does not use it).
param_grid = {
    'max_depth': [3, 5, 7, 10, 15, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
    'splitter': ['best', 'random'],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error']
}
# One-parameter-at-a-time grids: each search varies a single hyper-parameter
# while every other one stays at the DecisionTreeRegressor default.
# NOTE(review): the 'absolute_error' criterion is typically much slower than
# 'squared_error' on large training sets - confirm the run time is acceptable.
param_grids = [{'max_depth': [3, 5, 7,10]},
               {'min_samples_split': [2, 5, 10, 20]},
               {'min_samples_leaf': [1, 2, 4, 8]},
               {'max_features': ['sqrt', 'log2', None]},
               {'splitter': ['best', 'random']},
               {'criterion': ['squared_error', 'friedman_mse', 'absolute_error']}]

# Run the searches (only when the GridSearchDecisionTree flag is set)
if GridSearchDecisionTree :
    # A dedicated loop name is used so the full `param_grid` defined above is
    # not overwritten by the last single-parameter grid.
    for single_param_grid in param_grids :
        dt = DecisionTreeRegressor(random_state=42)
        optimize_hyperparameters_grid_search("DecisionTreeRegressor", dt,single_param_grid,X_train, y_train,X_test,y_test)



SyntaxError: invalid syntax (3555737187.py, line 18)