In [1]:
#  import main packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Data location and file names used throughout the notebook
immo_vis_dir = "../../data/immo_vis/"
parquet_extension = ".parquet"
metropole_appartement_file = "ventes-metropole-appartement" + parquet_extension
metropole_maison_file = "ventes-metropole-maison" + parquet_extension
# NOTE: the cleaned name is appended to the full raw file name, so the result
# contains the extension twice ("...appartement.parquet_step1_clean_.parquet").
metropole_appartement_file_cleaned = (
    metropole_appartement_file + "_step1_clean_" + parquet_extension
)

# Each label of dpe_ranking is written next to the number stored at the same
# position in dpe_ranking_num; the two names remain plain parallel lists.
dpe_ranking, dpe_ranking_num = map(list, zip(
    ('0', 10), ('A', 1), ('B', 2), ('C', 3), ('D', 4), ('E', 5),
    ('F', 6), ('F/G', 6.5), ('G', 7), ('NS', 9), ('Unknown', 10),
))

# Plot switches: every optional plot is disabled by default
LargePlotActive = MediumPlotActive = SmallPlotActive = HeatMapDisplay = False



# # functions

In [2]:
# functions  

def print_numerical_isna(df):
    """Print the number of missing values in each numeric column of ``df``.

    Columns are selected with ``select_dtypes(include='number')``. Nothing is
    returned; the per-column counts are written to standard output.
    """
    numeric_names = list(df.select_dtypes(include='number').columns)
    missing_per_column = df.loc[:, numeric_names].isna().sum()
    print(missing_per_column)

def print_categorial_isna(df):
    """Print the number of missing values in each ``category`` column of ``df``.

    Columns are selected with ``select_dtypes(include='category')``. Nothing is
    returned; the per-column counts are written to standard output.
    """
    category_names = list(df.select_dtypes(include='category').columns)
    missing_per_column = df.loc[:, category_names].isna().sum()
    print(missing_per_column)

def load_appartement_file(filename):
    """Read the parquet file ``filename`` located under ``immo_vis_dir``.

    Returns whatever ``pd.read_parquet`` produces for that path.
    """
    parquet_path = (Path(immo_vis_dir) / filename).as_posix()
    return pd.read_parquet(parquet_path)

def save_appartement_file(df, filename):
    """Write ``df`` as parquet to ``filename`` under ``immo_vis_dir``.

    The DataFrame index is stored in the file (``index=True``).
    """
    parquet_path = (Path(immo_vis_dir) / filename).as_posix()
    df.to_parquet(path=parquet_path, index=True)

def get_numerical_column_names(df):
    """Return the names of the columns whose dtype is neither object nor category.

    Note that the filter is an exclusion list: any other dtype (for example
    bool) is kept. Column order follows ``df.columns``.
    """
    excluded_dtypes = ["object", "category"]
    kept_names = []
    for name in df.columns:
        if df[name].dtype in excluded_dtypes:
            continue
        kept_names.append(name)
    return kept_names

def apply_processing(operator,X_train,X_test):
    """Fit ``operator`` on the training features and transform both splits.

    Parameters
    ----------
    operator : object exposing ``fit(X)`` and ``transform(X)``
        Typically a scaler or an encoder. It is fitted on ``X_train`` only.
    X_train, X_test : pandas.DataFrame
        Feature tables for the training and test splits.

    Returns
    -------
    (X_train_treated, X_test_treated) : tuple of pandas.DataFrame
        Transformed splits, labelled with the columns of ``X_train`` and with
        the row index of the corresponding input.
    """
    operator.fit(X_train)
    feature_names = X_train.columns
    # Keep each input's row index: without it the result gets a fresh
    # RangeIndex and no longer lines up with the matching target Series.
    X_train_treated = pd.DataFrame(
        operator.transform(X_train), columns=feature_names, index=X_train.index
    )
    X_test_treated = pd.DataFrame(
        operator.transform(X_test), columns=feature_names, index=X_test.index
    )
    return X_train_treated, X_test_treated


# # decode file and display infos

In [3]:
# load the cleaned appartement file and keep its dimensions for reporting

df = load_appartement_file(metropole_appartement_file_cleaned)
nb_rows, nb_cols = df.shape

In [4]:
#  print main information about the appartement file


print ( f" {metropole_appartement_file_cleaned} rows {nb_rows} columns {nb_cols}")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()



 ventes-metropole-appartement.parquet_step1_clean_.parquet rows 2077476 columns 44
<class 'pandas.core.frame.DataFrame'>
Index: 2077476 entries, immo-facile-57743459 to 134327817
Data columns (total 44 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   etage                      int64  
 1   surface                    int64  
 2   nb_pieces                  int64  
 3   prix_bien                  int64  
 4   balcon                     int64  
 5   eau                        int64  
 6   bain                       int64  
 7   mapCoordonneesLatitude     float64
 8   mapCoordonneesLongitude    float64
 9   annonce_exclusive          int64  
 10  nb_etages                  float64
 11  places_parking             float64
 12  cave                       int64  
 13  annee_construction         float64
 14  nb_toilettes               int64  
 15  nb_terraces                float64
 16  porte_digicode             int64  
 17  ascenseur              

# 1st Model Encoding

In [5]:
# check dtypes and the distinct values of three indicator columns
display(df.dtypes)
print("porte_digicode",df.porte_digicode.unique())
print("ascenceur",df.ascenseur.unique())
# the "cave" label must report the cave column itself, not ascenseur
print("cave",df.cave.unique())

etage                          int64
surface                        int64
nb_pieces                      int64
prix_bien                      int64
balcon                         int64
eau                            int64
bain                           int64
mapCoordonneesLatitude       float64
mapCoordonneesLongitude      float64
annonce_exclusive              int64
nb_etages                    float64
places_parking               float64
cave                           int64
annee_construction           float64
nb_toilettes                   int64
nb_terraces                  float64
porte_digicode                 int64
ascenseur                      int64
nb_logements_copro           float64
charges_copro                float64
logement_neuf                float64
CODE_IRIS                      int64
DEP                            int64
prix_m2_vente                float64
dpeL_num                     float64
ges_class_num                float64
chauf_energy_gaz               int64
c

porte_digicode [0 1]
ascenceur [0 1]
cave [0 1]


In [6]:
# build the feature table X and the target y (prix_bien)
print (df.columns)
# the target, prix_m2_vente and the two map coordinates are left out of X
X = df.drop(
    labels=[
        'prix_m2_vente',
        'prix_bien',
        'mapCoordonneesLatitude',
        'mapCoordonneesLongitude',
    ],
    axis="columns",
)
y = df['prix_bien']

Index(['etage', 'surface', 'nb_pieces', 'prix_bien', 'balcon', 'eau', 'bain',
       'mapCoordonneesLatitude', 'mapCoordonneesLongitude',
       'annonce_exclusive', 'nb_etages', 'places_parking', 'cave',
       'annee_construction', 'nb_toilettes', 'nb_terraces', 'porte_digicode',
       'ascenseur', 'nb_logements_copro', 'charges_copro', 'logement_neuf',
       'CODE_IRIS', 'DEP', 'prix_m2_vente', 'dpeL_num', 'ges_class_num',
       'chauf_energy_gaz', 'chauf_energy_elec', 'chauf_energy_bois',
       'chauf_energy_fioul', 'chauf_sys_climatisation',
       'chauf_sys_pompe-chaleur', 'chauf_sys_convecteur',
       'chauf_sys_radiateur', 'chauf_sys_chaudiere', 'chauf_sys_sol',
       'chauf_sys_poele-bois', 'chauffage_mode_individuel',
       'chauffage_mode_collectif', 'chauffage_mode_central', 'expo_has_nord',
       'expo_has_sud', 'expo_has_est', 'expo_has_ouest'],
      dtype='object')


In [7]:
#  train test split
from sklearn.model_selection import train_test_split
#  hold out 20% of the rows as test data; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# apply min max scaler
from sklearn.preprocessing import MinMaxScaler

# apply_processing fits the scaler on X_train, then transforms both splits
X_train_scaled, X_test_scaled = apply_processing(
    operator=MinMaxScaler(),
    X_train=X_train,
    X_test=X_test,
)

In [9]:

import time
import lightgbm as lgb
from sklearn.metrics import r2_score
from sklearn.linear_model import ElasticNetCV, LassoCV
from sklearn.metrics import root_mean_squared_error
from sklearn.svm import LinearSVR

# Models to benchmark, keyed by the name shown in the results table
models = {
    # 'LassoCV': LassoCV(),
    'ElasticNetCV': ElasticNetCV(),
    #'RandomForestRegressor': RandomForestRegressor(),
    # LinearSVR shuffles the data with a pseudo-random generator; a fixed seed
    # makes its score reproducible from one run to the next.
    'LinearSVR': LinearSVR(random_state=42),
    'lgb' : lgb.LGBMRegressor(
                            objective='regression',
                            metric='rmse',
                            num_leaves=31,
                            learning_rate=0.05,
                            feature_fraction=0.9,
                        )
}

# Fit and evaluate every model, timing training and prediction separately
results = []
for name, model in models.items():
    print(f"Entraînement de {name}...")

    # Training time. perf_counter() is a monotonic, high-resolution clock meant
    # for measuring durations; time.time() can jump when the system clock is
    # adjusted.
    start_time = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    fit_time = time.perf_counter() - start_time

    # Prediction time on the held-out split
    start_time = time.perf_counter()
    y_pred = model.predict(X_test_scaled)
    predict_time = time.perf_counter() - start_time

    # Metrics computed on the test split
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Durations are stored as formatted strings (seconds, 4 decimals)
    results.append({
        'Model': name,
        'RMSE': rmse,
        'R²': r2,
        'Fit_Time': f"{fit_time:.4f}s",
        'Predict_Time': f"{predict_time:.4f}s",
        'Total_Time': f"{fit_time + predict_time:.4f}s"
    })

# Show the comparison table, lowest RMSE first
results_df = pd.DataFrame(results).sort_values(by='RMSE')
print(results_df)


Entraînement de ElasticNetCV...
Entraînement de LinearSVR...
Entraînement de lgb...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 1661980, number of used features: 38
[LightGBM] [Info] Start training from score 305724.749736
          Model           RMSE        R²  Fit_Time Predict_Time Total_Time
2           lgb  130176.688743  0.782803   3.6704s      0.3076s    3.9780s
1     LinearSVR  262345.916051  0.117862   3.7550s      0.0211s    3.7761s
0  ElasticNetCV  278916.373204  0.002906  14.4736s      0.0478s   14.5215s
