# Import packages and data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer, MaxAbsScaler
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Folder holding the raw CSV, relative to the notebook's working directory.
DATA_PATH = "../data/"
# Load the 2016 building energy benchmarking export; the file is decoded as ISO-8859-1.
data = pd.read_csv(DATA_PATH+"2016_Building_Energy_Benchmarking.csv", sep=",", encoding="iso-8859-1")

# Phase 1

## Cleaning

In [None]:
# Print dtype and non-null count for every column.
# The previous 'display.max_info_rows' = 50 setting capped the frame size above
# which info() skips the null check, so the counts were hidden for this dataset;
# asking for them explicitly does not depend on any display option.
data.info(verbose=True, show_counts=True)

In [None]:
# Share of missing values per column, restricted to columns with more than 10% missing.
data.isna().mean().loc[lambda share: share > 0.10]

In [None]:
# Prediction targets. The modelling sections are titled "Label 1 : energy" and
# "Label 2 - GHE", and the *_nrg_* / *_ghe_* variables are built from label_1 and
# label_2 respectively, so label_1 must be the energy use and label_2 the emissions.
label_1 = "SiteEnergyUse(kBtu)"
label_2 = "TotalGHGEmissions"

In [None]:
# Split the raw columns into categorical ones (listed by hand) and the rest.
all_columns = list(data.columns)
# "City" and "State" used to be listed twice; each column now appears once.
categorical_columns = ["BuildingType", "PrimaryPropertyType","PropertyName","Address","City","State",
                       "ZipCode","CouncilDistrictCode","Neighborhood","ListOfAllPropertyUseTypes",
                       "LargestPropertyUseType","SecondLargestPropertyUseType","ThirdLargestPropertyUseType","ComplianceStatus"]
# Everything not declared categorical is treated as numerical.
numerical_columns = [column for column in all_columns if column not in categorical_columns]

In [None]:
# First cleaning pass over the dataframe
def clean_data(df, to_drop):
    """Filter rows, fill missing values, add derived columns and drop columns.

    Rows are kept only when ``ComplianceStatus`` is "Compliant",
    ``NumberofBuildings`` and ``NumberofFloors`` are not 0 and ``DefaultData``
    is False. The shape is printed before and after cleaning.

    Args:
        df: raw dataframe; it is not modified.
        to_drop: column names to remove at the end; names absent from the
            frame are ignored.

    Returns:
        A new, cleaned dataframe.
    """
    print("--------- shape before cleaning ---------")
    print(df.shape)

    keep = (
        (df["ComplianceStatus"] == "Compliant")
        & (df["NumberofBuildings"] != 0)
        & (df["NumberofFloors"] != 0)
        & (df["DefaultData"] == False)  # noqa: E712 - element-wise comparison
    )
    # Explicit copy: every assignment below targets an independent frame rather
    # than a view of the caller's data (chained fillna(inplace=True) on a slice
    # is unreliable and does nothing under pandas copy-on-write).
    df = df[keep].copy()

    df["Outlier"] = df["Outlier"].fillna("not_outlier")
    for use_col in ("SecondLargestPropertyUseType", "ThirdLargestPropertyUseType"):
        df[use_col] = df[use_col].fillna("one_use")
        # No secondary/tertiary use means no floor area for it
        df.loc[df[use_col] == "one_use", use_col + "GFA"] = 0

    # Derived features
    df["Nb_PropertyUseTypes"] = df["ListOfAllPropertyUseTypes"].str.count(",") + 1
    df["Building_age"] = df["DataYear"] - df["YearBuilt"]
    df["LargestPropertyUseType"] = df["LargestPropertyUseType"].fillna(df["PrimaryPropertyType"])
    df["LargestPropertyUseTypeGFA"] = df["LargestPropertyUseTypeGFA"].fillna(df["PropertyGFATotal"])
    df["%_LargestPropertyUseType"] = df["LargestPropertyUseTypeGFA"] / df["PropertyGFATotal"]

    df = df.drop(columns=[col for col in to_drop if col in df.columns])

    print("--------- shape after cleaning ---------")
    print(df.shape)
    return df

# Refresh a column list after some columns were dropped
def update_columns_list(list_1, dropped):
    """Return the entries of ``list_1`` that are not in ``dropped``, order preserved."""
    return list(filter(lambda name: name not in dropped, list_1))

In [None]:
# Check the GFA: does parking area + building area equal the reported total?
check_GPA = (data["PropertyGFAParking"] + data["PropertyGFABuilding(s)"]).eq(data["PropertyGFATotal"])
check_GPA.value_counts()

In [None]:
# Drop the multifamily building types. The explicit copy makes the two column
# assignments below write to an independent frame instead of a slice of the
# original data (which triggers SettingWithCopyWarning).
data = data[~data.BuildingType.isin(["Multifamily LR (1-4)","Multifamily MR (5-9)","Multifamily HR (10+)"])].copy()
# Parking and building floor areas as shares of the total floor area
data["%_GFAParking"] = data["PropertyGFAParking"] / data["PropertyGFATotal"]
data["%_GFABuilding"] = data["PropertyGFABuilding(s)"] / data["PropertyGFATotal"]


In [None]:
# Columns handed to clean_data() for removal once the derived features exist.
columns_to_drop = [
    "DataYear",
    "PropertyName",
    "Address",
    "City",
    "State",
    "Latitude",
    "Longitude",
    "YearBuilt",
    "TaxParcelIdentificationNumber",
    "YearsENERGYSTARCertified",
    "Comments",
    "DefaultData",
    "ComplianceStatus",
    "SiteEnergyUseWN(kBtu)",
    "PropertyGFAParking",
    "PropertyGFABuilding(s)",
    "ListOfAllPropertyUseTypes",
]

In [None]:
# Clean the dataset, then remove the dropped columns from the categorical list.
data = clean_data(data, columns_to_drop)
categorical_columns = update_columns_list(categorical_columns,columns_to_drop)

In [None]:
# Number of rows per building type after cleaning.
data.BuildingType.value_counts()

In [None]:
# Allow up to 50 columns in dataframe displays, then preview the first rows.
pd.set_option('display.max_columns',50)
data.head()

In [None]:
# Keep only the rows that have an ENERGY STAR score. Dropping the missing values
# directly avoids writing a 'no_score' string sentinel into the column, which
# turned the numeric score into an object-dtype column.
data = data.dropna(subset=["ENERGYSTARScore"])

In [None]:
# Columns kept for modelling: descriptive features plus the energy/emission columns.
features = [
    'BuildingType',
    'PrimaryPropertyType',
    'ZipCode',
    'Neighborhood',
    'Building_age',
    'NumberofBuildings',
    'NumberofFloors',
    'PropertyGFATotal',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    '%_LargestPropertyUseType',
    'SourceEUI(kBtu/sf)',
    'SiteEnergyUse(kBtu)',
    'SteamUse(kBtu)',
    'Electricity(kBtu)',
    'NaturalGas(kBtu)',
    'TotalGHGEmissions',
    '%_GFAParking',
    '%_GFABuilding',
    'Nb_PropertyUseTypes',
]

In [None]:
# Remove the dropped columns from both column lists.
# NOTE(review): the categorical list was already filtered right after clean_data();
# repeating it here has no further effect.
categorical_columns = update_columns_list(categorical_columns,columns_to_drop)
numerical_columns = update_columns_list(numerical_columns,columns_to_drop)

In [None]:
# Modelling frame; copied so later column assignments do not write into a slice of `data`.
df = data[features].copy()
cat_features = [col for col in features if col in categorical_columns]
# Previously filtered with `categorical_columns` too, which made num_features a
# duplicate of cat_features. The complement is used rather than `numerical_columns`
# because that list was built before the engineered columns were added.
num_features = [col for col in features if col not in categorical_columns]

## Exploration label

In [None]:
# Distribution of the total GHG emissions (trailing ';' hides the returned arrays in the notebook).
plt.hist(df["TotalGHGEmissions"]);

In [None]:
# Summary statistics of the total GHG emissions.
df["TotalGHGEmissions"].describe()

In [None]:
# Minimum, median and 98th percentile of the total GHG emissions.
df["TotalGHGEmissions"].quantile([0, .5, 0.98])

In [None]:
# Keep rows at or below the 98th percentile of emissions, then rows with strictly positive emissions.
df = df.loc[lambda frame: frame["TotalGHGEmissions"] <= frame["TotalGHGEmissions"].quantile(0.98)]
df = df.loc[lambda frame: frame["TotalGHGEmissions"] > 0]

In [None]:
# Distribution of the second target after the outlier filter.
plt.hist(df[label_2]);

In [None]:
# Summary statistics of the first target.
df[label_1].describe()

In [None]:
# Summary statistics of the second target.
df[label_2].describe()

## Categorical features

In [None]:
# Number of distinct values of each categorical feature, to choose an encoding per column.
df.BuildingType.nunique(), df.PrimaryPropertyType.nunique(), df.ZipCode.nunique(), df.Neighborhood.nunique(), df.LargestPropertyUseType.nunique()

In [None]:
def pipe_ohe(df):
    """One-hot encode ``LargestPropertyUseType`` and ``BuildingType``.

    The ``LargestPropertyUseType_*`` indicator columns do not hold 1/0: where
    the indicator is set they hold the row's ``%_LargestPropertyUseType``
    value, and 0 elsewhere. The two source columns and
    ``%_LargestPropertyUseType`` are removed from the result.

    Args:
        df: frame with the three columns above; it is not modified.

    Returns:
        A new frame: remaining original columns, then the weighted use-type
        columns, then the ``BuildingType_*`` indicators.
    """
    share = df["%_LargestPropertyUseType"]
    use_type_dummies = pd.get_dummies(df['LargestPropertyUseType'], prefix='LargestPropertyUseType')
    building_type_dummies = pd.get_dummies(df['BuildingType'], prefix='BuildingType')

    # Vectorised replacement of the per-row apply(): keep the share where the
    # indicator is set, put 0 everywhere else.
    use_type_shares = pd.DataFrame(
        {col: share.where(use_type_dummies[col] == 1, 0) for col in use_type_dummies.columns},
        index=df.index,
    )

    remaining = df.drop(columns=['LargestPropertyUseType', 'BuildingType', '%_LargestPropertyUseType'])
    return pd.concat([remaining, use_type_shares, building_type_dummies], axis=1)
    

In [None]:
# One-hot encode the features; both targets are read from the un-encoded frame.
X = pipe_ohe(df)
y1 = df[[label_1]]
y2 = df[[label_2]]

# Hold out 15% of the rows for the label_1 target, stratified on the building type.
X_enrg_train, X_enrg_test, y_nrg_train, y_nrg_test = train_test_split(X,y1,test_size=0.15, random_state=42, stratify=df["BuildingType"])
print(X_enrg_train.shape, X_enrg_test.shape)

# Split the data again with the same settings to get the train/test sets for the label_2 target.
X_ghge_train, X_ghge_test, y_ghe_train, y_ghe_test = train_test_split(X,y2,test_size=0.15, random_state=42, stratify=df["BuildingType"])
print(X_ghge_train.shape, X_ghge_test.shape)

In [None]:
# Un-encoded rows of each test set, used later to analyse errors per raw feature.
# They are copied because an "error_pred" column is added to them afterwards;
# writing into a .loc slice of `df` raises SettingWithCopyWarning.
index_X_enrg_test = X_enrg_test.index
df_nrg_selection = df.loc[index_X_enrg_test].copy()

index_X_ghge_test = X_ghge_test.index
df_ghe_selection = df.loc[index_X_ghge_test].copy()

In [None]:
def pipe(df, means_zipcode=None, 
                means_neighborhood=None,
                means_ppropertype=None, 
                col_for_mean="PropertyGFATotal", 
                scaler=None,
                label_1=label_1,
                label_2=label_2):
    """Mean-encode the location/type columns and drop targets and leaky columns.

    ``ZipCode``, ``Neighborhood`` and ``PrimaryPropertyType`` are each replaced
    by the mean of ``col_for_mean`` over the rows sharing the same value
    (columns ``Code_postal_encoded``, ``neighborhood_encoded`` and
    ``PrimaryPropertyType_encoded``). Values absent from a mapping get the
    overall mean of ``col_for_mean`` in ``df``.

    Args:
        df: frame produced by ``pipe_ohe``; it is not modified.
        means_zipcode, means_neighborhood, means_ppropertype: mappings from
            category to mean. When ``None`` they are computed from ``df``
            (training use); pass the training mappings to encode a test set.
        col_for_mean: column averaged to build the encodings.
        scaler: unused, kept so existing calls stay valid.
        label_1, label_2: target columns removed from the result.

    Returns:
        ``(encoded_df, means_zipcode, means_neighborhood, means_ppropertype,
        encoded_df.columns)``.

    Note:
        The fallback mean is always taken from ``df`` itself, so on a test set
        with unseen categories it is derived from test rows.
    """
    # Work on a copy: the original added and dropped columns on the caller's frame
    df = df.copy()
    mean_encoding_nan = df[col_for_mean].mean()

    encodings = (
        ('ZipCode', 'Code_postal_encoded', means_zipcode),
        ('Neighborhood', 'neighborhood_encoded', means_neighborhood),
        ('PrimaryPropertyType', 'PrimaryPropertyType_encoded', means_ppropertype),
    )
    fitted = []
    for source_col, encoded_col, means in encodings:
        if means is None:
            means = df.groupby(source_col)[col_for_mean].mean()
        # Assign the filled Series back: fillna(inplace=True) on a column
        # selection is chained assignment and is not guaranteed to update df
        df[encoded_col] = df[source_col].map(means).fillna(mean_encoding_nan)
        fitted.append(means)
    means_zipcode, means_neighborhood, means_ppropertype = fitted

    # Drop the raw categorical columns (ZipCode was left in by mistake although
    # it is documented as removed like the other two), the targets, and the
    # energy columns.
    df = df.drop(columns=['ZipCode', 'Neighborhood', 'PrimaryPropertyType',
                          label_1, label_2,
                          'SourceEUI(kBtu/sf)', 'SteamUse(kBtu)', 'Electricity(kBtu)', 'NaturalGas(kBtu)'])

    return df, means_zipcode, means_neighborhood, means_ppropertype, df.columns
    

In [None]:
# Columns of the modelling frame before encoding.
df.columns

In [None]:
# Fit the mean encodings on the label_1 training set, then apply those same mappings to its test set.
X_nrg_train, means_zipcode_enrg, means_neighborhood_enrg, means_ppropertype_enrg, columns_enrg = pipe(X_enrg_train,col_for_mean=label_1)
X_nrg_test, means_zipcode_enrg, means_neighborhood_enrg, means_ppropertype_enrg, a = pipe(X_enrg_test,
                                                                        means_zipcode=means_zipcode_enrg,
                                                                        means_neighborhood=means_neighborhood_enrg,
                                                                        means_ppropertype=means_ppropertype_enrg,
                                                                        col_for_mean=label_1)

In [None]:
# Fit the mean encodings on the label_2 training set, then apply those same mappings to its test set.
X_ghe_train, means_zipcode_ghe, means_neighborhood_ghe, means_ppropertype_ghe, columns_ghe = pipe(X_ghge_train,col_for_mean=label_2)
X_ghe_test, means_zipcode_ghe, means_neighborhood_ghe, means_ppropertype_ghe, b = pipe(X_ghge_test, 
                                                                        means_zipcode=means_zipcode_ghe,
                                                                        means_neighborhood=means_neighborhood_ghe,
                                                                        means_ppropertype=means_ppropertype_ghe,
                                                                        col_for_mean=label_2)

In [None]:
# Shapes of the encoded train/test sets for both targets.
# (A stray trailing comma used to turn the first print into a throwaway tuple.)
print(X_nrg_train.shape, X_nrg_test.shape)
print(X_ghe_train.shape, X_ghe_test.shape)

## Modèles

### Results functions

In [None]:
def results_printed(model):
    """Print the best score and the best parameters of a fitted search object."""
    print("Best score : {}".format(model.best_score_))
    print("Best params :\n {}".format(model.best_params_))

In [None]:
# Empty result tables, one per target; each evaluated model adds one row.
metrics_df_nrg, metrics_df_ghe = (
    pd.DataFrame(columns=["label","modèle","r2_train","r2_test","mae","rmse","mean_fit_time","best_params"])
    for _ in range(2)
)

def add_metrics_table(label,y_pred,y_test,df,model, X_train, y_train):
    """Return ``df`` with one extra row of metrics for a fitted search object.

    Args:
        label: name of the predicted target, stored in the "label" column.
        y_pred: predictions for the test set.
        y_test: true test-set values.
        df: metrics table to extend; it is not modified.
        model: fitted search object exposing ``score``, ``best_estimator_``,
            ``best_params_`` and ``cv_results_``.
        X_train, y_train: training data, used for the training score.

    Returns:
        A new dataframe with the row appended. Scores are stored as strings
        formatted with three decimals.
    """
    r2_train = model.score(X_train, y_train)
    r2_test = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # mean_squared_error returns the MSE; the column is an RMSE, so take the root
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    new_row = pd.DataFrame([{
        "label": label,
        "modèle": model.best_estimator_,
        "r2_train": '{:.3f}'.format(r2_train),
        "r2_test": '{:.3f}'.format(r2_test),
        "mae": '{:.3f}'.format(mae),
        "rmse": '{:.3f}'.format(rmse),
        "mean_fit_time": model.cv_results_['mean_fit_time'].mean(),
        "best_params": model.best_params_,
    }])

    # DataFrame.append was removed in pandas 2.0. An empty table is handled
    # apart so that no empty frame is passed to concat.
    if df.empty:
        ordered = list(df.columns) + [col for col in new_row.columns if col not in df.columns]
        return new_row.reindex(columns=ordered)
    return pd.concat([df, new_row], ignore_index=True)

# Metrics computed for every candidate during cross-validation (passed as `scoring` to each GridSearchCV).
scoring = ["r2", "neg_mean_absolute_error", "neg_root_mean_squared_error"]

def evaluate_prediction(label, X_test, y_test, metrics_df, model, X_train, y_train):
    """Predict on the test set, record the metrics and plot predictions vs. truth.

    Returns the updated metrics table and the predictions. The table is also
    displayed after the plot.
    """
    predictions = model.predict(X_test)
    updated_metrics = add_metrics_table(label, predictions, y_test, metrics_df, model, X_train, y_train)

    # Scatter of predictions against true values, with a dashed line marking a perfect prediction
    lowest, highest = y_test.min(), y_test.max()
    plt.scatter(predictions, y_test)
    plt.plot([lowest, highest], [lowest, highest], 'k--', lw=2)
    plt.xlabel('Prédictions')
    plt.ylabel('Valeurs réelles')
    plt.title('Comparaison des prédictions et des valeurs réelles')
    plt.show()

    display(updated_metrics)
    return updated_metrics, predictions

In [None]:
def hist_errors(y_test, y_pred):
    """Plot a histogram of the prediction errors ``y_test - y_pred``.

    A red vertical line marks the mean error and a dotted horizontal segment,
    one standard deviation wide, is centred on it.
    """
    # Flatten both inputs: a (n, 1) prediction array subtracted from a (n,)
    # target would otherwise broadcast to an (n, n) matrix.
    y_test = np.asarray(y_test).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    diff = y_test - y_pred
    mean = diff.mean()
    std = diff.std()
    plt.hist(diff, bins=40)
    plt.vlines(mean, 0, 30, color='red', label=f'mean = {mean:.2f}')
    plt.hlines(30, mean - 1/2 * std, mean + 1/2 * std, color='red', label=f'std = {std:.2f}', ls='dotted')
    plt.title('Histogram of prediction errors')
    plt.xlabel('prediction error')
    plt.ylabel('Frequency')

    plt.legend(loc='upper right')
    plt.show()

In [None]:
def show_fetures_coef(model, columns):
    """Plot the coefficients of the best estimator as horizontal bars.

    Args:
        model: fitted search object whose ``best_estimator_`` has ``coef_``.
        columns: feature names, in the order of the coefficients.

    Raises:
        ValueError: if the number of names and coefficients differ.

    Tick labels of non-zero coefficients are drawn in red, the others in black.
    """
    coef = np.asarray(model.best_estimator_.coef_)
    # Single-target models fitted on a 2-D y expose coef_ with shape (1, n_features)
    if coef.ndim == 2 and coef.shape[0] == 1:
        coef = coef[0]
    names = list(columns)
    if coef.ndim != 1 or len(names) != len(coef):
        raise ValueError(
            f"Expected one coefficient per column, got shape {coef.shape} for {len(names)} columns"
        )

    # Reverse so the first feature ends up at the top of the chart
    names = names[::-1]
    values = coef[::-1]

    plt.figure(figsize=(14,8))
    plt.barh(names, values)

    for tick, value in zip(plt.gca().get_yticklabels(), values):
        tick.set_color('red' if value != 0 else 'black')

    plt.show()

In [None]:
def error_per_category(df, y_test, y_pred, feature):
    """Bar chart of the mean signed error per category of ``feature``.

    Args:
        df: test-set rows aligned with ``y_test`` / ``y_pred``. An
            ``error_pred`` column is written to it.
        y_test: true values.
        y_pred: predicted values.
        feature: categorical column to group by.

    Each bar is annotated with the number of rows in its category.
    """
    # Flatten both sides so a (n, 1) prediction cannot broadcast to (n, n)
    df["error_pred"] = np.asarray(y_test).reshape(-1) - np.asarray(y_pred).reshape(-1)
    # Select the column before averaging: mean() over the whole frame fails
    # on non-numeric columns in pandas >= 2.0
    df_grouped = df.groupby(feature)["error_pred"].mean()
    counts = df[feature].value_counts()

    fig, ax = plt.subplots()
    # String labels keep bars and annotations on the same categorical positions
    ax.bar(df_grouped.index.astype(str), df_grouped.values)

    last_count = None
    for position, (category, mean_error) in enumerate(df_grouped.items()):
        last_count = counts[category]
        ax.text(position, mean_error, str(last_count), ha='center', va='bottom')

    # Caption explaining the annotation, using the last bar's count as the example
    if last_count is not None:
        ax.text(0.05, -0.2, f'{last_count}:Nombre d\'occurrences dans le jeu de test', transform=ax.transAxes)

    ax.set_xlabel(feature)
    ax.set_ylabel('Moyenne des erreurs')
    fig.set_size_inches(8, 4)
    # 10% of extra room above and below the bars
    ax.margins(y=0.1)

    plt.show()

In [None]:
def error_all_category(df, y_test, y_pred, features):
    """Plot the mean absolute error per category, one subplot per feature.

    Args:
        df: test-set rows aligned with ``y_test`` / ``y_pred``. An
            ``error_pred`` column is written to it.
        y_test: true values.
        y_pred: predicted values.
        features: categorical columns to analyse; nothing is drawn when empty.

    Each bar is annotated with the number of rows of ``df`` in its category.
    """
    if len(features) == 0:
        return

    # squeeze=False keeps a 2-D axes array even for a single feature
    fig, axs = plt.subplots(nrows=len(features), ncols=1, figsize=(8, 4 * len(features)),
                            gridspec_kw={'hspace': 0.6}, squeeze=False)
    axs = axs[:, 0]
    axs[0].set_ylabel('Moyenne des erreurs')

    # Flatten both sides so a (n, 1) prediction cannot broadcast to (n, n)
    df["error_pred"] = np.abs(np.asarray(y_test).reshape(-1) - np.asarray(y_pred).reshape(-1))

    for ax, feature in zip(axs, features):
        # Select the column before averaging: mean() over the whole frame
        # fails on non-numeric columns in pandas >= 2.0
        df_grouped = df.groupby(feature)["error_pred"].mean()
        # Counts come from the frame passed in, not from a notebook global
        counts = df[feature].value_counts()

        ax.bar(df_grouped.index.astype(str), df_grouped.values)
        for position, (category, mean_error) in enumerate(df_grouped.items()):
            ax.text(position, mean_error, str(counts[category]), ha='center', va='bottom')

        ax.set_title(feature)
        # 10% of extra room above and below the bars
        ax.margins(y=0.1)
        # Smaller, slanted category labels
        ax.tick_params(axis='x', labelrotation=45, labelsize=7)

    fig.suptitle('Erreurs en fonction des variables catégorielles', y=0.95, fontsize=16)
    plt.show()


In [None]:
def error_discrete_feature(df, col):
    """Scatter the ``error_pred`` column of ``df`` against the column ``col``."""
    _, axis = plt.subplots()
    axis.scatter(df[col], df['error_pred'], label=col)
    axis.set(xlabel=col, ylabel='error_pred')
    plt.show()

In [None]:
def error_all_discr_category(df, y_test, y_pred, features):
    """Scatter the absolute prediction error against each feature, one subplot each.

    Args:
        df: test-set rows aligned with ``y_test`` / ``y_pred``. An
            ``error_pred`` column is written to it.
        y_test: true values.
        y_pred: predicted values.
        features: columns to plot against; nothing is drawn when empty.
    """
    if len(features) == 0:
        return

    # squeeze=False keeps a 2-D axes array even for a single feature
    fig, axs = plt.subplots(nrows=len(features), ncols=1, figsize=(8, 4 * len(features)),
                            gridspec_kw={'hspace': 0.3}, squeeze=False)

    # Flatten both sides so a (n, 1) prediction cannot broadcast to (n, n)
    df["error_pred"] = np.abs(np.asarray(y_test).reshape(-1) - np.asarray(y_pred).reshape(-1))

    for ax, feature in zip(axs[:, 0], features):
        ax.scatter(df[feature], df['error_pred'], label=feature)
        ax.set_title(feature)

    fig.suptitle('Erreurs en fonction des variables catégorielles', y=0.92, fontsize=16)
    plt.show()

# Numeric columns intended for the error scatter plots.
disc_col_viz = ["Building_age","%_GFAParking","%_GFABuilding","LargestPropertyUseTypeGFA","NumberofBuildings","NumberofFloors"]

### Pipeline with a scaler

## Label 1 : energy

### Dummy regression

In [None]:
# Baseline: constant predictions, used as the reference for the real models.
dummy_regr = DummyRegressor()
# `quantile` only matters for strategy="quantile". Crossing it with "mean" and
# "median" fitted identical candidates three times and added a meaningless
# quantile value to best_params_, so the grid is split in two.
parameters = [{"strategy" : ["mean","median"]},
            {"strategy" : ["quantile"],
             "quantile" : [0.25,0.5,0.75]}]
grid_dummy_nrg = GridSearchCV(dummy_regr, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit = "r2")
grid_dummy_nrg.fit(X_nrg_train, y_nrg_train)

In [None]:
# Evaluate the baseline on the test set and add its row to the label_1 metrics table.
metrics_df_nrg, y_pred_dummy = evaluate_prediction(label_1,X_nrg_test, y_nrg_test, metrics_df_nrg, grid_dummy_nrg,X_nrg_train, y_nrg_train)

### Linear Regression

In [None]:
# Ordinary least squares, with and without an intercept, compared by 5-fold cross-validation.
# `reg` is reused later for the label_2 linear regression.
reg = LinearRegression()
parameters = {'fit_intercept': [True, False]}
grid_reglin_nrg = GridSearchCV(reg,
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_reglin_nrg.fit(X_nrg_train, y_nrg_train)

In [None]:
# Evaluate the linear regression on the test set and add its row to the label_1 metrics table.
metrics_df_nrg, y_pred_reg = evaluate_prediction(label_1, X_nrg_test, y_nrg_test, metrics_df_nrg, grid_reglin_nrg,X_nrg_train, y_nrg_train)

### Lasso Regression

In [None]:
# Lasso regression; the penalty strength alpha is tuned by 5-fold cross-validation.
lasso = Lasso(random_state=42)
alpha_space = np.logspace(-4, 2, 15)   # 15 log-spaced alpha values from 1e-4 to 1e2
parameters = {"alpha" : alpha_space}
grid_lasso_nrg = GridSearchCV(lasso, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_lasso_nrg.fit(X_nrg_train, y_nrg_train)

In [None]:
# Evaluate the Lasso model on the test set and add its row to the label_1 metrics table.
metrics_df_nrg, y_pred_lasso = evaluate_prediction(label_1,X_nrg_test, y_nrg_test, metrics_df_nrg, grid_lasso_nrg, X_nrg_train, y_nrg_train)

In [None]:
# Coefficients of the best Lasso model; features with a non-zero coefficient are labelled in red.
show_fetures_coef(grid_lasso_nrg, columns_enrg)

### Ridge regression

In [None]:
# Ridge regression; the penalty strength alpha is tuned by 5-fold cross-validation.
ridge = Ridge(random_state=42)
alpha_space = np.logspace(-5, 3, 15)   # 15 log-spaced alpha values from 1e-5 to 1e3
parameters = {"alpha" : alpha_space}
grid_ridge_nrg = GridSearchCV(ridge, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_ridge_nrg.fit(X_nrg_train, y_nrg_train)

In [None]:
# Evaluate the Ridge model on the test set and add its row to the label_1 metrics table.
metrics_df_nrg, y_pred_ridge = evaluate_prediction(label_1,X_nrg_test, y_nrg_test, metrics_df_nrg, grid_ridge_nrg, X_nrg_train, y_nrg_train)

In [None]:
# Coefficients of the best Ridge model for label_1.
show_fetures_coef(grid_ridge_nrg, columns_enrg)

### Elastic Net

In [None]:
# Elastic Net; alpha and the L1/L2 mixing ratio are tuned jointly (15 x 15 candidates, 5 folds each).
elnet = ElasticNet(max_iter=20000,random_state=42)
alpha_space = np.logspace(-5, 3, 15)   # 15 log-spaced alpha values from 1e-5 to 1e3
l1_space = np.logspace(-3, 0, 15)   # 15 log-spaced l1_ratio values from 1e-3 to 1
parameters = {"alpha" : alpha_space,
            "l1_ratio" : l1_space}
grid_elnet_nrg = GridSearchCV(elnet, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_elnet_nrg.fit(X_nrg_train, y_nrg_train)

# Evaluate on the test set and add the row to the label_1 metrics table.
metrics_df_nrg, y_pred_elnet = evaluate_prediction(label_1, X_nrg_test, y_nrg_test, metrics_df_nrg, grid_elnet_nrg, X_nrg_train, y_nrg_train)

In [None]:
# Distribution of the Elastic Net prediction errors on the test set.
hist_errors(y_nrg_test, y_pred_elnet)

In [None]:
# error_discrete_feature() reads an "error_pred" column, which no earlier cell
# creates. Compute the absolute Elastic Net error first; assign() returns a new
# frame, so nothing is written into a slice.
df_nrg_selection = df_nrg_selection.assign(
    error_pred=np.abs(np.array(y_nrg_test).reshape(-1,) - np.array(y_pred_elnet).reshape(-1,))
)
error_discrete_feature(df_nrg_selection, 'PropertyGFATotal')

In [None]:
# Categorical columns used for the per-category error analysis.
cat_features_next = ['BuildingType', 'PrimaryPropertyType', 'Neighborhood', 'LargestPropertyUseType']

In [None]:
# Mean absolute error of the Elastic Net predictions per category, one subplot per feature.
# The error column is computed once; assign() returns a new frame, so nothing
# is written into a slice.
df_nrg_selection = df_nrg_selection.assign(
    error_pred=np.abs(np.array(y_nrg_test).reshape(-1,) - np.array(y_pred_elnet).reshape(-1,))
)

fig, axs = plt.subplots(nrows=len(cat_features_next), ncols=1, figsize=(8, 4 * len(cat_features_next)), gridspec_kw={'hspace': 0.6})
axs[0].set_ylabel('Moyenne des erreurs')

for ax, feature in zip(axs, cat_features_next):
    # Select the column before averaging: mean() over the whole frame fails on
    # non-numeric columns in pandas >= 2.0
    df_grouped = df_nrg_selection.groupby(feature)['error_pred'].mean()
    counts = df_nrg_selection[feature].value_counts()

    ax.bar(df_grouped.index, df_grouped.values)

    # Annotate each bar with the number of test rows in its category
    # (a separate counter avoids shadowing the subplot loop variable)
    for position, (category, mean_error) in enumerate(df_grouped.items()):
        ax.text(position, mean_error, str(counts[category]), ha='center', va='bottom')

    ax.set_title(feature)

    # 10% of extra room above and below the bars
    ax.margins(y=0.1)

    # Smaller, slanted category labels
    ax.tick_params(axis='x', labelrotation=45, labelsize=7)

fig.suptitle('Erreurs en fonction des variables catégorielles', y=0.95, fontsize=16)

plt.show()


### Decision tree Regressor

In [None]:
# Decision tree; leaf size and depth (5 to 9) are tuned by 5-fold cross-validation.
dectree = DecisionTreeRegressor(random_state=42)
parameters = {'min_samples_leaf': [1, 2, 3],
            'max_depth': range(5,10)}

grid_dectree_nrg = GridSearchCV(dectree, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_dectree_nrg.fit(X_nrg_train, y_nrg_train)

# Evaluate on the test set and add the row to the label_1 metrics table.
metrics_df_nrg, y_pred_dectree = evaluate_prediction(label_1,X_nrg_test, y_nrg_test, metrics_df_nrg, grid_dectree_nrg, X_nrg_train, y_nrg_train)

In [None]:
# Distribution of the decision tree prediction errors on the test set.
hist_errors(y_nrg_test, y_pred_dectree)

### RandomForestRegressor

In [None]:
# Random forest; leaf size, split criterion and features per split are tuned by 5-fold cross-validation.
randomforest = RandomForestRegressor(random_state=42)
# NOTE(review): in 'max_features' the integer 1 means a single feature per split,
# whereas the float 1.0 would mean all features - confirm which one is intended.
parameters = {'min_samples_leaf': [1, 2, 3],
            'criterion': ["squared_error", "absolute_error", "friedman_mse", "poisson"],
            'max_features': [0.3,0.5,0.75,1,"sqrt","log2"]}

grid_randomforest_nrg = GridSearchCV(randomforest, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
# The target is flattened to 1-D for the fit.
grid_randomforest_nrg.fit(X_nrg_train, y_nrg_train.values.ravel())

# Evaluate on the test set and add the row to the label_1 metrics table.
metrics_df_nrg, y_pred_randomforest = evaluate_prediction(label_1,X_nrg_test, y_nrg_test, metrics_df_nrg, grid_randomforest_nrg, X_nrg_train, y_nrg_train)

In [None]:
# Distribution of the random forest prediction errors on the test set.
hist_errors(y_nrg_test, y_pred_randomforest)

### XGboost

In [None]:
# Compare several scalers placed in front of an XGBoost regressor.
# Scalers to try
scalers = [
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('MaxAbsScaler', MaxAbsScaler())
]

# Grid for GridSearchCV: the 'scaler' pipeline step is swapped for each candidate
parameters = {
    'scaler': [scaler for _, scaler in scalers]
}

# Pipeline made of the scaler followed by the model
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('xgboost', xgboost.XGBRegressor(random_state=42))
])

grid_xgboost_nrg = GridSearchCV(pipeline, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_xgboost_nrg.fit(X_nrg_train, y_nrg_train)

# Evaluate on the test set and add the row to the label_1 metrics table.
metrics_df_nrg, y_pred_xgboost = evaluate_prediction(label_1, X_nrg_test, y_nrg_test, metrics_df_nrg, grid_xgboost_nrg, X_nrg_train, y_nrg_train)

In [None]:
# XGBoost without a scaler, tuned over depth, learning rate, L1/L2 regularisation and column sampling.
# The grid holds 3 * 5 * 5 * 5 * 4 = 1500 candidates, each fitted on 5 folds.
# This rebinds grid_xgboost_nrg and y_pred_xgboost from the scaler comparison.
XGboost = xgboost.XGBRegressor(random_state=42)
eta = np.logspace(-5, 0.3, 5)   # 5 log-spaced learning rates from 1e-5 to 10**0.3
L1_reg = np.logspace(-10, 10, 5)   # 5 log-spaced values from 1e-10 to 1e10
L2_reg = np.logspace(-10, 10, 5)   # 5 log-spaced values from 1e-10 to 1e10
parameters = {'max_depth': [3,7,11],
            'learning_rate': eta,
            'reg_alpha': L1_reg,
            'reg_lambda' : L2_reg,
            'colsample_bytree' : [0.25,0.5,0.75,1]
            }

grid_xgboost_nrg = GridSearchCV(XGboost, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_xgboost_nrg.fit(X_nrg_train, y_nrg_train)

# Evaluate on the test set and add the row to the label_1 metrics table.
metrics_df_nrg, y_pred_xgboost = evaluate_prediction(label_1, X_nrg_test, y_nrg_test, metrics_df_nrg, grid_xgboost_nrg,X_nrg_train, y_nrg_train)

In [None]:
# Distribution of the XGBoost prediction errors on the test set.
hist_errors(y_nrg_test, y_pred_xgboost)

### LightGBM

In [None]:
# LightGBM; number of trees (1000 to 1400) and learning rate are tuned by 5-fold cross-validation.
lightboost = LGBMRegressor(random_state=42)
parameters = {'n_estimators': range(1000,1500,100),
            'learning_rate': [0.01,0.001,0.03,0.1,0.3]}

grid_lightboost_nrg = GridSearchCV(lightboost, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_lightboost_nrg.fit(X_nrg_train, y_nrg_train)

# Evaluate on the test set and add the row to the label_1 metrics table.
metrics_df_nrg, y_pred_lgboost = evaluate_prediction(label_1,X_nrg_test, y_nrg_test, metrics_df_nrg, grid_lightboost_nrg, X_nrg_train, y_nrg_train)

## Label 2 - GHE

### Dummy Regression

In [None]:
# Baseline for label_2: constant prediction of the training mean or median.
dummy_regr = DummyRegressor()
parameters = {"strategy" : ("mean","median")}
grid_dummy_ghe = GridSearchCV(dummy_regr, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_dummy_ghe.fit(X_ghe_train, y_ghe_train)

In [None]:
# Evaluate the baseline on the test set and add its row to the label_2 metrics table.
metrics_df_ghe, y_pred_dummy_ghe = evaluate_prediction(label_2,X_ghe_test, y_ghe_test, metrics_df_ghe, grid_dummy_ghe, X_ghe_train, y_ghe_train)

### Linear Regression

In [None]:
# Linear regression for label_2; `reg` is the LinearRegression instance created in the label_1 section.
parameters = {'fit_intercept': [True, False]}
grid_reglin_ghe = GridSearchCV(reg,
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_reglin_ghe.fit(X_ghe_train, y_ghe_train)

# Evaluate on the test set and add the row to the label_2 metrics table.
metrics_df_ghe, y_pred_reg_ghe = evaluate_prediction(label_2,X_ghe_test, y_ghe_test, metrics_df_ghe, grid_reglin_ghe, X_ghe_train, y_ghe_train)

### Lasso Regression

In [None]:
# Lasso regression for label_2, with a higher iteration cap than in the label_1 section.
lasso = Lasso(max_iter=10000, tol=0.0001,random_state=42)
alpha_space = np.logspace(-5, 5, 15)   # 15 log-spaced alpha values from 1e-5 to 1e5
parameters = {"alpha" : alpha_space}
grid_lasso_ghe = GridSearchCV(lasso, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_lasso_ghe.fit(X_ghe_train, y_ghe_train)

# Evaluate on the test set and add the row to the label_2 metrics table.
metrics_df_ghe, y_pred_lasso_ghe = evaluate_prediction(label_2, X_ghe_test, y_ghe_test, metrics_df_ghe, grid_lasso_ghe, X_ghe_train, y_ghe_train)

In [None]:
# Coefficients of the best Lasso model for label_2.
show_fetures_coef(grid_lasso_ghe, columns_ghe)

### vérification du scaling des données

In [None]:
# One histogram per encoded test feature, to check the scale of the data.
# X_nrg_test is a DataFrame, so columns are selected with .iloc (NumPy-style
# [:, idx] indexing fails on a DataFrame). The column count comes from the
# plotted frame and the grid grows with it instead of being fixed at 7x5.
n_features = X_nrg_test.shape[1]
n_cols = 5
n_rows = -(-n_features // n_cols)  # ceiling division
fig = plt.figure(figsize=(16, 12))
for feat_idx in range(n_features):
    ax = fig.add_subplot(n_rows, n_cols, (feat_idx+1))
    # Cast to float so boolean indicator columns can be binned
    h = ax.hist(X_nrg_test.iloc[:, feat_idx].astype(float), bins=50, color='steelblue', density=True, edgecolor='none')
    ax.set_title(columns_enrg[feat_idx], fontsize=10)

### Ridge 

In [None]:
# Ridge regression for label_2; alpha is tuned by 5-fold cross-validation.
ridge = Ridge(random_state=42)
alpha_space = np.logspace(-5, 3, 15)   # 15 log-spaced alpha values from 1e-5 to 1e3
parameters = {"alpha" : alpha_space}
grid_ridge_ghe = GridSearchCV(ridge, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_ridge_ghe.fit(X_ghe_train, y_ghe_train)

# Evaluate on the test set and add the row to the label_2 metrics table.
metrics_df_ghe, y_pred_ridge_ghe = evaluate_prediction(label_2,X_ghe_test, y_ghe_test, metrics_df_ghe, grid_ridge_ghe, X_ghe_train, y_ghe_train)

In [None]:
# Coefficients of the best Ridge model for label_2 (the label_1 model and
# columns were plotted here by copy-paste).
show_fetures_coef(grid_ridge_ghe, columns_ghe)

### Elastic Net

In [None]:
# Elastic Net for label_2; alpha and the L1/L2 mixing ratio are tuned jointly (15 x 15 candidates, 5 folds each).
elnet = ElasticNet(max_iter=20000,random_state=42)
alpha_space = np.logspace(-5, 3, 15)   # 15 log-spaced alpha values from 1e-5 to 1e3
l1_space = np.logspace(-3, 0, 15)   # 15 log-spaced l1_ratio values from 1e-3 to 1
parameters = {"alpha" : alpha_space,
            "l1_ratio" : l1_space}
grid_elnet_ghe = GridSearchCV(elnet, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_elnet_ghe.fit(X_ghe_train, y_ghe_train)

# Evaluate on the test set and add the row to the label_2 metrics table.
metrics_df_ghe, y_pred_elnet_ghe = evaluate_prediction(label_2,X_ghe_test, y_ghe_test, metrics_df_ghe, grid_elnet_ghe, X_ghe_train, y_ghe_train)

### Decision tree Regressor

In [None]:
# Decision tree for label_2; leaf size (1 to 5) and depth (5 to 9) are tuned by 5-fold cross-validation.
dectree = DecisionTreeRegressor(random_state=42)
parameters = {'min_samples_leaf': [1, 2, 3,4,5],
            'max_depth': range(5,10)}

grid_dectree_ghe = GridSearchCV(dectree, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_dectree_ghe.fit(X_ghe_train, y_ghe_train)

# Evaluate on the test set and add the row to the label_2 metrics table.
metrics_df_ghe, y_pred_dectree_ghe = evaluate_prediction(label_2,X_ghe_test, y_ghe_test, metrics_df_ghe, grid_dectree_ghe, X_ghe_train, y_ghe_train)

### RandomForest

In [None]:
# Random forest for label_2; same grid as in the label_1 section.
randomforest = RandomForestRegressor(random_state=42)
# NOTE(review): in 'max_features' the integer 1 means a single feature per split,
# whereas the float 1.0 would mean all features - confirm which one is intended.
parameters = {'min_samples_leaf': [1, 2, 3],
            'criterion': ["squared_error", "absolute_error", "friedman_mse", "poisson"],
            'max_features': [0.3,0.5,0.75,1,"sqrt","log2"]}

grid_randomforest_ghe = GridSearchCV(randomforest, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
# The target is flattened to 1-D for the fit.
grid_randomforest_ghe.fit(X_ghe_train, y_ghe_train.values.ravel())

# Evaluate on the test set and add the row to the label_2 metrics table.
metrics_df_ghe, y_pred_forest_ghe = evaluate_prediction(label_2,X_ghe_test, y_ghe_test, metrics_df_ghe, grid_randomforest_ghe, X_ghe_train, y_ghe_train)

### XGBoost

In [None]:
# Candidate hyper-parameters noted for an XGBoost search. The cell held a bare
# dict fragment, which is a syntax error; it is now a named dict and is not
# used by any search yet.
# NOTE(review): 'max_features' looks copied from the random-forest grid -
# confirm that it is a valid XGBoost parameter before using it.
xgboost_param_candidates = {
    'n_estimators': range(1000,1500,100),
    'eta': [0.01,0.001,0.03,0.1,0.3],
    'max_features': [0.3,0.5,0.75,1,"sqrt","log2"],
}

In [None]:
# XGBoost for label_2 with default hyper-parameters: the empty grid yields a
# single candidate, which is cross-validated and refit.
parameters = {}

# `XGBoost` was never defined (the label_1 section names its estimator
# `XGboost`), so a fresh regressor is built here.
grid_xgboost_ghe = GridSearchCV(xgboost.XGBRegressor(random_state=42), 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_xgboost_ghe.fit(X_ghe_train, y_ghe_train)

# Evaluate on the test set and add the row to the label_2 metrics table.
metrics_df_ghe, y_pred_xgboost_ghe = evaluate_prediction(label_2, X_ghe_test, y_ghe_test, metrics_df_ghe, grid_xgboost_ghe, X_ghe_train, y_ghe_train)

### LightGBM

In [None]:
# LightGBM for label_2; number of trees (1000 to 1400) and learning rate are tuned by 5-fold cross-validation.
lightboost = LGBMRegressor(random_state=42)
parameters = {'n_estimators': range(1000,1500,100),
            'learning_rate': [0.01,0.001,0.03,0.1,0.3]}

grid_lightboost_ghe = GridSearchCV(lightboost, 
                            param_grid=parameters,
                            cv=5,
                            n_jobs=-1, 
                            scoring=scoring,
                            refit="r2")
grid_lightboost_ghe.fit(X_ghe_train, y_ghe_train)

# Evaluate on the test set and add the row to the label_2 metrics table.
metrics_df_ghe, y_pred_lgboost_ghe = evaluate_prediction(label_2,X_ghe_test, y_ghe_test, metrics_df_ghe, grid_lightboost_ghe, X_ghe_train, y_ghe_train)