In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math
%matplotlib inline

from matplotlib.collections import LineCollection
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import datasets, linear_model
from IPython.display import display
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor




import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
# Dans ce notebook, on se propose d'entraîner 4 modèles pour nos 2 targets
# le travail se décompose comme suit:

# A. Target = SiteEnergyuse
    # A.1 sans energystarscore
    # A.2 avec energystarscore
    
# B. Target = Emissions CO2
    # B.1 sans energystarscore
    # B.2 avec energystarscore

# Puis on suit ces quelques étapes pour étudier A.1, A.2, B.1 et B.2 :

    # 1. Préparation du dataset :
        # 1.a Séparation de X,y
        # 1.b Création d'une pipeline avec transformation (prise en compte ou non de l'energystarscore avec drop des N.A >> réduction du dataset)
        
    # 2. Création d'une baseline de régression (Dummy)
    
    # 3. Entrainement des modèles : avec paramètre par défaut puis en utilisant gridsearch en ajustant les params
        # 3.a Elastic Net
        # 3.b Random Forest
        # 3.c Support Vector Machine
        # 3.d Gradient Boost
        
    # 4. Tableau récapitulatif des scores de chaque modèle avec ses meilleurs paramètres
    
    # 5. Choix du meilleur modèle
    
    # 6. Barplot des 20 top features du meilleur modèle


In [None]:
# Load the prepared dataset; ',' is already pandas' default field separator.
data = pd.read_csv("/kaggle/input/datap3/data_p3.csv", low_memory=False)

In [None]:
# Keep the ENERGY STAR score in its own Series, then remove the column from
# the working frame (in place).

energy_star_score = data['ENERGYSTARScore']
data.drop(columns=['ENERGYSTARScore'], inplace=True)

In [None]:
# Display the working frame as the cell output.
data

In [None]:
# Column used as the regression target (`y = data[TARGET]`) by the cells below.
TARGET = 'log_SiteEnergyUseWN(kBtu)'

In [None]:
# Dataset preparation: X / y split, train / test split, preprocessing transformer.

# Both log-targets are removed from the features, whichever one is being predicted.
target_columns = ['log_GHGEmissions(MetricTonsCO2e)', 'log_SiteEnergyUseWN(kBtu)']

y = data[TARGET]
X = data.drop(columns=target_columns)  # drop() already returns a new frame

print(data.shape)

# Hold out 20 % of the rows; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

numeric_features = ['YearBuilt', 'log_PropertyGFATotal', 'log_LargestPropertyUseTypeGFA',
                    'log_largest/total', 'log_PropertyGFABuilding(s)', 'log_PropertyGFAParking']
categorical_features = ['Neighborhood', 'LargestPropertyUseType']

# Numeric columns are only standardised; no imputation step is applied.
numeric_transformer = StandardScaler()

# handle_unknown='ignore': a category unseen during fit does not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Only the columns listed above reach the model; ColumnTransformer drops the
# remaining columns of X by default.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

print("matrice X_train:", X_train.shape)
print("matrice y_train:", y_train.shape)

In [None]:
# Regression baseline

def baseline():
    """Cross-validate a DummyRegressor pipeline and return its summary row.

    Reads the notebook-level ``preprocessor``, ``X_train`` and ``y_train``.
    Prints the 5-fold R2 scores and the 5-fold negative MSE scores.

    Returns:
        dict: ``{'Modele': 'Dummy', 'Score': mean of the 5-fold R2 scores}``.
    """
    dummy = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', DummyRegressor())])

    print('--------------------------------')
    print("* Dummy")
    print('***')
    print("R2")
    # Computed once and reused for the summary row instead of re-running the CV.
    r2_scores = cross_val_score(dummy, X_train, y_train, cv=5, scoring="r2")
    print(r2_scores)
    print('***')
    print("-Mse")
    print(cross_val_score(dummy, X_train, y_train, cv=5, scoring="neg_mean_squared_error"))
    print('--------------------------------')

    return {'Modele': 'Dummy', 'Score': r2_scores.mean()}

In [None]:
# After a first test with linear regression, ridge, lasso and a decision tree,
# the study focuses on Elastic Net and Random Forest, which are the general cases.

def elastic_net():
    """Cross-validate, then grid-search, an ElasticNet regression pipeline.

    Reads the notebook-level ``preprocessor``, ``X_train`` and ``y_train``.
    Prints the 5-fold R2 / negative-MSE scores of the default model, displays
    the grid-search results table, then prints the best parameters and the
    coefficients of the best estimator.

    Returns:
        dict: ``{'Modele': 'Regression Elastic net', 'Score': best mean CV R2}``.
        ``Score`` is a single number, like the row returned by ``baseline()``.
    """
    # 1. Elastic net
    # The local is called `pipeline` so that it does not shadow this function's name.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', ElasticNet())])

    print('--------------------------------')
    print("1. Elastic net")
    print('***')
    print("R2")
    print(cross_val_score(pipeline, X_train, y_train, cv=5, scoring="r2"))
    print('***')
    print("-Mse")
    print(cross_val_score(pipeline, X_train, y_train, cv=5, scoring="neg_mean_squared_error"))
    print('--------------------------------\n')

    parameters = {
        "regressor__alpha": [0.001, 0.01, 0.1, 1, 10],  # coefficient multiplying the penalty term
        "regressor__max_iter": [5000],
        "regressor__l1_ratio": [0.001, 0.5, 1],         # 1 is equivalent to Lasso, 0 to Ridge
    }

    NetCV = GridSearchCV(estimator=pipeline,
                         param_grid=parameters,
                         scoring='r2',
                         cv=5,
                         verbose=0,
                         return_train_score=True)

    NetCV.fit(X_train, y_train)

    result = pd.DataFrame(NetCV.cv_results_)
    display(result)

    # best_score_ is the mean cross-validated score of the best candidate: one
    # number, even when several candidates tie for rank 1 in `result`.
    new_row = {'Modele': 'Regression Elastic net', 'Score': NetCV.best_score_}

    print(NetCV.best_params_)
    print('**********')
    print(NetCV.best_estimator_['regressor'].coef_)

    return new_row

In [None]:
# 2. Random Forest Regressor

def random_forest():
    """Cross-validate, then grid-search, a RandomForestRegressor pipeline.

    Reads the notebook-level ``preprocessor``, ``X_train`` and ``y_train``.
    Prints the 5-fold R2 / negative-MSE scores, displays the grid-search
    results table, then prints the best parameters and the feature
    importances of the best estimator.

    Returns:
        dict: ``{'Modele': 'Random Forest Regressor', 'Score': best mean CV R2}``.
        ``Score`` is a single number, like the row returned by ``baseline()``.
    """
    # Fixed seed (same value as the train/test split) so scores are reproducible.
    forest = Pipeline(steps=[('preprocessor', preprocessor),
                             ('regressor', RandomForestRegressor(random_state=0))])

    print('--------------------------------')
    print("2. Random Forest Regressor")
    print('***')
    print("R2")
    print(cross_val_score(forest, X_train, y_train, cv=5, scoring="r2"))
    print('***')
    print("-Mse")
    print(cross_val_score(forest, X_train, y_train, cv=5, scoring="neg_mean_squared_error"))
    print('--------------------------------\n')

    # Empty grid: GridSearchCV evaluates the single default configuration.
    # Candidate parameters left commented out:
    #   'min_samples_leaf': [1, 10, 100]   (minimum number of samples in a leaf)
    #   'min_samples_split'
    #   'regressor__max_features': ['auto', 'sqrt', 'log2']   (features considered per tree)
    #   'max_depth'
    #   'regressor__n_estimators': [500]
    parameters = {}

    forestCV = GridSearchCV(estimator=forest,
                            param_grid=parameters,
                            scoring='r2',
                            cv=5,
                            verbose=0,
                            return_train_score=True)

    forestCV.fit(X_train, y_train)

    result = pd.DataFrame(forestCV.cv_results_)
    display(result)

    # best_score_ is the mean cross-validated score of the best candidate: one
    # number instead of a list wrapping a pandas Series.
    new_row = {'Modele': 'Random Forest Regressor', 'Score': forestCV.best_score_}

    print(forestCV.best_params_)
    print('**********')
    print(forestCV.best_estimator_['regressor'].feature_importances_)

    return new_row

In [None]:
# 3. Support Vector Regression

def support_vector():
    """Cross-validate, then grid-search, a linear-kernel SVR pipeline.

    Reads the notebook-level ``preprocessor``, ``X_train`` and ``y_train``.
    Prints the 5-fold R2 / negative-MSE scores, displays the grid-search
    results table, then prints the best parameters and the coefficients of
    the best estimator.

    Returns:
        dict: ``{'Modele': 'Support Vector Regression', 'Score': best mean CV R2}``.
        ``Score`` is a single number, like the row returned by ``baseline()``.
    """
    svr = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', SVR(kernel="linear"))])

    print('--------------------------------')
    print("3. Support Vector Regression")
    print('***')
    print("R2")
    print(cross_val_score(svr, X_train, y_train, cv=5, scoring="r2"))
    print('***')
    print("-Mse")
    print(cross_val_score(svr, X_train, y_train, cv=5, scoring="neg_mean_squared_error"))
    print('--------------------------------\n')

    parameters = {
        'regressor__epsilon': [0.01, 0.1],
        'regressor__C': [0.1, 1, 10],  # regularisation parameter
    }

    svm_CV = GridSearchCV(estimator=svr,
                          param_grid=parameters,
                          scoring='r2',
                          cv=5,
                          verbose=2)

    svm_CV.fit(X_train, y_train)

    result = pd.DataFrame(svm_CV.cv_results_)
    display(result)

    # best_score_ is the mean cross-validated score of the best candidate: one
    # number instead of a list wrapping a pandas Series.
    new_row = {'Modele': 'Support Vector Regression', 'Score': svm_CV.best_score_}

    print(svm_CV.best_params_)
    print('**********')
    print(svm_CV.best_estimator_['regressor'].coef_)

    return new_row

In [None]:
# 4. Gradient Boosting

def gradient_boosting():
    """Cross-validate, then grid-search, a GradientBoostingRegressor pipeline.

    Reads the notebook-level ``preprocessor``, ``X_train`` and ``y_train``.
    Prints the 5-fold R2 / negative-MSE scores, displays the grid-search
    results table, then prints the best parameters and the feature
    importances of the best estimator.

    Returns:
        dict: ``{'Modele': 'Gradient Boosting', 'Score': best mean CV R2}``.
        ``Score`` is a single number, like the row returned by ``baseline()``.
    """
    # Fixed seed (same value as the train/test split) so scores are reproducible.
    gb = Pipeline(steps=[('preprocessor', preprocessor),
                         ('regressor', GradientBoostingRegressor(random_state=0))])

    print('--------------------------------')
    print("4. Gradient Boosting")
    print('***')
    print("R2")
    print(cross_val_score(gb, X_train, y_train, cv=5, scoring="r2"))
    print('***')
    print("-Mse")
    print(cross_val_score(gb, X_train, y_train, cv=5, scoring="neg_mean_squared_error"))
    print('--------------------------------\n')

    parameters = {
        'regressor__n_estimators': [250, 500, 750],
        'regressor__loss': ['huber'],  # combination of 'ls' and 'lad'
    }

    gb_CV = GridSearchCV(estimator=gb,
                         param_grid=parameters,
                         scoring='r2',
                         cv=5,
                         verbose=2)

    gb_CV.fit(X_train, y_train)

    result = pd.DataFrame(gb_CV.cv_results_)
    display(result)

    # best_score_ is the mean cross-validated score of the best candidate: one
    # number instead of a list wrapping a pandas Series.
    new_row = {'Modele': 'Gradient Boosting', 'Score': gb_CV.best_score_}

    print(gb_CV.best_params_)
    print('**********')
    print(gb_CV.best_estimator_['regressor'].feature_importances_)

    return new_row

In [None]:
def models():
    """Run the baseline and the four candidate models, then print their scores.

    Calls ``baseline()``, ``elastic_net()``, ``random_forest()``,
    ``support_vector()`` and ``gradient_boosting()`` in that order; each one
    returns a ``{'Modele': ..., 'Score': ...}`` row. The rows are collected
    into one DataFrame, which is printed.
    """
    # Table of the best result per model. The frame is built once from the list
    # of rows: DataFrame.append no longer exists in pandas >= 2.0.
    rows = [
        baseline(),
        elastic_net(),
        random_forest(),
        support_vector(),
        gradient_boosting(),
    ]
    scores = pd.DataFrame(rows)

    print(scores)

In [None]:
# Evaluate the baseline and the four candidate models on the current
# X_train / y_train (energy-use target, ENERGYSTARScore not among the features).
models()

In [None]:
# Best model, refit on the whole training set and scored on the held-out test set.
# NOTE(review): n_estimators / loss are hard-coded, presumably from the best
# parameters printed by models() -- re-check them after re-running the search.

gb = Pipeline(steps=[('preprocessor', preprocessor),
                     ('regressor', GradientBoostingRegressor(n_estimators=250, loss='huber',
                                                             random_state=0))])  # seeded for reproducibility

gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
# r2_score expects (y_true, y_pred); R2 is not symmetric, so the order matters.
r2_score(y_test, y_pred)

In [None]:
# Visualisation of the top 20 features

def top_features():
    """Bar-plot the 20 largest feature importances of the fitted ``gb`` pipeline.

    Reads the notebook-level ``gb`` (fitted pipeline with 'preprocessor' and
    'regressor' steps), ``numeric_features`` and ``categorical_features``.
    Feature names are the numeric columns followed by the one-hot encoded
    columns, i.e. the order in which the ColumnTransformer lists its
    ('num', 'cat') transformers.
    """
    # Look the encoder up by name rather than by position in `transformers_`.
    encoder = gb.named_steps['preprocessor'].named_transformers_['cat']

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back to the old name on older versions.
    if hasattr(encoder, 'get_feature_names_out'):
        onehot_features = encoder.get_feature_names_out(categorical_features)
    else:
        onehot_features = encoder.get_feature_names(categorical_features)
    feature_names = np.concatenate([numeric_features, onehot_features])

    # Importance of each feature as reported by the regressor.
    importances = gb.named_steps['regressor'].feature_importances_.flatten()

    # Building from a dict raises if names and importances differ in length,
    # instead of silently truncating as zip() would.
    df = pd.DataFrame({'feature': feature_names, 'value': importances})

    # Sort features by the absolute value of their importance.
    df['abs_value'] = df['value'].abs()
    df['colors'] = np.where(df['value'] > 0, 'green', 'red')
    top = df.sort_values('abs_value', ascending=False).head(20)

    # Draw the bar plot on the axes created here.
    fig, ax = plt.subplots(1, 1, figsize=(12, 7))
    sns.barplot(x='feature',
                y='value',
                data=top,
                palette=top['colors'].tolist(),
                ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=20)
    ax.set_title("Top 20 Features", fontsize=25)
    ax.set_ylabel("feature_importances", fontsize=22)
    ax.set_xlabel("Feature Name", fontsize=22)

In [None]:
# Bar plot of the 20 largest feature importances of the `gb` pipeline fitted above.
top_features()

In [None]:
# Models with the ENERGY STAR score

# Put the score column back, then keep only the rows without any missing value.
data['ENERGYSTARScore'] = energy_star_score
data = data.dropna()
print(data.shape)


# Dataset preparation: X / y split and preprocessing transformer.

y = data[TARGET]
X = data.drop(columns=['log_GHGEmissions(MetricTonsCO2e)', 'log_SiteEnergyUseWN(kBtu)'])


# 80 / 20 train / test split with a fixed seed.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Same numeric columns as before, with the ENERGY STAR score in first position.
numeric_features = ['ENERGYSTARScore'] + [
    'YearBuilt', 'log_PropertyGFATotal', 'log_LargestPropertyUseTypeGFA',
    'log_largest/total', 'log_PropertyGFABuilding(s)', 'log_PropertyGFAParking',
]
categorical_features = ['Neighborhood', 'LargestPropertyUseType']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

print("matrice X_train:", X_train.shape)
print("matrice y_train:", y_train.shape)

In [None]:
# Same model comparison, now with ENERGYSTARScore among the numeric features.
models()

In [None]:
# Best model, refit on the whole training set and scored on the held-out test set.
# NOTE(review): n_estimators / loss are hard-coded, presumably from the best
# parameters printed by models() -- re-check them after re-running the search.

gb = Pipeline(steps=[('preprocessor', preprocessor),
                     ('regressor', GradientBoostingRegressor(n_estimators=250, loss='huber',
                                                             random_state=0))])  # seeded for reproducibility

gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
# r2_score expects (y_true, y_pred); R2 is not symmetric, so the order matters.
r2_score(y_test, y_pred)

In [None]:
# Bar plot of the 20 largest feature importances of the `gb` pipeline fitted above.
top_features()

In [None]:
# GHG emissions target

# Reload the dataset from disk: the previous section removed rows from `data`.
data = pd.read_csv("/kaggle/input/datap3/data_p3.csv", low_memory=False)

# Set the ENERGY STAR score aside again and remove it from the working frame.

energy_star_score = data['ENERGYSTARScore']
data.drop(columns=['ENERGYSTARScore'], inplace=True)

TARGET = 'log_GHGEmissions(MetricTonsCO2e)'

# Dataset preparation: X / y split and preprocessing transformer.

y = data[TARGET]
X = data.drop(columns=['log_GHGEmissions(MetricTonsCO2e)', 'log_SiteEnergyUseWN(kBtu)'])


# 80 / 20 train / test split with a fixed seed.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

numeric_features = [
    'YearBuilt', 'log_PropertyGFATotal', 'log_LargestPropertyUseTypeGFA',
    'log_largest/total', 'log_PropertyGFABuilding(s)', 'log_PropertyGFAParking',
]
categorical_features = ['Neighborhood', 'LargestPropertyUseType']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

print("matrice X_train:", X_train.shape)
print("matrice y_train:", y_train.shape)

In [None]:
# Model comparison for the GHG-emissions target, ENERGYSTARScore not among the features.
models()

In [None]:
# Best model, refit on the whole training set and scored on the held-out test set.
# NOTE(review): n_estimators / loss are hard-coded, presumably from the best
# parameters printed by models() -- re-check them after re-running the search.

gb = Pipeline(steps=[('preprocessor', preprocessor),
                     ('regressor', GradientBoostingRegressor(n_estimators=250, loss='huber',
                                                             random_state=0))])  # seeded for reproducibility

gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
# r2_score expects (y_true, y_pred); R2 is not symmetric, so the order matters.
r2_score(y_test, y_pred)

In [None]:
# Bar plot of the 20 largest feature importances of the `gb` pipeline fitted above.
top_features()

In [None]:
# Models with the ENERGY STAR score

# Put the score column back, then keep only the rows without any missing value.
data['ENERGYSTARScore'] = energy_star_score
data = data.dropna()
# A bare `data.shape` in the middle of a cell displays nothing; print it, as
# the equivalent cell for the energy-use target does.
print(data.shape)


# Dataset preparation: X / y split and preprocessing transformer.

y = data[TARGET]
X = data.drop(columns=['log_GHGEmissions(MetricTonsCO2e)', 'log_SiteEnergyUseWN(kBtu)'])


# 80 / 20 train / test split with a fixed seed.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

numeric_features = ['ENERGYSTARScore', 'YearBuilt', 'log_PropertyGFATotal', 'log_LargestPropertyUseTypeGFA',
                    'log_largest/total', 'log_PropertyGFABuilding(s)', 'log_PropertyGFAParking']

numeric_transformer = StandardScaler()

categorical_features = ['Neighborhood', 'LargestPropertyUseType']

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

print("matrice X_train:", X_train.shape)
print("matrice y_train:", y_train.shape)

In [None]:
# Model comparison for the GHG-emissions target, with ENERGYSTARScore among the numeric features.
models()

In [None]:
# Best model, refit on the whole training set and scored on the held-out test set.
# NOTE(review): n_estimators / loss are hard-coded, presumably from the best
# parameters printed by models() -- re-check them after re-running the search.

gb = Pipeline(steps=[('preprocessor', preprocessor),
                     ('regressor', GradientBoostingRegressor(n_estimators=250, loss='huber',
                                                             random_state=0))])  # seeded for reproducibility

gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
# r2_score expects (y_true, y_pred); R2 is not symmetric, so the order matters.
r2_score(y_test, y_pred)

In [None]:
# Bar plot of the 20 largest feature importances of the `gb` pipeline fitted above.
top_features()