# Projet 3 - Anticipez les besoins en consommation de bâtiments

## Notebook de prédictions

Le but de ce notebook est d'utiliser le dataset nettoyé généré par l'analyse exploratoire, et de créer des modèles prédictifs pour la consommation énergétique et les émissions de CO₂.

In [None]:
import pandas as pd
from MLUtils import DataAnalysis, DataEngineering

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the cleaned dataset (CSV expected at data/clean.csv)
df = pd.read_csv('data/clean.csv')

In [None]:
# Show column dtypes and non-null counts for the loaded dataset
df.info()

In [None]:
# Summary statistics of the loaded dataset
df.describe()

#### Normalisation des données avec MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Split the columns by dtype: only int64/float64 columns are rescaled,
# every other column is carried over unchanged.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
non_numeric_columns = df.select_dtypes(exclude=['int64', 'float64']).columns

# Fit the scaler on the numeric columns and rescale them in one pass.
scaler = MinMaxScaler()
scaled_numeric_data = scaler.fit_transform(df[numeric_columns])

# Rebuild a DataFrame from the scaled array, reusing df's own index so the
# rows stay aligned with the untouched non-numeric columns.
df_scaled_numeric = pd.DataFrame(scaled_numeric_data, columns=numeric_columns, index=df.index)

# Put the scaled and the untouched columns back side by side.
# The former index is NOT turned into a column here: calling reset_index()
# without drop=True would add a spurious 'index' column that ends up among
# the model features.
df_scaled = pd.concat([df_scaled_numeric, df[non_numeric_columns]], axis=1)

# Quick visual check of the resulting frame
print(df_scaled.head())



TODO : supprimer les lignes sans ENERGYSTARScore.
Faire une prédiction de la colonne TotalGHGEmissions.
Ensuite, calculer l'importance des features (mesurer celles qui ont le plus d'impact).
Puis déterminer si la colonne ENERGYSTARScore est importante ou non.
Conclure sur l'utilité de garder cette colonne.

# Remplissage de la colonne ENERGYSTARScore en fonction des autres colonnes.

Le dataset ne contenait pas tous les scores ENERGYSTAR. Nous allons tout d'abord créer et appliquer un modèle qui va remplir cette colonne.

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

# Rows that already carry an ENERGYSTARScore are used for modelling; the rows
# where it is missing are kept aside in their own frame.
has_score = df_scaled['ENERGYSTARScore'].notna()
df_scaled_with_score = df_scaled[has_score]
df_scaled_without_score = df_scaled[~has_score]

# The score is the target; every other column is a feature.
y = df_scaled_with_score['ENERGYSTARScore']
X = df_scaled_with_score.drop(columns='ENERGYSTARScore')

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: plain linear regression fitted on the training split
model = LinearRegression().fit(X_train, y_train)

# Score the held-out split with the mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet


# Elastic net with hand-picked regularisation: alpha and l1_ratio are not
# tuned here and should be adjusted to the data.
elastic_net_model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42).fit(X_train, y_train)

# Score the held-out split with the mean squared error
y_pred = elastic_net_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression (alpha=0.1) fitted on the training split
lasso_model = Lasso(alpha=0.1, random_state=42).fit(X_train, y_train)

# Score the held-out split with the mean squared error
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f"Mean Squared Error (Lasso): {mse_lasso}")

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (alpha=1.0) fitted on the training split
ridge_model = Ridge(alpha=1.0, random_state=42).fit(X_train, y_train)

# Score the held-out split with the mean squared error
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f"Mean Squared Error (Ridge): {mse_ridge}")

In [None]:
from sklearn.svm import SVR

# Support vector regressor with an RBF kernel; no other hyper-parameter is
# set explicitly.
svr_model = SVR(kernel='rbf').fit(X_train, y_train)

# Score the held-out split with the mean squared error
y_pred_svr = svr_model.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f"Mean Squared Error (SVR): {mse_svr}")

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees (100 estimators, fixed seed) fitted on the training split
xgb_model = XGBRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split with the mean squared error
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"Mean Squared Error (XGBoost): {mse_xgb}")

Le modèle XGBoost donne les meilleurs résultats. TODO : l'utiliser pour remplir les valeurs manquantes de la colonne ENERGYSTARScore (non implémenté à ce stade).

In [None]:
# Display 10 randomly sampled rows of the dataset
df.sample(10)

In [None]:
# Summary statistics of the dataset right before it is exported
df.describe()

In [None]:
# Write the resulting dataframe to a csv file (the index is not written).
# NOTE(review): in the visible code `df` is never modified after being loaded
# from 'data/clean.csv', so the missing ENERGYSTARScore values are not filled
# in this export yet — confirm whether the fill step is still to be added.
df.to_csv('data/clean_with_score.csv', index=False)

## On reprend le dataset original, pour déterminer si la colonne energystarscore est pertinente

On reprend le csv initial

In [None]:
# Reload the cleaned dataset from disk, discarding the previous `df`
df = pd.read_csv('data/clean.csv')

In [None]:
# Keep only the observations that have an 'ENERGYSTARScore' value
df = df.dropna(subset=['ENERGYSTARScore'])

In [None]:
# Project helper from MLUtils, called in "bar" mode.
# NOTE(review): presumably plots how populated each column is — confirm
# against the MLUtils implementation.
DataAnalysis.show_columns_population(df, type="bar")

In [None]:
# Column dtypes and non-null counts after removing rows without ENERGYSTARScore
df.info()

#### Après avoir enlevé les observations ayant un ENERGYSTARScore vide, nous obtenons un jeu de 2524 observations.
Nous allons maintenant faire une prédiction pour la colonne TotalGHGEmissions.



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

# Target: total greenhouse-gas emissions
y = df['TotalGHGEmissions']

# Columns removed from the features: the target itself plus the other
# energy-use / emission measurements listed below. ENERGYSTARScore is
# deliberately kept so that its importance can be measured.
excluded_columns = [
    'TotalGHGEmissions',
    'SiteEnergyUseWN(kBtu)',
    'SiteEnergyUse(kBtu)',
    'GHGEmissionsIntensity',
    'SourceEUI(kBtu/sf)',
    'SourceEUIWN(kBtu/sf)',
    'SiteEUI(kBtu/sf)',
    'SiteEUIWN(kBtu/sf)',
    'EnergyUse_Age_Ratio',
]
X = df.drop(columns=excluded_columns)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with 100 trees and a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split and report the error, so the quality of the
# model behind the importance ranking is visible.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (Random Forest, TotalGHGEmissions): {mse}")

# Permutation importance on the test split (10 shuffles per feature),
# stored as one row per feature and sorted from most to least important.
feature_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importance.importances_mean})
importance_df = importance_df.sort_values(by='importance', ascending=False)


In [None]:
# Display the first 30 rows of the importance table
importance_df.head(30)

In [None]:
import plotly.express as px

# Keep only the features whose importance is strictly positive
# (zero and negative values are both discarded).
importance_df = importance_df.loc[importance_df['importance'] > 0]

# Ascending order for the horizontal bar chart
importance_df_sorted = importance_df.sort_values(by='importance')

# Horizontal bar chart: one bar per remaining feature
fig = px.bar(
    importance_df_sorted,
    x='importance',
    y='feature',
    orientation='h',
    title="Feature Importances in Predicting Total GHG Emissions",
)
fig.show()


Nous allons prendre en considération uniquement les features importantes pour la suite de nos recherches de modèles de prédiction.
Étant donné l'aspect subjectif de la colonne ENERGYSTARScore (et malgré sa relative importance pour prédire les émissions de CO₂), nous allons aussi l'enlever.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

# Target: total site energy use
y = df['SiteEnergyUse(kBtu)']

# Columns removed from the features: the target itself plus the other
# energy-use / emission measurements listed below. ENERGYSTARScore is
# deliberately kept so that its importance can be measured.
excluded_columns = [
    'TotalGHGEmissions',
    'SiteEnergyUseWN(kBtu)',
    'SiteEnergyUse(kBtu)',
    'GHGEmissionsIntensity',
    'SourceEUI(kBtu/sf)',
    'SourceEUIWN(kBtu/sf)',
    'SiteEUI(kBtu/sf)',
    'SiteEUIWN(kBtu/sf)',
    'EnergyUse_Age_Ratio',
]
X = df.drop(columns=excluded_columns)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with 100 trees and a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split and report the error, so the quality of the
# model behind the importance ranking is visible.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (Random Forest, SiteEnergyUse(kBtu)): {mse}")

# Permutation importance on the test split (10 shuffles per feature),
# stored as one row per feature and sorted from most to least important.
feature_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importance.importances_mean})
importance_df = importance_df.sort_values(by='importance', ascending=False)


In [None]:
# Display the first 30 rows of the importance table
importance_df.head(30)

In [None]:
import plotly.express as px

# Keep only the features whose importance is strictly positive
# (zero and negative values are both discarded).
importance_df = importance_df.loc[importance_df['importance'] > 0]

# Ascending order for the horizontal bar chart
importance_df_sorted = importance_df.sort_values(by='importance')

# Horizontal bar chart: one bar per remaining feature
fig = px.bar(
    importance_df_sorted,
    x='importance',
    y='feature',
    orientation='h',
    title="Feature Importances in Predicting Site Energy Use (kBtu)",
)
fig.show()

Nos analyses nous permettent de constater que les colonnes utiles pour les prédictions de TotalGHGEmissions (émissions de CO₂) et de SiteEnergyUse(kBtu) (consommation totale d'énergie) sont les suivantes :
- NumberofFloors
- NaturalGas(therms)
- PropertyGFAParking
- NaturalGas(kBtu)
- PrimaryPropertyType
- BuildingType
- Neighborhood_DOWNTOWN
- Neighborhood_GREATER DUWAMISH
- NumberofBuildings
- SteamUse(kBtu)

In [None]:
df = pd.read_csv('data/clean.csv')

# Restrict the dataset to the two prediction targets followed by the ten
# retained feature columns.
selected_columns = [
    'TotalGHGEmissions',
    'SiteEnergyUse(kBtu)',
    'NumberofFloors',
    'NaturalGas(therms)',
    'PropertyGFAParking',
    'NaturalGas(kBtu)',
    'PrimaryPropertyType',
    'BuildingType',
    'Neighborhood_DOWNTOWN',
    'Neighborhood_GREATER DUWAMISH',
    'NumberofBuildings',
    'SteamUse(kBtu)',
]
df = df[selected_columns]
df.info()

In [None]:
# Summary statistics of the reduced dataset
df.describe()

In [None]:
# Build the train/test split used by the hyper-parameter searches that follow.

from sklearn.model_selection import train_test_split

# Target: total greenhouse-gas emissions
y = df['TotalGHGEmissions']

# Features: everything except the two prediction targets.
# 'SiteEnergyUse(kBtu)' is excluded as well: it is the notebook's other
# target, and leaving it in would leak a measured consumption into the
# emissions model.
X = df.drop(columns=['TotalGHGEmissions', 'SiteEnergyUse(kBtu)'])

# 80/20 split; the seed is fixed so that the grid-search results are
# reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)








In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Base estimator with a fixed seed
model_rfr = RandomForestRegressor(random_state=42)

# 3 x 3 x 3 candidate combinations explored by the search
param_grid_rfr = dict(
    n_estimators=[20, 30, 40],
    max_depth=[12, 13, 11],
    max_features=[5, 6, 7],
)

# 3-fold cross-validated exhaustive search, using all available cores
grid_search_rfr = GridSearchCV(
    estimator=model_rfr,
    param_grid=param_grid_rfr,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

grid_search_rfr.fit(X_train, y_train)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Readable labels for the columns kept from grid_search_rfr.cv_results_
column_labels = {
    'param_n_estimators': 'N Estimators',
    'param_max_depth': 'Max Depth',
    'param_max_features': 'Max Features',
    'mean_test_score': 'Mean Test Score',
}
results = pd.DataFrame(grid_search_rfr.cv_results_)[list(column_labels)].rename(columns=column_labels)

# Long format: one row per (parameter, value) pair with its mean test score
results_melted = results.melt(id_vars='Mean Test Score', var_name='Parameter', value_name='Value')

# One scatter panel per hyper-parameter, each with its own axes
g = sns.FacetGrid(results_melted, col='Parameter', col_wrap=2, sharex=False, sharey=False)
g.map(sns.scatterplot, 'Value', 'Mean Test Score')

# Panel titles, then room at the top for the overall title
g.set_titles(col_template="{col_name}", fontweight='bold', fontsize=14)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('GridSearchCV Results Across Different Parameters', fontsize=16)

plt.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go

# Random-forest grid-search results, with the two pivot keys cast to plain
# int / str values.
cv_results = pd.DataFrame(grid_search_rfr.cv_results_).assign(
    param_n_estimators=lambda frame: frame['param_n_estimators'].astype(int),
    param_max_features=lambda frame: frame['param_max_features'].astype(str),
)

# Mean test score per (n_estimators, max_features) cell
pivot_df = cv_results.pivot_table(
    values='mean_test_score',
    index='param_n_estimators',
    columns='param_max_features',
)

# String labels on both axes
pivot_df.index = pivot_df.index.astype(str)
pivot_df.columns = pivot_df.columns.astype(str)

# Nested list of cell labels: each score formatted as a percentage
text = [[f"{score:.2%}" for score in scores] for scores in pivot_df.values]

heatmap = go.Heatmap(
    z=pivot_df.values,
    x=pivot_df.columns.tolist(),
    y=pivot_df.index.tolist(),
    colorscale='RdYlGn',
    reversescale=False,
    text=text,  # percentage labels
    texttemplate="%{text}",  # draw the label inside each cell
    hoverinfo="z+text",  # raw value and label on hover
)
fig = go.Figure(data=heatmap)

fig.update_layout(
    title='Grid Search Results',
    xaxis_title='max_features',
    yaxis_title='n_estimators',
    height=500,
)

fig.show()


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Base estimator with a fixed seed
model_gbr = GradientBoostingRegressor(random_state=42)

# First, coarse parameter grid for the gradient boosting regressor
param_grid_gbr = dict(
    n_estimators=[100, 150, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    max_features=['sqrt', 'log2'],
)

# 3-fold cross-validated exhaustive search, using all available cores
grid_search_gbr = GridSearchCV(
    estimator=model_gbr,
    param_grid=param_grid_gbr,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split
grid_search_gbr.fit(X_train, y_train)

In [None]:
import plotly.express as px
import pandas as pd

# Gradient-boosting grid-search results as a DataFrame (original column names)
results = pd.DataFrame(grid_search_gbr.cv_results_)

# Keep the three plotted columns and give them readable labels; `results`
# itself is left untouched.
plot_data = results.filter(
    regex='(param_n_estimators|param_learning_rate|mean_test_score)'
).rename(columns={
    'param_n_estimators': 'N Estimators',
    'param_learning_rate': 'Learning Rate',
    'mean_test_score': 'Mean Test Score',
})

# 3D scatter: two hyper-parameters against the score, coloured by the score
fig = px.scatter_3d(
    plot_data,
    x='N Estimators',
    y='Learning Rate',
    z='Mean Test Score',
    color='Mean Test Score',
    title='GridSearchCV Results',
)

fig.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mean test score per (n_estimators, learning_rate) pair, taken from the
# `results` frame that still carries the original cv_results_ column names.
pivot_table = results.pivot_table(
    index='param_n_estimators',
    columns='param_learning_rate',
    values='mean_test_score',
)

# Annotated heatmap of the pivoted scores
plt.figure(figsize=(10, 6))
sns.heatmap(pivot_table, cmap='YlGnBu', annot=True)
plt.title('GridSearchCV Mean Test Scores')
plt.xlabel('Learning Rate')
plt.ylabel('N Estimators')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Readable labels for the columns kept from grid_search_gbr.cv_results_
column_labels = {
    'param_n_estimators': 'N Estimators',
    'param_learning_rate': 'Learning Rate',
    'param_max_depth': 'Max Depth',
    'param_max_features': 'Max Features',
    'mean_test_score': 'Mean Test Score',
}
results = pd.DataFrame(grid_search_gbr.cv_results_)[list(column_labels)].rename(columns=column_labels)

# Long format: one row per (parameter, value) pair with its mean test score
results_melted = results.melt(id_vars='Mean Test Score', var_name='Parameter', value_name='Value')

# One scatter panel per hyper-parameter, each with its own axes
g = sns.FacetGrid(results_melted, col='Parameter', col_wrap=2, sharex=False, sharey=False)
g.map(sns.scatterplot, 'Value', 'Mean Test Score')

# Panel titles, then room at the top for the overall title
g.set_titles(col_template="{col_name}", fontweight='bold', fontsize=14)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('GridSearchCV Results Across Different Parameters', fontsize=16)

plt.show()


On affine les paramètres selon cette dernière visualisation

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fresh base estimator with a fixed seed
model_gbr = GradientBoostingRegressor(random_state=42)

# Second, refined parameter grid for the gradient boosting regressor
param_grid_gbr = dict(
    n_estimators=[200, 300, 400],
    learning_rate=[0.1, 0.15, 0.2],
    max_depth=[5, 6, 7],
    max_features=['sqrt', 'log2'],
)

# 3-fold cross-validated exhaustive search, using all available cores
grid_search_gbr = GridSearchCV(
    estimator=model_gbr,
    param_grid=param_grid_gbr,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split
grid_search_gbr.fit(X_train, y_train)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Columns kept from the refined grid_search_gbr.cv_results_, with their
# display labels
column_labels = {
    'param_n_estimators': 'N Estimators',
    'param_learning_rate': 'Learning Rate',
    'param_max_depth': 'Max Depth',
    'param_max_features': 'Max Features',
    'mean_test_score': 'Mean Test Score',
}
results = pd.DataFrame(grid_search_gbr.cv_results_)[list(column_labels)].rename(columns=column_labels)

# Long format: one row per (parameter, value) pair with its mean test score
results_melted = results.melt(id_vars='Mean Test Score', var_name='Parameter', value_name='Value')

# One scatter panel per hyper-parameter, each with its own axes
g = sns.FacetGrid(results_melted, col='Parameter', col_wrap=2, sharex=False, sharey=False)
g.map(sns.scatterplot, 'Value', 'Mean Test Score')

# Panel titles, then room at the top for the overall title
g.set_titles(col_template="{col_name}", fontweight='bold', fontsize=14)
plt.subplots_adjust(top=0.9)
g.fig.suptitle('GridSearchCV Results Across Different Parameters', fontsize=16)

plt.show()
