# Projet 3 - Anticipez les besoins en consommation de bâtiments

## Notebook de predictions

Le but de ce notebook est d'utiliser le dataset clean généré par l'analyse exploratoire, et de créer des modèles prédictifs pour les consommations énergétiques et l'émission de CO2.

In [1]:
import pandas as pd
from MLUtils import DataAnalysis, DataEngineering

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Importation du jeu de données
df = pd.read_csv('data/clean.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3337 entries, 0 to 3336
Data columns (total 31 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   BuildingType                        3337 non-null   int64  
 1   PrimaryPropertyType                 3337 non-null   int64  
 2   NumberofBuildings                   3337 non-null   float64
 3   NumberofFloors                      3337 non-null   float64
 4   PropertyGFAParking                  3337 non-null   int64  
 5   ENERGYSTARScore                     2524 non-null   float64
 6   SiteEUI(kBtu/sf)                    3337 non-null   float64
 7   SiteEUIWN(kBtu/sf)                  3337 non-null   float64
 8   SourceEUI(kBtu/sf)                  3337 non-null   float64
 9   SourceEUIWN(kBtu/sf)                3337 non-null   float64
 10  SiteEnergyUse(kBtu)                 3337 non-null   float64
 11  SiteEnergyUseWN(kBtu)               3337 no

In [4]:
df.describe()

Unnamed: 0,BuildingType,PrimaryPropertyType,NumberofBuildings,NumberofFloors,PropertyGFAParking,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),...,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,Age,EnergyUse_Age_Ratio
count,3337.0,3337.0,3337.0,3337.0,3337.0,2524.0,3337.0,3337.0,3337.0,3337.0,...,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0
mean,0.492958,1.002098,1.107582,1.19869,8063.891819,67.947306,55.075187,57.41843,135.028289,138.631885,...,0.112376,0.074618,0.125562,0.05484,0.082709,0.065328,0.028169,0.048846,54.464789,1.611193
std,0.500025,1.221925,2.118095,1.029832,32475.541333,26.851849,56.393783,57.270796,139.534209,139.315264,...,0.315876,0.262813,0.331405,0.227701,0.275483,0.247141,0.16548,0.215579,33.14861,2.348184
min,0.0,0.0,0.0,-9.21034,0.0,1.0,0.4,0.0,0.0,-2.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.006195
25%,0.0,0.0,1.0,0.693197,0.0,53.0,28.0,29.5,74.900002,78.699997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.507143
50%,0.0,0.0,1.0,1.386319,0.0,75.0,38.799999,41.0,96.5,101.400002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,0.861702
75%,1.0,2.0,1.0,1.609458,0.0,90.0,60.599998,64.5,144.800003,149.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75.0,1.857778
max,1.0,3.0,111.0,4.595121,512608.0,100.0,834.400024,834.400024,2620.0,2620.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,123.0,53.963634


#### Normalisation des données avec MinMax

In [5]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Chargement du DataFrame original
df = pd.read_csv('data/clean.csv')

# Sélection des colonnes numériques
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Sélection des colonnes non numériques
non_numeric_columns = df.select_dtypes(exclude=['int64', 'float64']).columns

# Création du scaler
scaler = MinMaxScaler()

# Application de la normalisation sur les colonnes numériques
scaled_numeric_data = scaler.fit_transform(df[numeric_columns])

# Création d'un DataFrame pour les données normalisées
df_scaled_numeric = pd.DataFrame(scaled_numeric_data, columns=numeric_columns)

# Combinaison des données numériques normalisées avec les données non numériques
df_scaled = pd.concat([df_scaled_numeric, df[non_numeric_columns].reset_index(drop=True)], axis=1)

# Affichage des premières lignes pour vérifier la création de df_scaled
print(df_scaled.head())



   BuildingType  PrimaryPropertyType  NumberofBuildings  NumberofFloors  \
0           1.0             0.333333           0.009009        0.847147   
1           1.0             0.333333           0.009009        0.840844   
2           1.0             0.333333           0.009009        0.936145   
3           1.0             0.333333           0.009009        0.833941   
4           1.0             0.333333           0.009009        0.876517   

   PropertyGFAParking  ENERGYSTARScore  SiteEUI(kBtu/sf)  SiteEUIWN(kBtu/sf)  \
0            0.000000         0.595960          0.097482            0.101031   
1            0.029387         0.606061          0.113189            0.117330   
2            0.383759         0.424242          0.114628            0.117090   
3            0.000000         0.555556          0.132374            0.135786   
4            0.120950         0.747475          0.137170            0.142258   

   SourceEUI(kBtu/sf)  SourceEUIWN(kBtu/sf)  ...  \
0            0.0

# Remplissage de la colonne ENERGYSTARScore en fonction des autres colonnes.

Le dataset ne contenait pas tous les scores ENERGYSTAR. Nous allons tout d'abord créer et appliquer un modèle qui va remplir cette colonne.

In [6]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

# Séparation du DataFrame en deux ensembles
df_scaled_with_score = df_scaled[df_scaled['ENERGYSTARScore'].notna()]
df_scaled_without_score = df_scaled[df_scaled['ENERGYSTARScore'].isna()]

# Séparation des caractéristiques (X) et de la cible (y) - Assurez-vous de retirer 'ENERGYSTARScore'
X = df_scaled_with_score.drop('ENERGYSTARScore', axis=1)
y = df_scaled_with_score['ENERGYSTARScore']

# Division en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [7]:
from sklearn.linear_model import LinearRegression

# Création et entraînement du modèle
model = LinearRegression()
model.fit(X_train, y_train)

# Évaluation du modèle
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")





Mean Squared Error: 0.07471191637131842


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet


# alpha et l1_ratio doivent être ajustés en fonction de vos données
elastic_net_model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
elastic_net_model.fit(X_train, y_train)

# Prédictions et évaluation
y_pred = elastic_net_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 0.07643471088243874


In [9]:
from sklearn.linear_model import Lasso

# Création et entraînement du modèle Lasso
lasso_model = Lasso(alpha=0.1, random_state=42)
lasso_model.fit(X_train, y_train)

# Prédictions et évaluation
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f"Mean Squared Error (Lasso): {mse_lasso}")


Mean Squared Error (Lasso): 0.07643471088243874


In [10]:
from sklearn.linear_model import Ridge

# Création et entraînement du modèle Ridge
ridge_model = Ridge(alpha=1.0, random_state=42)
ridge_model.fit(X_train, y_train)

# Prédictions et évaluation
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f"Mean Squared Error (Ridge): {mse_ridge}")


Mean Squared Error (Ridge): 0.06954010769662959


In [11]:
from sklearn.svm import SVR

# Création et entraînement du modèle SVR
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)

# Prédictions et évaluation
y_pred_svr = svr_model.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f"Mean Squared Error (SVR): {mse_svr}")


Mean Squared Error (SVR): 0.04620641297302419


In [12]:
from xgboost import XGBRegressor

# Création et entraînement du modèle XGBoost
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Prédictions et évaluation
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"Mean Squared Error (XGBoost): {mse_xgb}")


Mean Squared Error (XGBoost): 0.03701328577625559


Le modèle XGBoost donne les meilleurs résultats. On l'utilise pour remplir les valeurs manquantes.

In [None]:
df.sample(10)

Unnamed: 0,BuildingType,PrimaryPropertyType,NumberofBuildings,NumberofFloors,PropertyGFAParking,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),...,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,Age,EnergyUse_Age_Ratio
1178,1,1,1.0,0.693197,16518,27.0,67.400002,71.199997,211.800003,223.600006,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,22,3.063636
745,0,0,1.0,1.609458,0,0.923998,28.799999,29.700001,75.599998,76.599998,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,13,2.215385
2400,0,0,1.0,1.791776,23180,86.0,46.799999,49.900002,98.199997,103.599998,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,24,1.95
1843,0,0,1.0,1.386319,0,0.906111,19.5,20.200001,61.299999,63.299999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24,0.8125
1525,0,0,1.0,1.098646,0,99.0,21.1,22.700001,57.299999,60.700001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27,0.781481
2932,0,0,1.0,1.386319,0,86.0,20.799999,22.1,65.199997,69.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,56,0.371429
639,0,0,1.0,0.693197,0,68.0,54.299999,59.400002,80.099998,85.5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,64,0.848437
2242,0,3,1.0,1.609458,0,75.0,39.0,40.900002,122.400002,128.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,32,1.21875
2612,0,0,1.0,1.386319,0,74.0,20.9,21.799999,65.5,68.5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,24,0.870833
2197,0,0,1.0,1.386319,0,66.0,29.6,31.6,93.0,99.099998,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,38,0.778947


In [None]:
df.describe()

Unnamed: 0,BuildingType,PrimaryPropertyType,NumberofBuildings,NumberofFloors,PropertyGFAParking,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),...,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,Age,EnergyUse_Age_Ratio
count,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,...,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0,3337.0
mean,0.492958,1.002098,1.107582,1.19869,8063.891819,51.532093,55.075187,57.41843,135.028289,138.631885,...,0.112376,0.074618,0.125562,0.05484,0.082709,0.065328,0.028169,0.048846,54.464789,1.611193
std,0.500025,1.221925,2.118095,1.029832,32475.541333,37.176856,56.393783,57.270796,139.534209,139.315264,...,0.315876,0.262813,0.331405,0.227701,0.275483,0.247141,0.16548,0.215579,33.14861,2.348184
min,0.0,0.0,0.0,-9.21034,0.0,-0.042329,0.4,0.0,0.0,-2.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.006195
25%,0.0,0.0,1.0,0.693197,0.0,1.0,28.0,29.5,74.900002,78.699997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.507143
50%,0.0,0.0,1.0,1.386319,0.0,62.0,38.799999,41.0,96.5,101.400002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,0.861702
75%,1.0,2.0,1.0,1.609458,0.0,86.0,60.599998,64.5,144.800003,149.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75.0,1.857778
max,1.0,3.0,111.0,4.595121,512608.0,100.0,834.400024,834.400024,2620.0,2620.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,123.0,53.963634


In [None]:
# write the resulting dataframe to a csv file
df.to_csv('data/clean_with_score.csv', index=False)