# Projet de MLOps
## Prédiction de la consommation annuelle d'électricité

Auteurs: Lilou Masson, Paul Hamann Cossart

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from preparation import data_cleaning, create_dfs
from modelisation import test_model, search_params

## Préparation des données

In [2]:
# Read the raw semicolon-delimited export, clean it, then derive the two
# frames returned by create_dfs (bound here as df_gaz and df_ele).
raw_csv = 'DATA/consommation-quotidienne-brute.csv'
df = data_cleaning(pd.read_csv(raw_csv, sep=';'))
df_gaz, df_ele = create_dfs(df)

In [3]:
# Preview the first five rows (head's default) of the cleaned frame.
df.head()

Unnamed: 0_level_0,dat,heu,heu_float,heu_sin,heu_cos,day,day_name,is_weekend,week,week_of_month,...,vac_d_hiv,vac_de_pri,vac_d_ete,vac_de_la_tou,top_fer,tmi,tma,tmo,con_bru_gaz_tot,con_bru_ele_rte
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01 00:00:00,2012-01-01,00:00:00,0.0,0.0,1.0,1,Sunday,1,52,1,...,0,0,0,0,1,,,,55353.0,59610.0
2012-01-01 00:30:00,2012-01-01,00:30:00,0.5,0.130526,0.991445,1,Sunday,1,52,1,...,0,0,0,0,1,,,,,58314.0
2012-01-01 01:00:00,2012-01-01,01:00:00,1.0,0.258819,0.965926,1,Sunday,1,52,1,...,0,0,0,0,1,,,,55444.0,56230.0
2012-01-01 01:30:00,2012-01-01,01:30:00,1.5,0.382683,0.92388,1,Sunday,1,52,1,...,0,0,0,0,1,,,,,56075.0
2012-01-01 02:00:00,2012-01-01,02:00:00,2.0,0.5,0.866025,1,Sunday,1,52,1,...,0,0,0,0,1,,,,55465.0,55531.0


In [4]:
# Preview the first five rows (head's default) of the first frame from create_dfs.
df_gaz.head()

Unnamed: 0_level_0,dat,heu,heu_float,heu_sin,heu_cos,day,day_name,is_weekend,week,week_of_month,...,vac_de_noe,vac_d_hiv,vac_de_pri,vac_d_ete,vac_de_la_tou,top_fer,tmi,tma,tmo,con_bru_gaz_tot
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01 00:00:00,2012-01-01,00:00:00,0.0,0.0,1.0,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,55353.0
2012-01-01 01:00:00,2012-01-01,01:00:00,1.0,0.258819,0.965926,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,55444.0
2012-01-01 02:00:00,2012-01-01,02:00:00,2.0,0.5,0.866025,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,55465.0
2012-01-01 03:00:00,2012-01-01,03:00:00,3.0,0.707107,0.707107,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,56048.0
2012-01-01 04:00:00,2012-01-01,04:00:00,4.0,0.866025,0.5,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,57082.0


In [5]:
# Preview the first five rows (head's default) of the second frame from create_dfs.
df_ele.head()

Unnamed: 0_level_0,dat,heu,heu_float,heu_sin,heu_cos,day,day_name,is_weekend,week,week_of_month,...,vac_de_noe,vac_d_hiv,vac_de_pri,vac_d_ete,vac_de_la_tou,top_fer,tmi,tma,tmo,con_bru_ele_rte
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01 00:00:00,2012-01-01,00:00:00,0.0,0.0,1.0,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,59610.0
2012-01-01 00:30:00,2012-01-01,00:30:00,0.5,0.130526,0.991445,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,58314.0
2012-01-01 01:00:00,2012-01-01,01:00:00,1.0,0.258819,0.965926,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,56230.0
2012-01-01 01:30:00,2012-01-01,01:30:00,1.5,0.382683,0.92388,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,56075.0
2012-01-01 02:00:00,2012-01-01,02:00:00,2.0,0.5,0.866025,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,55531.0


## Modélisation

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from xgboost import XGBRegressor

### XGBoost v1
Toutes les variables, pas d'hyperparamètres

In [7]:
# Full predictor list for the first run; order is kept exactly as before.
features = [
    # hour-of-day sin/cos pair
    'heu_sin', 'heu_cos',
    # weekend flag
    'is_weekend',
    # week-of-month sin/cos pair
    'week_of_month_sin', 'week_of_month_cos',
    # vac_* columns (presumably school-holiday flags — confirm in preparation)
    'vac_de_noe', 'vac_d_hiv', 'vac_de_pri', 'vac_d_ete', 'vac_de_la_tou',
    # top_fer (presumably a public-holiday flag — confirm in preparation)
    'top_fer',
    # temperature columns (presumably min / max / mean — confirm)
    'tmi', 'tma', 'tmo',
]

In [8]:
# Run test_model on df_gaz with the full feature list, then unpack its five
# return values: three metrics, the feature importances, and a verification frame.
results_v1 = test_model(df_gaz, features, '01/01/2025', 'xgboost')
rmse_v1, mae_v1, r2_v1, importances_v1, df_verif_v1 = results_v1

In [9]:
# Report the v1 metrics (two decimals) followed by the feature importances.
# One print call with sep='\n' emits the same lines as separate prints would.
print(
    "XGBoost v1 - toutes les variables, pas d'hyperparamètres",
    f"RMSE: {rmse_v1:.2f}",
    f"MAE: {mae_v1:.2f}",
    f"R²: {r2_v1:.2f}\n",  # embedded "\n" leaves a blank line before the importances
    'Importance des variables',
    importances_v1,
    sep='\n',
)

XGBoost v1 - toutes les variables, pas d'hyperparamètres
RMSE: 12373.53
MAE: 10342.58
R²: 0.70

Importance des variables
tmo                  0.690424
vac_de_pri           0.052645
tma                  0.041338
heu_cos              0.038635
is_weekend           0.035623
vac_de_noe           0.024469
top_fer              0.022924
vac_de_la_tou        0.021156
heu_sin              0.019399
vac_d_ete            0.016960
vac_d_hiv            0.014430
week_of_month_cos    0.009063
tmi                  0.007717
week_of_month_sin    0.005218
dtype: float32


### XGBoost v2
Sélection des variables, pas d'hyperparamètres

In [10]:
# Reduced predictor list for the second run: same columns as the first list
# minus 'tmi' and 'tma'. Order is kept exactly as before.
features_v2 = [
    # temperature column kept for this run
    'tmo',
    # hour-of-day sin/cos pair
    'heu_cos', 'heu_sin',
    # weekend flag
    'is_weekend',
    # week-of-month sin/cos pair
    'week_of_month_cos', 'week_of_month_sin',
    # top_fer (presumably a public-holiday flag — confirm in preparation)
    'top_fer',
    # vac_* columns (presumably school-holiday flags — confirm in preparation)
    'vac_de_noe', 'vac_d_hiv', 'vac_de_pri', 'vac_d_ete', 'vac_de_la_tou',
]

In [11]:
# Same test_model run as v1, but with the reduced feature list; unpack the
# five return values under v2 names.
results_v2 = test_model(df_gaz, features_v2, '01/01/2025', 'xgboost')
rmse_v2, mae_v2, r2_v2, importances_v2, df_verif_v2 = results_v2

In [12]:
# Report the v2 metrics (two decimals) followed by the feature importances.
# One print call with sep='\n' emits the same lines as separate prints would.
print(
    "XGBoost v2 - Sélection des variables, pas d'hyperparamètres",
    f"RMSE: {rmse_v2:.2f}",
    f"MAE: {mae_v2:.2f}",
    f"R²: {r2_v2:.2f}\n",  # embedded "\n" leaves a blank line before the importances
    'Importance des variables',
    importances_v2,
    sep='\n',
)

XGBoost v2 - Sélection des variables, pas d'hyperparamètres
RMSE: 12365.48
MAE: 10215.44
R²: 0.70

Importance des variables
tmo                  0.590915
heu_cos              0.081230
vac_de_pri           0.079747
is_weekend           0.056535
heu_sin              0.044088
top_fer              0.032419
vac_de_la_tou        0.028894
vac_de_noe           0.028723
vac_d_ete            0.017753
vac_d_hiv            0.014858
week_of_month_cos    0.013212
week_of_month_sin    0.011627
dtype: float32


### XGBoost v3
Recherche des meilleurs hyperparamètres

In [13]:
# Hyperparameter grid handed to search_params: four values for each of four
# parameters, i.e. 4**4 = 256 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300, 400],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 5, 7, 9],
    gamma=[0, 1, 5, 10],
)

In [14]:
# Run the hyperparameter search over param_grid with the reduced feature list.
# search_params hands back five values: three metrics, a verification frame,
# and the best parameter combination.
search_results = search_params(
    df_gaz, features_v2, '01/01/2025', 'xgboost', param_grid
)
rmse_v3, mae_v3, r2_v3, df_verif_v3, best_params = search_results

Fitting 3 folds for each of 256 candidates, totalling 768 fits


In [18]:
# Display the best hyperparameter combination returned by search_params.
best_params

{'gamma': 0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 300}

### XGBoost final

In [None]:
# Evaluate the final model with the hyperparameters found by the grid search.
# The five names unpacked here (three metrics, importances, verification frame)
# follow test_model's return order. search_params returns
# (rmse, mae, r2, df_verif, best_params) instead, so calling it here would bind
# the verification frame to `importances` and the params dict to `df_verif`.
# NOTE(review): assumes test_model accepts a `params` keyword — confirm against
# modelisation.py before running.
rmse, mae, r2, importances, df_verif = test_model(
    df_gaz, features_v2, '01/01/2025', 'xgboost', params=best_params
)

In [None]:
# Report the final model's metrics (two decimals) and its feature importances.
print('XGBoost final')
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}\n")
print('Importance des variables')
# Print the importances produced by the final run, not the stale v2 ones.
print(importances)