# Projet de MLOps
## Prédiction de la consommation anuelle d'électricité 

Auteurs: Lilou Masson, Paul Hamann Cossart

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from preparation import data_cleaning, create_dfs

## Préparation des données

In [2]:
df = pd.read_csv(
    'DATA/consommation-quotidienne-brute.csv',
    sep = ';'
)
df = data_cleaning(df)
df_gaz, df_ele = create_dfs(df)

In [3]:
df.head(5)

Unnamed: 0_level_0,dat,heu,heu_float,heu_sin,heu_cos,day,day_name,is_weekend,week,week_of_month,...,vac_d_hiv,vac_de_pri,vac_d_ete,vac_de_la_tou,top_fer,tmi,tma,tmo,con_bru_gaz_tot,con_bru_ele_rte
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01 00:00:00,2012-01-01,00:00:00,0.0,0.0,1.0,1,Sunday,1,52,1,...,0,0,0,0,1,,,,55353.0,59610.0
2012-01-01 00:30:00,2012-01-01,00:30:00,0.5,0.130526,0.991445,1,Sunday,1,52,1,...,0,0,0,0,1,,,,,58314.0
2012-01-01 01:00:00,2012-01-01,01:00:00,1.0,0.258819,0.965926,1,Sunday,1,52,1,...,0,0,0,0,1,,,,55444.0,56230.0
2012-01-01 01:30:00,2012-01-01,01:30:00,1.5,0.382683,0.92388,1,Sunday,1,52,1,...,0,0,0,0,1,,,,,56075.0
2012-01-01 02:00:00,2012-01-01,02:00:00,2.0,0.5,0.866025,1,Sunday,1,52,1,...,0,0,0,0,1,,,,55465.0,55531.0


In [5]:
df_gaz.head(5)

Unnamed: 0_level_0,dat,heu,heu_float,heu_sin,heu_cos,day,day_name,is_weekend,week,week_of_month,...,vac_de_noe,vac_d_hiv,vac_de_pri,vac_d_ete,vac_de_la_tou,top_fer,tmi,tma,tmo,con_bru_gaz_tot
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01 00:00:00,2012-01-01,00:00:00,0.0,0.0,1.0,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,55353.0
2012-01-01 01:00:00,2012-01-01,01:00:00,1.0,0.258819,0.965926,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,55444.0
2012-01-01 02:00:00,2012-01-01,02:00:00,2.0,0.5,0.866025,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,55465.0
2012-01-01 03:00:00,2012-01-01,03:00:00,3.0,0.707107,0.707107,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,56048.0
2012-01-01 04:00:00,2012-01-01,04:00:00,4.0,0.866025,0.5,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,57082.0


In [6]:
df_ele.head(5)

Unnamed: 0_level_0,dat,heu,heu_float,heu_sin,heu_cos,day,day_name,is_weekend,week,week_of_month,...,vac_de_noe,vac_d_hiv,vac_de_pri,vac_d_ete,vac_de_la_tou,top_fer,tmi,tma,tmo,con_bru_ele_rte
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-01 00:00:00,2012-01-01,00:00:00,0.0,0.0,1.0,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,59610.0
2012-01-01 00:30:00,2012-01-01,00:30:00,0.5,0.130526,0.991445,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,58314.0
2012-01-01 01:00:00,2012-01-01,01:00:00,1.0,0.258819,0.965926,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,56230.0
2012-01-01 01:30:00,2012-01-01,01:30:00,1.5,0.382683,0.92388,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,56075.0
2012-01-01 02:00:00,2012-01-01,02:00:00,2.0,0.5,0.866025,1,Sunday,1,52,1,...,1,0,0,0,0,1,,,,55531.0


## Modélisation

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from xgboost import XGBRegressor

In [12]:
features = [
    'heu_sin',
    'heu_cos',
    'is_weekend',
    'week_of_month_sin',
    'week_of_month_cos',
    'vac_de_noe',
    'vac_d_hiv',
    'vac_de_pri',
    'vac_d_ete',
    'vac_de_la_tou',
    'top_fer',
    'tmi',
    'tma',
    'tmo',
]
df_gaz = df_gaz.loc[df['year'] >= 2016]
debut_test = '01/01/2024'
X, y = df_gaz[features], df_gaz[['con_bru_gaz_tot']]
X_train = X.loc[X.index.normalize() < pd.to_datetime(debut_test, format='%d/%m/%Y')]
X_test = X.loc[X.index.normalize() >= pd.to_datetime(debut_test, format='%d/%m/%Y')]
y_train = y.loc[y.index.normalize() < pd.to_datetime(debut_test, format='%d/%m/%Y')]
y_test = y.loc[y.index.normalize() >= pd.to_datetime(debut_test, format='%d/%m/%Y')]

In [16]:
model = XGBRegressor(
    random_state=42,
)

model.fit(X_train, y_train)


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [18]:
y_pred = model.predict(X_test)

In [23]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")

RMSE: 12587.837939852896
MAE: 10671.3134765625
R²: 0.6916842460632324


In [21]:
importances = pd.Series(model.feature_importances_, index=features)

# Trier par ordre décroissant
importances = importances.sort_values(ascending=False)

# Afficher
print(importances)

tmo                  0.642869
tma                  0.085546
vac_de_pri           0.045278
is_weekend           0.037012
top_fer              0.035554
heu_cos              0.033246
vac_de_la_tou        0.025278
vac_de_noe           0.021151
vac_d_hiv            0.020157
heu_sin              0.019423
vac_d_ete            0.013136
week_of_month_cos    0.009494
tmi                  0.006798
week_of_month_sin    0.005056
dtype: float32


In [24]:
df_verif = y_pred.copy()
df_verif['y_true'] = y_test.copy()
df_verif

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [27]:
df_verif = y_test.copy()
df_verif['y_pred'] = y_pred

In [28]:
df_verif

Unnamed: 0_level_0,con_bru_gaz_tot,y_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-01 00:00:00,43318.0,61524.410156
2024-01-01 01:00:00,43107.0,61472.261719
2024-01-01 02:00:00,43134.0,61472.261719
2024-01-01 03:00:00,44077.0,60821.679688
2024-01-01 04:00:00,46475.0,63452.632812
...,...,...
2025-11-30 19:00:00,68204.0,68833.093750
2025-11-30 20:00:00,66724.0,69575.500000
2025-11-30 21:00:00,65578.0,67469.460938
2025-11-30 22:00:00,61165.0,64051.824219
