# Train models
Fichier pour l'entrainement et le test des modèles.

In [20]:
# Import libraries
import os
import pandas as pd
import joblib
import plotly.express as px
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle


## Chargement des données

In [2]:
df_spot = pd.read_csv('data/France.csv')
df_spot["Datetime (Local)"] = pd.to_datetime(df_spot["Datetime (Local)"])
display(df_spot)

Unnamed: 0,Country,ISO3 Code,Datetime (UTC),Datetime (Local),Price (EUR/MWhe)
0,France,FRA,2015-01-01 00:00:00,2015-01-01 01:00:00,36.56
1,France,FRA,2015-01-01 01:00:00,2015-01-01 02:00:00,36.56
2,France,FRA,2015-01-01 02:00:00,2015-01-01 03:00:00,36.56
3,France,FRA,2015-01-01 03:00:00,2015-01-01 04:00:00,36.56
4,France,FRA,2015-01-01 04:00:00,2015-01-01 05:00:00,36.56
...,...,...,...,...,...
94576,France,FRA,2025-10-17 04:00:00,2025-10-17 06:00:00,83.82
94577,France,FRA,2025-10-17 05:00:00,2025-10-17 07:00:00,96.45
94578,France,FRA,2025-10-17 06:00:00,2025-10-17 08:00:00,107.99
94579,France,FRA,2025-10-17 07:00:00,2025-10-17 09:00:00,102.38


In [3]:
df_eco = pd.read_csv('data/eCO2mix_RTE_En-cours-Consolide.xls', sep='\t', encoding='latin1',index_col=False)
df_eco['Datetime'] = pd.to_datetime(df_eco['Date'] + ' ' + df_eco['Heures'])
df_eco = df_eco.dropna()
display(df_eco)

Unnamed: 0,Périmètre,Nature,Date,Heures,Consommation,Prévision J-1,Prévision J,Fioul,Charbon,Gaz,...,Hydraulique - Lacs,Hydraulique - STEP turbinage,Bioénergies - Déchets,Bioénergies - Biomasse,Bioénergies - Biogaz,Stockage batterie,Déstockage batterie,Eolien terrestre,Eolien offshore,Datetime
0,France,Données consolidées,2024-01-01,00:00,55239.0,55000.0,54200.0,96.0,18.0,1975.0,...,1553.0,117.0,510.0,354.0,347.0,0.0,0.0,14976.0,581.0,2024-01-01 00:00:00
2,France,Données consolidées,2024-01-01,00:30,54560.0,53600.0,52600.0,96.0,17.0,1919.0,...,1569.0,6.0,510.0,355.0,347.0,0.0,0.0,14880.0,579.0,2024-01-01 00:30:00
4,France,Données consolidées,2024-01-01,01:00,52689.0,52800.0,51300.0,95.0,17.0,2000.0,...,1179.0,6.0,513.0,359.0,344.0,0.0,0.0,14773.0,582.0,2024-01-01 01:00:00
6,France,Données consolidées,2024-01-01,01:30,52917.0,53000.0,51700.0,96.0,17.0,1993.0,...,1136.0,5.0,510.0,359.0,344.0,0.0,0.0,14607.0,581.0,2024-01-01 01:30:00
8,France,Données consolidées,2024-01-01,02:00,52770.0,52300.0,52000.0,96.0,17.0,2001.0,...,1071.0,5.0,515.0,362.0,344.0,0.0,0.0,14409.0,579.0,2024-01-01 02:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35126,France,Données consolidées,2024-12-31,21:30,63574.0,61600.0,62000.0,89.0,16.0,1850.0,...,559.0,125.0,542.0,372.0,342.0,0.0,0.0,8770.0,1184.0,2024-12-31 21:30:00
35128,France,Données consolidées,2024-12-31,22:00,62706.0,60800.0,61100.0,89.0,17.0,1853.0,...,359.0,78.0,545.0,374.0,341.0,0.0,0.0,8983.0,1180.0,2024-12-31 22:00:00
35130,France,Données consolidées,2024-12-31,22:30,63882.0,61700.0,61900.0,90.0,17.0,2029.0,...,361.0,123.0,549.0,370.0,340.0,0.0,0.0,9000.0,1250.0,2024-12-31 22:30:00
35132,France,Données consolidées,2024-12-31,23:00,65321.0,64200.0,64300.0,90.0,17.0,2133.0,...,762.0,155.0,549.0,369.0,339.0,0.0,0.0,9160.0,1301.0,2024-12-31 23:00:00


## Préparation des données pour l'entrainement

In [4]:
df_model = pd.merge(df_eco, df_spot, left_on='Datetime', right_on = "Datetime (Local)", how='inner')
df_model = df_model.sort_values(by='Datetime')

features = [
       'Consommation',
       'Prévision J-1', 
       'Prévision J', 
       'Fioul', 
       'Charbon', 
       'Gaz', 
       'Nucléaire',
       'Eolien', 
       'Solaire', 
       'Hydraulique', 
       'Pompage', 
       'Bioénergies',
       'Ech. physiques', 
       'Taux de Co2', 
       'Ech. comm. Angleterre',
       'Ech. comm. Espagne', 
       'Ech. comm. Italie', 
       'Ech. comm. Suisse',
       'Ech. comm. Allemagne-Belgique', 
       'Fioul - TAC', 'Fioul - Cogén.',
       'Fioul - Autres', 'Gaz - TAC', 
       'Gaz - Cogén.', 'Gaz - CCG',
       'Gaz - Autres', 'Hydraulique - Fil de l?eau + éclusée',
       'Hydraulique - Lacs', 
       'Hydraulique - STEP turbinage',
       'Bioénergies - Déchets', 
       'Bioénergies - Biomasse',
       'Bioénergies - Biogaz', 
       ' Stockage batterie', 
       'Déstockage batterie',
       'Eolien terrestre', 
       'Eolien offshore']

In [5]:
if False : # Set True pour la sélection des features basée sur la corrélation avec la variable à prédire
    df_corr = df_model.select_dtypes(include=['number'])
    corr = df_corr.corr()
    corr_target = corr['Price (EUR/MWhe)'].drop('Price (EUR/MWhe)').sort_values(ascending=False)

    plt.figure(figsize=(6, len(corr_target) * 0.4 + 1))
    sns.heatmap(corr_target.to_frame(), annot=True, cmap='coolwarm', center=0)
    plt.title("Corrélation de toutes les variables avec la cible")
    plt.show()

    selected_features = corr_target[abs(corr_target) > 0.1]
    df_model = df_model[list(selected_features.index) + ["Price (EUR/MWhe)","Datetime"]]
    features = selected_features.index
    print("Variables avec |corr| > 0.1 :")
    print(selected_features)


In [6]:
# Séparation chronologique : 90% train, 10% test
split_idx = int(0.80 * len(df_model))
df_train = df_model.iloc[:split_idx]
df_test = df_model.iloc[split_idx:]

# Préparation des features et target
X_train = df_train[features]
y_train = df_train['Price (EUR/MWhe)']
X_test = df_test[features]
y_test = df_test['Price (EUR/MWhe)']

# StandardScaler 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) # X_train # scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) #X_test # scaler.transform(X_test)

X_train_scaled = pd.DataFrame(
    X_train_scaled,              
    columns=X_train.columns    
)

X_test_scaled = pd.DataFrame(
    X_test_scaled,              
    columns=X_test.columns    
)

## Train models

In [7]:
# Régression linéaire
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)

In [8]:
# Régression LASSO
model_l = make_pipeline(StandardScaler(), LassoCV(cv=5))
model_l.fit(X_train_scaled, y_train)

coef = pd.Series(model_l.named_steps['lassocv'].coef_, index=X_train_scaled.columns)
print(coef.sort_values(ascending=False))

Taux de Co2                             43.394798
Nucléaire                               12.178009
Hydraulique - Lacs                      10.722079
Prévision J-1                            9.042433
Pompage                                  5.131218
Ech. comm. Suisse                        5.093546
Hydraulique - STEP turbinage             2.658993
Gaz - Cogén.                             1.921412
Ech. comm. Espagne                       1.538396
Bioénergies - Biogaz                     0.331229
Déstockage batterie                      0.000000
 Stockage batterie                       0.000000
Prévision J                             -0.000000
Gaz                                     -0.000000
Hydraulique                              0.000000
Eolien terrestre                        -0.171918
Consommation                            -0.380552
Gaz - TAC                               -0.382757
Bioénergies - Déchets                   -0.635517
Fioul - Autres                          -0.671182


  model = cd_fast.enet_coordinate_descent(


In [9]:
# Random Forest
# important_features = ['Nucléaire', 'Gaz - CCG', 'Pompage', 'Hydraulique - Lacs',
#        'Ech. comm. Suisse', 'Hydraulique - Fil de l?eau + éclusée',
#        'Hydraulique - STEP turbinage', 'Gaz - Cogén.', 'Fioul - TAC',
#        'Ech. comm. Angleterre', 'Gaz - TAC', 'Ech. comm. Allemagne-Belgique',
#        'Fioul - Cogén.', 'Gaz - Autres', 'Gaz']
important_features = X_train_scaled.columns
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train_scaled[important_features], y_train)

In [10]:
# # Feature importance
# importances = pd.Series(model_rf.feature_importances_, index=X_train_scaled.columns)
# # print(importances.sort_values(ascending=False))
# importances.nlargest(15).plot(kind='barh')
# important_features = importances.nlargest(15).index

In [None]:
# # Save models
# with open('models/linear_regression.pkl', 'wb') as f:
#     pickle.dump(model_lr, f)

# with open('models/lasso_regression.pkl', 'wb') as f:
#     pickle.dump(model_l, f)

# with open('models/random_forest.pkl', 'wb') as f:
#     pickle.dump(model_rf, f)

# joblib.dump(scaler, 'models/scaler.joblib')


['models/scaler.joblib']

## Evaluate models

In [12]:
def plot_prediction_reality(df_plot):
    df_plot = df_plot.sort_values(by='Datetime')

    fig = px.line(df_plot,
                x='Datetime',
                y=['Prix prédit', 'Prix réel'],
                title='Comparaison des prix prédits et réels au cours du temps',
                labels={'value': 'Prix', 'variable': 'Type'},
                color_discrete_map={'Prix prédit': 'red', 'Prix réel': 'blue'})

    fig.update_layout(
        xaxis_title='Date et heure',
        yaxis_title='Prix',
        legend_title='Légende',
        hovermode='x unified'
    )
    fig.show()
    
def eval_model(model,X_train,X_test):
    
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    rmse = mean_squared_error(y_test, y_pred_test, squared=False)
    r2 = r2_score(y_test, y_pred_test)
    
    # print(f"RMSE Linear Regression (test): {rmse:.3f}")
    print(f"R² Linear Regression (test): {r2:.3f}")
    
    rmse = mean_squared_error(y_train, y_pred_train, squared=False)
    r2 = r2_score(y_train, y_pred_train)
    
    # print(f"RMSE Linear Regression (train): {rmse:.3f}")
    print(f"R² Linear Regression (train): {r2:.3f}")
    
    df_plot_train = pd.DataFrame({
        "Datetime":df_train["Datetime"],
        "Prix prédit":y_pred_train,
        "Prix réel":y_train
    })

    df_plot_test = pd.DataFrame({
        "Datetime":df_test["Datetime"],
        "Prix prédit":y_pred_test,
        "Prix réel":y_test
    })

    # plot_prediction_reality(df_plot_train)
    plot_prediction_reality(df_plot_test)


In [13]:
# Régression linéaire
eval_model(model_lr,X_train_scaled,X_test_scaled)


R² Linear Regression (test): 0.296
R² Linear Regression (train): 0.740


In [14]:
# Modèle Lasso
eval_model(model_l,X_train_scaled,X_test_scaled)

R² Linear Regression (test): 0.331
R² Linear Regression (train): 0.737


In [15]:
# Random Forests
eval_model(model_rf,X_train_scaled[important_features],X_test_scaled[important_features])

R² Linear Regression (test): 0.212
R² Linear Regression (train): 0.983


## Inférence sur les données chargées de l'API

In [16]:
import requests
import pandas as pd

# URL de base de l'API OpenDataSoft
url = "https://odre.opendatasoft.com/api/records/1.0/search/"

# Paramètres pour récupérer les 100 dernières données triées par date
params = {
    "dataset": "eco2mix-national-tr",  
    "rows": 10000,                       
    "sort": "-date",                    
    "facet": "nature"                  
}

# Appel à l'API
response = requests.get(url, params=params)

# Vérification du succès de la requête
if response.status_code == 200:
    data_json = response.json()
    print("Nombre d'enregistrements récupérés :", len(data_json.get("records", [])))
else:
    print("Erreur lors de l'appel à l'API :", response.status_code)

# Conversion des données en DataFrame
records = data_json.get("records", [])
df = pd.json_normalize(records)  # aplatit la structure JSON

# Affichage des premières lignes
df = pd.DataFrame(df)
df = df.dropna()
display(df.head())


Nombre d'enregistrements récupérés : 10000


Unnamed: 0,datasetid,recordid,record_timestamp,fields.nature,fields.heure,fields.date_heure,fields.date,fields.perimetre,fields.prevision_j1,fields.prevision_j,...,fields.ech_comm_suisse,fields.eolien_offshore,fields.ech_comm_espagne,fields.taux_co2,fields.eolien,fields.ech_physiques,fields.bioenergies_dechets,fields.consommation,fields.gaz,fields.fioul_autres
192,eco2mix-national-tr,55abc3e43aacbdd28e1456fc4bb7647fd9f0da11,2025-10-29T20:15:00.396Z,Données temps réel,02:30,2025-10-29T01:30:00+00:00,2025-10-29,France,43000.0,43100.0,...,-2385.0,712,-2100.0,14.0,5789.0,-13669.0,410.0,42673.0,618.0,0.0
193,eco2mix-national-tr,69bbcce02dc2e3729aa2053e1abe7ae56bbc5f83,2025-10-29T20:15:00.396Z,Données temps réel,03:30,2025-10-29T02:30:00+00:00,2025-10-29,France,40600.0,41100.0,...,-2385.0,739,-2100.0,12.0,5667.0,-13073.0,410.0,40717.0,394.0,0.0
194,eco2mix-national-tr,25ccfac303fbceedba0cf3d62b29ac7066b33e49,2025-10-29T20:15:00.396Z,Données temps réel,06:15,2025-10-29T05:15:00+00:00,2025-10-29,France,45300.0,45850.0,...,-2385.0,554,-2100.0,11.0,5779.0,-13950.0,400.0,45756.0,339.0,0.0
195,eco2mix-national-tr,530dcb77aa5fcb48cbb70a590a3810807ca57eef,2025-10-29T20:15:00.396Z,Données temps réel,10:15,2025-10-29T09:15:00+00:00,2025-10-29,France,53850.0,53600.0,...,-2385.0,164,-2100.0,13.0,5531.0,-10740.0,400.0,53688.0,756.0,0.0
196,eco2mix-national-tr,7ece7d4beca2af04200fc22b651ede45b258abea,2025-10-29T20:15:00.396Z,Données temps réel,10:30,2025-10-29T09:30:00+00:00,2025-10-29,France,54000.0,53600.0,...,-2385.0,130,-2069.0,14.0,5380.0,-11119.0,402.0,53624.0,887.0,0.0


In [17]:
# Transformation du df récupéré en input du modèle
df = df.rename(columns={
    'fields.heure': 'Heures',
    'fields.date': 'Date',
    'fields.prevision_j1': 'Prévision J-1',
    'fields.prevision_j': 'Prévision J',
    'fields.fioul_tac': 'Fioul - TAC',
    'fields.nucleaire': 'Nucléaire',
    'fields.ech_comm_angleterre': 'Ech. comm. Angleterre',
    'fields.gaz_tac': 'Gaz - TAC',
    'fields.bioenergies_biogaz': 'Bioénergies - Biogaz',
    'fields.bioenergies_biomasse': 'Bioénergies - Biomasse',
    'fields.destockage_batterie': 'Déstockage batterie',
    'fields.charbon': 'Charbon',
    'fields.hydraulique_step_turbinage': 'Hydraulique - STEP turbinage',
    'fields.stockage_batterie': ' Stockage batterie',
    'fields.pompage': 'Pompage',
    'fields.gaz_ccg': 'Gaz - CCG',
    'fields.hydraulique_lacs': 'Hydraulique - Lacs',
    'fields.eolien_terrestre': 'Eolien terrestre',
    'fields.ech_comm_allemagne_belgique': 'Ech. comm. Allemagne-Belgique',
    'fields.fioul': 'Fioul',
    'fields.solaire': 'Solaire',
    'fields.ech_comm_italie': 'Ech. comm. Italie',
    'fields.bioenergies': 'Bioénergies',
    'fields.gaz_autres': 'Gaz - Autres',
    'fields.fioul_cogen': 'Fioul - Cogén.',
    'fields.gaz_cogen': 'Gaz - Cogén.',
    'fields.hydraulique_fil_eau_eclusee': 'Hydraulique - Fil de l?eau + éclusée',
    'fields.hydraulique': 'Hydraulique',
    'fields.ech_comm_suisse': 'Ech. comm. Suisse',
    'fields.eolien_offshore': 'Eolien offshore',
    'fields.ech_comm_espagne': 'Ech. comm. Espagne',
    'fields.taux_co2': 'Taux de Co2',
    'fields.eolien': 'Eolien',
    'fields.ech_physiques': 'Ech. physiques',
    'fields.bioenergies_dechets': 'Bioénergies - Déchets',
    'fields.consommation': 'Consommation',
    'fields.gaz': 'Gaz',
    'fields.fioul_autres': 'Fioul - Autres'
})

In [18]:
# Jointure des données Eco2Mix avec Prix
df_eval = df[features]
df_eval["Datetime"] = pd.to_datetime(df['Date'] + ' ' + df['Heures'])
print(len(df_eval))
df_eval = pd.merge(df_eval, df_spot, left_on='Datetime', right_on = "Datetime (Local)", how='inner')

9792




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
# Préparation des données d'entrée pour le modèle
X_validation = df_eval[features]
X_validation_scaled = scaler.transform(X_validation) # X_validation # scaler.fit_transform(X_validation)
y_true = df_eval["Price (EUR/MWhe)"]

# Chargement des modèles
with open(f"models/linear_regression.pkl", "rb") as file:
    model_lr = pickle.load(file)
    
with open(f"models/linear_regression.pkl", "rb") as file:
    model_lr = pickle.load(file)

with open(f"models/random_forest.pkl", "rb") as file:
    model_rf = pickle.load(file)

with open(f"models/scaler.joblib", "rb") as file:
    scaler = joblib.load(file)


In [23]:
y_pred_lr = model_lr.predict(X_validation_scaled)
y_pred_rf = model_rf.predict(X_validation_scaled)



X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names



In [24]:
print("RMSE Linear Regression:", mean_squared_error(y_true, y_pred_lr, squared=False))
print("RMSE Random Forest:", mean_squared_error(y_true, y_pred_rf, squared=False))


RMSE Linear Regression: 37.09423520042283
RMSE Random Forest: 23.80990622715694


In [25]:
df_plot_lr = pd.DataFrame({
    "Datetime":df_eval["Datetime"],
    "Prix prédit":y_pred_lr,
    "Prix réel":y_true
})

df_plot_rf = pd.DataFrame({
    "Datetime":df_eval["Datetime"],
    "Prix prédit":y_pred_rf,
    "Prix réel":y_true
})

# Modèle régression linéaire
plot_prediction_reality(df_plot_lr)

# Modèle random forest
plot_prediction_reality(df_plot_rf)
