In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Root folder of the pipeline artefacts, made absolute from the current
# working directory so later cells do not depend on relative lookups.
OUTPUT_DIR = Path("../output").resolve()

# Artefact locations, all derived from OUTPUT_DIR.
TABLEAUX_SYNTHESE_PATH = OUTPUT_DIR.joinpath("tableaux_synthese")
# Parquet dataset loaded at the start of this notebook.
CONSOMMATIONS_ENRICHIES_PARQUET_PATH = OUTPUT_DIR.joinpath("consommations_enrichies")
CONSOMMATIONS_ENRICHIES_CSV_PATH = OUTPUT_DIR.joinpath("consommations_enrichies.csv")
# Destination of the exported correlation matrix.
MATRICE_CORR_PATH = OUTPUT_DIR.joinpath("matrice_correlation.csv")

In [2]:
# Load the enriched consumption dataset produced by the earlier pipeline steps.
df = pd.read_parquet(CONSOMMATIONS_ENRICHIES_PARQUET_PATH.as_posix())

# Normalise the date-like columns to datetime64.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = pd.to_datetime(df['date'])

# Each period bound must be parsed from ITSELF: the previous version assigned
# `df['date']` to both columns, which erased the real start/end dates.
# NOTE(review): these two columns are not visible in the truncated schema
# output, hence the presence guard — confirm they exist in the parquet file.
for borne_periode in ("date_debut", "date_fin"):
    if borne_periode in df.columns:
        df[borne_periode] = pd.to_datetime(df[borne_periode])

# Plain strings so the energy types become ordinary column labels when pivoted.
df['type_energie'] = df['type_energie'].astype("str")

df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7492584 entries, 0 to 7492583
Data columns (total 37 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   batiment_id                    object        
 1   timestamp                      datetime64[ns]
 2   consommation                   float64       
 3   unite                          object        
 4   hour                           int32         
 5   year                           int32         
 6   month                          int32         
 7   date                           datetime64[ns]
 8   nom                            object        
 9   type                           object        
 10  commune                        object        
 11  surface_m2                     int32         
 12  annee_construction             int32         
 13  classe_energetique             object        
 14  nb_occupants_moyen             int32         
 15  intensite_energ

Unnamed: 0,batiment_id,timestamp,consommation,unite,hour,year,month,date,nom,type,...,consommation_par_occupant,consommation_par_m2,consommation_journaliere,cout_journalier,cout_mensuel,cout_annuel,consommation_annuelle,IPE,consommation_moyenne_par_type,ecart_conso_moyenne_type
0,BAT0056,2024-01-13 08:00:00,5.23,m3,8,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.038741,0.00335,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.68662
1,BAT0001,2024-01-13 04:00:00,0.22,m3,4,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.000978,0.000114,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.028883
2,BAT0056,2024-01-13 17:00:00,3.42,m3,17,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.025333,0.002191,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.448994
3,BAT0001,2024-01-13 09:00:00,2.3,m3,9,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.010222,0.001194,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.301955
4,BAT0056,2024-01-13 19:00:00,3.68,m3,19,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.027259,0.002357,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.483128


#### Etape 3.2 : Analyse des correlations
- Calculer la matrice de correlation entre :
  - Consommations (electricite, gaz, eau)
  - Variables meteo (temperature, humidite, rayonnement, vent)
  - Caracteristiques batiments (surface, nb occupants, annee construction)

In [3]:
# Columns identifying one weather observation, and the join key of this cell.
cles_jointure = ["timestamp", "commune"]

# Mean consumption per (timestamp, commune), one column per energy type.
df_pivot = df.pivot_table(
    values="consommation",
    index=cles_jointure,
    columns="type_energie",
    aggfunc="mean"
).reset_index()

meteo_cols = ["temperature_c", "humidite_pct", "rayonnement_solaire_wm2", "vitesse_vent_kmh"]

# Deduplicate on the join key only. Deduplicating on the whole row keeps
# several rows for the same (timestamp, commune) as soon as one weather value
# differs, and the left merge below would then silently duplicate pivot rows.
df_meteo_unique = df[cles_jointure + meteo_cols].drop_duplicates(subset=cles_jointure)

# `validate` makes pandas raise instead of fanning out if either side ever
# stops being unique on the join key.
df_corr = df_pivot.merge(
    df_meteo_unique,
    on=cles_jointure,
    how='left',
    validate="one_to_one",
)

print(f"Dataset pour correlation: {len(df_corr):,} lignes")
df_corr.head()

Dataset pour correlation: 263,160 lignes


Unnamed: 0,timestamp,commune,eau,electricite,gaz,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh
0,2023-01-01,Bordeaux,3.3875,37.43125,24.268571,8.5,43.9,0.8,0.2
1,2023-01-01,Le Havre,9.607,64.305455,85.509091,0.8,45.8,31.2,21.5
2,2023-01-01,Lille,9.31875,77.525,95.36,-3.1,65.8,5.2,25.6
3,2023-01-01,Lyon,3.0625,38.314545,60.568333,10.7,72.1,36.1,14.8
4,2023-01-01,Marseille,3.573636,35.808182,44.269091,6.4,88.7,36.9,3.9


In [4]:
# Candidate variables for the correlation study; only those actually present
# in df_corr are kept, in the order listed here.
type_energie_cols = ["eau", "electricite", "gaz"]
meteo_cols = ["temperature_c", "humidite_pct", "rayonnement_solaire_wm2", "vitesse_vent_kmh"]

_colonnes_disponibles = df_corr.columns
available_type_energie = [
    energie for energie in type_energie_cols if energie in _colonnes_disponibles
]
available_meteo = [
    variable for variable in meteo_cols if variable in _colonnes_disponibles
]

# Energy columns first, weather columns second.
corr_cols = [*available_type_energie, *available_meteo]

# DataFrame.corr() with its default settings on the selected columns.
correlation_matrix = df_corr.loc[:, corr_cols].corr()

print("Matrice de correlation:")
correlation_matrix

Matrice de correlation:


Unnamed: 0,eau,electricite,gaz,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh
eau,1.0,0.822812,0.823187,0.012106,0.002963,0.387102,-0.001634
electricite,0.822812,1.0,0.977617,-0.283551,0.003378,0.412396,-0.00172
gaz,0.823187,0.977617,1.0,-0.283795,0.00326,0.411438,-0.001827
temperature_c,0.012106,-0.283551,-0.283795,1.0,5.8e-05,0.003787,-0.000441
humidite_pct,0.002963,0.003378,0.00326,5.8e-05,1.0,-0.000404,-0.000189
rayonnement_solaire_wm2,0.387102,0.412396,0.411438,0.003787,-0.000404,1.0,0.000216
vitesse_vent_kmh,-0.001634,-0.00172,-0.001827,-0.000441,-0.000189,0.000216,1.0


- Identifier les correlations significatives (>0.5 ou <-0.5)

In [5]:
def trouver_correleations_fortes(matrice_corr: pd.DataFrame, seuil: float = 0.5) -> pd.DataFrame:
    """List the variable pairs whose correlation magnitude reaches a threshold.

    Only the cells strictly above the diagonal are scanned, so each pair is
    reported once and self-correlations are ignored.

    Args:
        matrice_corr: Square correlation matrix; cells are read by position.
        seuil: Minimum absolute correlation for a pair to be kept (inclusive).

    Returns:
        A DataFrame with the columns ``Variable_1``, ``Variable_2`` and
        ``Corrélation``, sorted by decreasing absolute correlation. When no
        pair qualifies, an empty DataFrame with those same columns is returned.
    """
    colonnes_resultat = ["Variable_1", "Variable_2", "Corrélation"]
    cols = matrice_corr.columns

    paires = []
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            r = matrice_corr.iloc[i, j]
            # A NaN coefficient fails this comparison and is therefore skipped.
            if abs(r) >= seuil:
                paires.append(
                    {
                        "Variable_1": cols[i],
                        "Variable_2": cols[j],
                        "Corrélation": r,
                    }
                )

    if not paires:
        # Building the frame from an empty list yields no columns at all, so
        # sorting on "Corrélation" used to raise KeyError in this case.
        return pd.DataFrame(columns=colonnes_resultat)

    resultats = pd.DataFrame(paires, columns=colonnes_resultat)
    return resultats.sort_values(
        "Corrélation",
        ascending=False,
        key=abs,
    )


# Pairs whose absolute correlation is at least 0.5.
correlations_fortes = trouver_correleations_fortes(
    matrice_corr=correlation_matrix,
    seuil=0.5,
)

# Plain-text rendering without the index column.
_tableau_texte = correlations_fortes.to_string(index=False)
print("  Corrélation fortes :")
print(_tableau_texte)

  Corrélation fortes :
 Variable_1  Variable_2  Corrélation
electricite         gaz     0.977617
        eau         gaz     0.823187
        eau electricite     0.822812


- Analyser l'impact de la temperature sur la consommation de chauffage

In [6]:
# Coefficients between temperature and the two energy types, read with a
# single (row, column) lookup.
temp_gaz = correlation_matrix.loc["gaz", "temperature_c"]
temp_electricite = correlation_matrix.loc["electricite", "temperature_c"]

def sens_correlation(corr: float):
    """Print the direction (sign) of a correlation coefficient.

    Args:
        corr: Correlation coefficient to describe.

    Prints one line: negative, positive, null (exactly zero) or undefined
    (NaN). Returns nothing.
    """
    if pd.isna(corr):
        # NaN fails every ordering comparison, so it used to be reported as
        # positive.
        print("  Corrélation indéfinie")
    elif corr < 0:
        print("  Corrélation négative")
    elif corr > 0:
        # The label was previously misspelt "positve".
        print("  Corrélation positive")
    else:
        # A coefficient of exactly zero used to be reported as negative.
        print("  Corrélation nulle")

def force_correslation(corr: float):
    """Print the strength of a correlation coefficient, whatever its sign.

    Args:
        corr: Correlation coefficient to describe.

    Prints "forte" when the magnitude is above 0.7, "moyenne" when it is above
    0.5, and "faible" otherwise. Returns nothing.
    """
    # Strength depends on the magnitude only. Comparing the signed value
    # classified every negative correlation, even -0.9, as weak.
    intensite = abs(corr)
    if intensite > 0.7:
        print("  Corrélation forte")
    elif intensite > 0.5:
        print("  Corrélation moyenne")
    else:
        print("  Corrélation faible")

# One (headline, coefficient) entry per energy type; each is reported with its
# direction and strength, with a blank line between consecutive reports.
_rapports_temperature = (
    (f"  Corrélation entre température et gaz : {temp_gaz}", temp_gaz),
    (f"  Corrélation entre température et électricité : {temp_electricite}", temp_electricite),
)

for _rang, (_titre, _coefficient) in enumerate(_rapports_temperature):
    if _rang > 0:
        print()
    print(_titre)
    sens_correlation(_coefficient)
    force_correslation(_coefficient)

  Corrélation entre température et gaz : -0.28379507074360993
  Corrélation négative
  Corrélation faible

  Corrélation entre température et électricité : -0.28355118912847466
  Corrélation négative
  Corrélation faible


Les corrélations entre température et gaz, et entre température et électricité, sont négatives : lorsque la température augmente, la consommation énergétique diminue, et vice versa. Ces corrélations restent cependant faibles (|r| ≈ 0,28 < 0,5).

- Etudier l'effet du rayonnement solaire sur la consommation electrique

In [7]:
# Coefficient between solar radiation and electricity, read with a single
# (row, column) lookup.
rayonnement_electricite = correlation_matrix.loc["electricite", "rayonnement_solaire_wm2"]

_titre_rayonnement = f"  Corrélation entre rayonnement solaire et électricité : {rayonnement_electricite}"
print(_titre_rayonnement)

# Direction first, then strength.
sens_correlation(rayonnement_electricite)
force_correslation(rayonnement_electricite)

  Corrélation entre rayonnement solaire et électricité : 0.4123955828913286
  Corrélation positve
  Corrélation faible


La corrélation entre rayonnement solaire et électricité est positive : lorsque le rayonnement solaire augmente, la consommation électrique augmente, et vice versa. Cette corrélation reste cependant faible (0,41 < 0,5).

In [8]:
# Ensure the output folder exists (no error when it already does), then export
# the correlation matrix as CSV with its row labels.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

_chemin_export = MATRICE_CORR_PATH.as_posix()
correlation_matrix.to_csv(_chemin_export)

**Livrables** :
- Notebook `07_analyse_correlations.ipynb`
- Matrice de correlation exportee `output/matrice_correlation.csv`
- Synthese des insights (format markdown)