In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Root folder for everything this notebook reads and writes. The relative
# path is resolved against the current working directory at execution time.
OUTPUT_DIR = Path("..", "output").resolve()

# Locations derived from OUTPUT_DIR.
TABLEAUX_SYNTHESE_PATH = OUTPUT_DIR.joinpath("tableaux_synthese")
CONSOMMATIONS_ENRICHIES_PARQUET_PATH = OUTPUT_DIR.joinpath("consommations_enrichies")
CONSOMMATIONS_ENRICHIES_CSV_PATH = OUTPUT_DIR.joinpath("consommations_enrichies.csv")

In [2]:
# Load the enriched consumption dataset produced upstream.
df = pd.read_parquet(CONSOMMATIONS_ENRICHIES_PARQUET_PATH.as_posix())

# Parse each date-like column from its OWN values. The previous version
# assigned pd.to_datetime(df['date']) to 'date_debut' and 'date_fin',
# which silently replaced both columns with the reading date.
for colonne_date in ("timestamp", "date"):
    df[colonne_date] = pd.to_datetime(df[colonne_date])

# NOTE(review): 'date_debut' / 'date_fin' are presumably present in the
# parquet file (the schema printout is truncated) - confirm. They are only
# converted when they exist so that a missing column does not abort the load.
for colonne_date in ("date_debut", "date_fin"):
    if colonne_date in df.columns:
        df[colonne_date] = pd.to_datetime(df[colonne_date])

# Plain Python strings for the energy type, which is later used as a
# grouping key and in isin() filters.
df['type_energie'] = df['type_energie'].astype("str")

df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7492584 entries, 0 to 7492583
Data columns (total 37 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   batiment_id                    object        
 1   timestamp                      datetime64[ns]
 2   consommation                   float64       
 3   unite                          object        
 4   hour                           int32         
 5   year                           int32         
 6   month                          int32         
 7   date                           datetime64[ns]
 8   nom                            object        
 9   type                           object        
 10  commune                        object        
 11  surface_m2                     int32         
 12  annee_construction             int32         
 13  classe_energetique             object        
 14  nb_occupants_moyen             int32         
 15  intensite_energ

Unnamed: 0,batiment_id,timestamp,consommation,unite,hour,year,month,date,nom,type,...,consommation_par_occupant,consommation_par_m2,consommation_journaliere,cout_journalier,cout_mensuel,cout_annuel,consommation_annuelle,IPE,consommation_moyenne_par_type,ecart_conso_moyenne_type
0,BAT0056,2024-01-13 08:00:00,5.23,m3,8,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.038741,0.00335,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.68662
1,BAT0001,2024-01-13 04:00:00,0.22,m3,4,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.000978,0.000114,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.028883
2,BAT0056,2024-01-13 17:00:00,3.42,m3,17,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.025333,0.002191,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.448994
3,BAT0001,2024-01-13 09:00:00,2.3,m3,9,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.010222,0.001194,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.301955
4,BAT0056,2024-01-13 19:00:00,3.68,m3,19,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.027259,0.002357,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.483128
5,BAT0001,2024-01-13 10:00:00,3.81,m3,10,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.016933,0.001978,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.500195
6,BAT0057,2024-01-13 07:00:00,0.82,m3,7,2024,1,2024-01-13,Mairie Nantes 57,mairie,...,0.023429,0.000728,0.961304,3.604891,9985.05,115972.275,30925.94,27.465311,2.727374,0.300656
7,BAT0001,2024-01-13 13:00:00,2.68,m3,13,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.011911,0.001391,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.351843
8,BAT0057,2024-01-13 11:00:00,1.36,m3,11,2024,1,2024-01-13,Mairie Nantes 57,mairie,...,0.038857,0.001208,0.961304,3.604891,9985.05,115972.275,30925.94,27.465311,2.727374,0.498648
9,BAT0002,2024-01-13 09:00:00,1.49,m3,9,2024,1,2024-01-13,Ecole Paris 2,ecole,...,0.003706,0.001289,0.859091,3.221591,9279.5625,104863.2375,27963.53,24.189905,7.617028,0.195614


#### Étape 3.3 : Détection d'anomalies
- Identifier les pics de consommation anormaux (>3 écarts-types)

In [3]:
# Baseline statistics of the readings for each (building, energy type) pair.
cles_groupe = ["batiment_id", "type_energie"]
std_conso = df.groupby(cles_groupe).agg(
    consommation_std=("consommation", "std"),
    consommation_mean=("consommation", "mean"),
)

# Attach the group mean / standard deviation to every reading.
df_anomalie = df.join(std_conso, on=cles_groupe, how="left")

# A reading is a peak when it exceeds its group mean by more than
# three standard deviations.
seuil_pic = df_anomalie["consommation_mean"] + 3 * df_anomalie["consommation_std"]
df_pics_conso = df_anomalie.loc[df_anomalie["consommation"] > seuil_pic]

print(f"  Nombre de pics de consommation : {len(df_pics_conso)}")

  Nombre de pics de consommation : 5258


- Détecter les périodes de sous-consommation suspectes (bâtiment fermé non déclaré)

In [4]:
# Number of standard deviations below the group mean under which a reading
# is flagged as suspicious under-consumption. Kept in one place so that the
# filter and the printed message cannot drift apart.
# NOTE(review): 1.49 looks hand-tuned - confirm the intended threshold.
SEUIL_SOUS_CONSO_ECARTS_TYPES = 1.49

# Lower bound per reading, built from the per-(building, energy type)
# statistics already joined onto df_anomalie.
seuil_sous_conso = (
    df_anomalie["consommation_mean"]
    - SEUIL_SOUS_CONSO_ECARTS_TYPES * df_anomalie["consommation_std"]
)
df_sous_conso = df_anomalie.loc[df_anomalie["consommation"] < seuil_sous_conso]

print(f"  Nombre de sous consommation (inférieure à la moyenne moins {SEUIL_SOUS_CONSO_ECARTS_TYPES} écart type) : {len(df_sous_conso)}")

  Nombre de sous consommation (inférieure à la moyenne moins 1.49 écart type) : 37


- Repérer les bâtiments dont la consommation ne correspond pas à leur DPE

In [5]:
def trouver_classe_energetique(ipe: float) -> str:
    """Map an energy performance indicator (IPE) to a class letter.

    Args:
        ipe: Indicator value to classify.

    Returns:
        The letter of the first band whose exclusive upper bound is greater
        than ``ipe``: "A" (< 70), "B" (< 110), "C" (< 180), "D" (< 250),
        "E" (< 330), "F" (< 420), otherwise "G". A NaN input compares false
        against every bound and therefore also yields "G".
    """
    # (exclusive upper bound, class letter), in ascending order.
    bornes_superieures = (
        (70, "A"),
        (110, "B"),
        (180, "C"),
        (250, "D"),
        (330, "E"),
        (420, "F"),
    )
    for borne, lettre in bornes_superieures:
        if ipe < borne:
            return lettre
    return "G"

# Keep gas and electricity readings only, then take one IPE value per
# (building, energy type).
ipe_par_energie = (
    df.loc[df["type_energie"].isin(["gaz", "electricite"])]
    .groupby(["batiment_id", "nom", "type", "commune", "classe_energetique", "type_energie"])
    .agg(IPE=("IPE", "first"))
    .reset_index()
)

# Add up the per-energy IPE values to get a single figure per building.
classe_energie = (
    ipe_par_energie
    .groupby(["batiment_id", "nom", "type", "commune", "classe_energetique"])
    .agg(IPE=("IPE", "sum"))
    .reset_index()
)

# Class letter implied by the summed IPE.
classe_energie["classe_energetique_reelle"] = classe_energie["IPE"].apply(trouver_classe_energetique)

# Buildings whose declared class differs from the computed one.
classe_differente = classe_energie["classe_energetique_reelle"].ne(classe_energie["classe_energetique"])
dpe_errone = classe_energie.loc[classe_differente]
dpe_errone.head(10)

Unnamed: 0,batiment_id,nom,type,commune,classe_energetique,IPE,classe_energetique_reelle
0,BAT0001,Ecole Paris 1,ecole,Paris,E,1567.996101,G
1,BAT0002,Ecole Paris 2,ecole,Paris,C,1085.332647,G
2,BAT0003,Ecole Paris 3,ecole,Paris,D,1205.882124,G
3,BAT0004,Mediatheque Paris 4,mediatheque,Paris,C,1191.847365,G
5,BAT0006,Mairie Paris 6,mairie,Paris,E,1180.977562,G
6,BAT0007,Gymnase Paris 7,gymnase,Paris,C,1763.649884,G
8,BAT0009,Ecole Lyon 9,ecole,Lyon,B,847.573068,G
11,BAT0012,Mairie Lyon 12,mairie,Lyon,F,1548.070731,G
12,BAT0013,Gymnase Lyon 13,gymnase,Lyon,D,1953.540345,G
14,BAT0015,Mairie Lyon 15,mairie,Lyon,F,1541.633999,G


Le calcul de l'IPE étant faux, le DPE réel n'a pas de sens ici.

- Lister les bâtiments nécessitant un audit énergétique

In [6]:
def _resumer_par_batiment(df_anomalies: pd.DataFrame) -> pd.DataFrame:
    """Collapse anomaly rows to one row per building.

    Args:
        df_anomalies: Frame holding the building descriptors
            (``batiment_id``, ``nom``, ``type``, ``commune``,
            ``classe_energetique``) and ``consommation_annuelle``.

    Returns:
        One row per distinct building descriptor combination, carrying the
        first ``consommation_annuelle`` value met for that building.
    """
    # NOTE(review): "first" keeps whichever row comes first for the building;
    # if consommation_annuelle differs per energy type the retained value
    # depends on row order - confirm this is intended.
    return (
        df_anomalies
        .groupby(["batiment_id", "nom", "type", "commune", "classe_energetique"])
        .agg(consommation_annuelle=("consommation_annuelle", "first"))
        .reset_index()
    )


# The variable names are reused on purpose: the export cell writes
# df_pics_conso / df_sous_conso at building level.
df_pics_conso = _resumer_par_batiment(df_pics_conso)

print(f"  Nombre de bâtiment avec pics de consommation anormale : {len(df_pics_conso)}")
print(df_pics_conso.head(5))
print()

df_sous_conso = _resumer_par_batiment(df_sous_conso)

print(f"  Nombre de bâtiment sous consommation anormale : {len(df_sous_conso)}")
print(df_sous_conso.head(5))
print()

print(f"  Nombre de bâtiment avec DPE erroné : {len(dpe_errone)}")
print(dpe_errone.head(5))
print()

  Nombre de bâtiment avec pics de consommation anormale : 58
  batiment_id             nom    type commune classe_energetique  \
0     BAT0001   Ecole Paris 1   ecole   Paris                  E   
1     BAT0002   Ecole Paris 2   ecole   Paris                  C   
2     BAT0003   Ecole Paris 3   ecole   Paris                  D   
3     BAT0006  Mairie Paris 6  mairie   Paris                  E   
4     BAT0009    Ecole Lyon 9   ecole    Lyon                  B   

   consommation_annuelle  
0             1209940.44  
1              498641.66  
2              818575.26  
3              552840.06  
4              797137.96  

  Nombre de bâtiment sous consommation anormale : 1
  batiment_id                 nom     type commune classe_energetique  \
0     BAT0146  Piscine Toulon 146  piscine  Toulon                  F   

   consommation_annuelle  
0             1544051.93  

  Nombre de bâtiment avec DPE erroné : 106
  batiment_id                  nom         type commune classe_energet

In [7]:
# Make sure the output folder exists before writing anything.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Per-category exports, written exactly as before (DataFrame index included)
# so that existing readers of these files are not affected.
df_pics_conso.to_csv(OUTPUT_DIR / "pics_consommation.csv")
df_sous_conso.to_csv(OUTPUT_DIR / "sous_consommation.csv")
dpe_errone.to_csv(OUTPUT_DIR / "dpe_errone.csv")

# Consolidated deliverable announced by the notebook
# (output/anomalies_detectees.csv): the three lists stacked, each row
# tagged with the kind of anomaly it comes from. Columns that exist in
# only some of the frames are left empty for the others.
anomalies_detectees = pd.concat(
    [
        df_pics_conso.assign(type_anomalie="pic_consommation"),
        df_sous_conso.assign(type_anomalie="sous_consommation"),
        dpe_errone.assign(type_anomalie="dpe_errone"),
    ],
    ignore_index=True,
)
# The row index carries no information here, so it is not written.
anomalies_detectees.to_csv(OUTPUT_DIR / "anomalies_detectees.csv", index=False)

**Livrables** :
- Notebook `08_detection_anomalies.ipynb`
- Liste des anomalies `output/anomalies_detectees.csv`
- Rapport de recommandations d'audit