In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

OUTPUT_DIR = Path("../output").resolve()
TABLEAUX_SYNTHESE_PATH = OUTPUT_DIR / "tableaux_synthese"
CONSOMMATIONS_ENRICHIES_PARQUET_PATH = OUTPUT_DIR / "consommations_enrichies"
CONSOMMATIONS_ENRICHIES_CSV_PATH = OUTPUT_DIR / "consommations_enrichies.csv"

In [2]:
df = pd.read_parquet(CONSOMMATIONS_ENRICHIES_PARQUET_PATH.as_posix())
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = pd.to_datetime(df['date'])
df['date_debut'] = pd.to_datetime(df['date'])
df['date_fin'] = pd.to_datetime(df['date'])
df['type_energie'] = df['type_energie'].astype("str")

df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7492584 entries, 0 to 7492583
Data columns (total 37 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   batiment_id                    object        
 1   timestamp                      datetime64[ns]
 2   consommation                   float64       
 3   unite                          object        
 4   hour                           int32         
 5   year                           int32         
 6   month                          int32         
 7   date                           datetime64[ns]
 8   nom                            object        
 9   type                           object        
 10  commune                        object        
 11  surface_m2                     int32         
 12  annee_construction             int32         
 13  classe_energetique             object        
 14  nb_occupants_moyen             int32         
 15  intensite_energ

Unnamed: 0,batiment_id,timestamp,consommation,unite,hour,year,month,date,nom,type,...,consommation_par_occupant,consommation_par_m2,consommation_journaliere,cout_journalier,cout_mensuel,cout_annuel,consommation_annuelle,IPE,consommation_moyenne_par_type,ecart_conso_moyenne_type
0,BAT0056,2024-01-13 08:00:00,5.23,m3,8,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.038741,0.00335,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.68662
1,BAT0001,2024-01-13 04:00:00,0.22,m3,4,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.000978,0.000114,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.028883
2,BAT0056,2024-01-13 17:00:00,3.42,m3,17,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.025333,0.002191,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.448994
3,BAT0001,2024-01-13 09:00:00,2.3,m3,9,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.010222,0.001194,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.301955
4,BAT0056,2024-01-13 19:00:00,3.68,m3,19,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.027259,0.002357,2.573913,9.652174,27548.8875,315022.9875,84006.13,53.815586,7.617028,0.483128
5,BAT0001,2024-01-13 10:00:00,3.81,m3,10,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.016933,0.001978,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.500195
6,BAT0057,2024-01-13 07:00:00,0.82,m3,7,2024,1,2024-01-13,Mairie Nantes 57,mairie,...,0.023429,0.000728,0.961304,3.604891,9985.05,115972.275,30925.94,27.465311,2.727374,0.300656
7,BAT0001,2024-01-13 13:00:00,2.68,m3,13,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.011911,0.001391,1.98913,7.459239,21864.15,252612.6,67363.36,34.975784,7.617028,0.351843
8,BAT0057,2024-01-13 11:00:00,1.36,m3,11,2024,1,2024-01-13,Mairie Nantes 57,mairie,...,0.038857,0.001208,0.961304,3.604891,9985.05,115972.275,30925.94,27.465311,2.727374,0.498648
9,BAT0002,2024-01-13 09:00:00,1.49,m3,9,2024,1,2024-01-13,Ecole Paris 2,ecole,...,0.003706,0.001289,0.859091,3.221591,9279.5625,104863.2375,27963.53,24.189905,7.617028,0.195614


### Partie 3 : Analyse exploratoire (2-3h)

**Competence evaluee : C2.3 - Analyser des donnees structurees pour repondre a un besoin metier**

#### Etape 3.1 : Statistiques descriptives
- Calculer les statistiques par type d'energie, type de batiment et commune


In [3]:
stats_type_energie = df.groupby("type_energie")["consommation"].agg([
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('min', 'min'),
    ('25%', lambda x: x.quantile(0.25)),
    ('median', 'median'),
    ('75%', lambda x: x.quantile(0.75)),
    ('max', 'max')
]).round(2)
stats_type_energie.reset_index(inplace=True)

print(f"  consommation par type_energie")
print(stats_type_energie.head(10))
print()

stats_type = df.groupby(["type", "type_energie"])["consommation"].agg([
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('min', 'min'),
    ('25%', lambda x: x.quantile(0.25)),
    ('median', 'median'),
    ('75%', lambda x: x.quantile(0.75)),
    ('max', 'max')
]).round(2)
stats_type.reset_index(inplace=True)

print(f"  consommation par type et type_energie")
print(stats_type.head(10))
print()

stats_commune = df.groupby(["commune", "type_energie"])["consommation"].agg([
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('min', 'min'),
    ('25%', lambda x: x.quantile(0.25)),
    ('median', 'median'),
    ('75%', lambda x: x.quantile(0.75)),
    ('max', 'max')
]).round(2)
stats_commune.reset_index(inplace=True)

print(f"  consommation par commune et type_energie")
print(stats_type_energie.head(10))
print()

  consommation par type_energie
  type_energie    count    mean     std   min    25%  median     75%      max
0          eau  2497509   42.77   90.31  0.02   1.75    7.52   22.56   661.28
1  electricite  2497512  270.94  425.25  0.23  29.81  108.58  302.70  4231.70
2          gaz  2497563  404.81  637.95  0.35  44.31  160.60  449.13  6348.19

  consommation par type et type_energie
          type type_energie   count    mean     std   min    25%  median  \
0        ecole          eau  461917    7.62    7.46  0.07   1.40    4.96   
1        ecole  electricite  461763  136.71  146.87  0.80  23.13   82.06   
2        ecole          gaz  461865  205.22  220.65  1.21  34.74  123.59   
3      gymnase          eau  461895   18.10   14.43  0.58   3.46   16.14   
4      gymnase  electricite  461841  253.78  226.79  5.03  51.26  206.79   
5      gymnase          gaz  461925  380.92  340.68  7.55  76.99  309.87   
6       mairie          eau  530239    2.73    2.88  0.02   0.47    1.56   
7      

- Identifier les batiments les plus/moins energivores

In [4]:
batiment_conso_annuelle = (
    df
    .groupby(["batiment_id", "nom", "type", "commune", "surface_m2", "annee_construction", "classe_energetique", "nb_occupants_moyen"])
    .agg(consommation_annuelle=("consommation_annuelle", "first"))
    .reset_index()
)

top5_plus_energivore = batiment_conso_annuelle.nlargest(5, "consommation_annuelle")
top5_moins_energivore = batiment_conso_annuelle.nsmallest(5, "consommation_annuelle")

print(f"  Top 5 des bâtiment les plus énergivores")
print(top5_plus_energivore)

print(f"  Top 5 des bâtiment les moins énergivores")
print(top5_moins_energivore)

  Top 5 des bâtiment les plus énergivores
    batiment_id                        nom     type        commune  \
4       BAT0005            Piscine Paris 5  piscine          Paris   
47      BAT0048           Piscine Lille 48  piscine          Lille   
135     BAT0136         Piscine Toulon 136  piscine         Toulon   
111     BAT0112          Piscine Reims 112  piscine          Reims   
132     BAT0133  Piscine Saint-Etienne 133  piscine  Saint-Etienne   

     surface_m2  annee_construction classe_energetique  nb_occupants_moyen  \
4          3913                1950                  G                 242   
47         3754                1956                  G                 150   
135        3926                1991                  F                 237   
111        3095                1952                  G                 399   
132        3507                2005                  F                 154   

     consommation_annuelle  
4               2724519.45  
47        

- Calculer la repartition des consommations par classe energetique DPE

In [5]:
batiment_classe_energetique_conso_annuelle = (
    df
    .groupby(["batiment_id", "classe_energetique"])
    .agg(consommation_annuelle=("consommation_annuelle", "first"))
)

repartition_classe_energetique = (
    batiment_classe_energetique_conso_annuelle
    .groupby("classe_energetique")
    .agg(
        count_batiemtn=("consommation_annuelle", "count"),
        mean_consommation_annuelle=("consommation_annuelle", "mean"),
        max_consommation_annuelle=("consommation_annuelle", "max"),
        min_consommation_annuelle=("consommation_annuelle", "min"),
        median_consommation_annuelle=("consommation_annuelle", "median"),
    )
)

repartition_classe_energetique.head(10)

Unnamed: 0_level_0,count_batiemtn,mean_consommation_annuelle,max_consommation_annuelle,min_consommation_annuelle,median_consommation_annuelle
classe_energetique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,2,18539.165,26896.82,10181.51,18539.165
B,5,290177.29,687152.46,28949.19,44759.28
C,15,163178.774667,1119317.39,6346.91,27333.07
D,23,343633.92087,1368508.13,7653.55,74406.12
E,24,270458.333333,1773989.33,9533.25,46460.25
F,37,497536.975135,2329137.83,13829.89,93271.69
G,40,419154.352,2724519.45,17796.44,87574.45


- Analyser l'evolution temporelle (tendances mensuelles, saisonnalite)

In [6]:
conso_mensuelle = (
    df
    .groupby(["batiment_id", "year", "month"])
    .agg(consommation_mensuelle=("consommation", "sum"))
    .reset_index()
)

conso_mensuelle.sort_values(["batiment_id", "year", "month"])
delta_conso_mensuelle = conso_mensuelle.groupby(["batiment_id", "year", "month"])['consommation_mensuelle'].agg([('delta_consommation_mensuelle', lambda x: x - x.shift(1, fill_value=0))])

conso_mensuelle = conso_mensuelle.join(delta_conso_mensuelle, ["batiment_id", "year", "month"], "left")

conso_mensuelle.head(5)


Unnamed: 0,batiment_id,year,month,consommation_mensuelle,delta_consommation_mensuelle
0,BAT0001,2023,1,366139.86,366139.86
1,BAT0001,2023,2,333364.91,333364.91
2,BAT0001,2023,3,237490.36,237490.36
3,BAT0001,2023,4,215595.87,215595.87
4,BAT0001,2023,5,239355.44,239355.44


In [7]:
conso_saison = (
    df
    .groupby(["batiment_id", "season"])
    .agg(consommation_saison=("consommation", "sum"))
    .reset_index()
)

conso_saison.sort_values(["batiment_id", "season"])
delta_conso_saison = conso_saison.groupby(["batiment_id", "season"])['consommation_saison'].agg([('delta_consommation_saison', lambda x: x - x.shift(1, fill_value=0))])

conso_saison = conso_saison.join(delta_conso_saison, ["batiment_id", "season"], "left")

conso_saison.head(5)

Unnamed: 0,batiment_id,season,consommation_saison,delta_consommation_saison
0,BAT0001,Automne,1555447.3,1555447.3
1,BAT0001,Ete,932514.8,932514.8
2,BAT0001,Hiver,2063543.71,2063543.71
3,BAT0001,Printemps,1333304.54,1333304.54
4,BAT0002,Automne,646306.18,646306.18


Les résultats obtenus ne sont pas concluants. Je ne parviens pas à utiliser la fonction `shift`.

- Comparer la consommation theorique (selon DPE) vs reelle

Classes | Consommation en kWh/m2.an
:- | :-
A | inférieure à 70 kWh/m2.an
B | entre 71 et 110 kWh/m2.an
C | entre 111 et 180 kWh/m2.an
D | entre 181 et 250 kWh/m2.an
E | entre 251 et 330 kWh/m2.an
F | entre 331 et 420 kWh/m2.an
G | supérieure à 421 kWh/m2.an

In [8]:
def trouver_classe_energetique(ipe: float) -> str:
    if ipe < 70:
        return "A"
    elif ipe < 110:
        return "B"
    elif ipe < 180:
        return "C"
    elif ipe < 250:
        return "D"
    elif ipe < 330:
        return "E"
    elif ipe < 420:
        return "F"
    return "G"

classe_energie = df.loc[df["type_energie"].isin(["gaz", "electricite"])].copy()
classe_energie = (
    classe_energie
    .groupby(["batiment_id", "classe_energetique", "type_energie", "unite"])
    .agg(IPE=("IPE", "first"))
    .reset_index()
)
classe_energie = (
    classe_energie
    .groupby(["batiment_id", "classe_energetique"])
    .agg(IPE=("IPE", "sum"))
    .reset_index()
)
classe_energie["classe_energetique_reelle"] = classe_energie["IPE"].apply(lambda x: trouver_classe_energetique(x))

classe_energie.head(20)


Unnamed: 0,batiment_id,classe_energetique,IPE,classe_energetique_reelle
0,BAT0001,E,1567.996101,G
1,BAT0002,C,1085.332647,G
2,BAT0003,D,1205.882124,G
3,BAT0004,C,1191.847365,G
4,BAT0005,G,7816.001935,G
5,BAT0006,E,1180.977562,G
6,BAT0007,C,1763.649884,G
7,BAT0008,G,3906.424526,G
8,BAT0009,B,847.573068,G
9,BAT0010,G,2664.632129,G


Les résultats obtenus ne sont pas concluants. L'IPE doit être mal calculé.

- Tableaus de synthèse exportés en CSV

In [9]:
TABLEAUX_SYNTHESE_PATH.mkdir(parents=True, exist_ok=True)

stats_type_energie.to_csv((TABLEAUX_SYNTHESE_PATH / "stats_type_energie.csv").as_posix(), index=False)
stats_type.to_csv((TABLEAUX_SYNTHESE_PATH / "stats_type.csv").as_posix(), index=False)
stats_commune.to_csv((TABLEAUX_SYNTHESE_PATH / "stats_commune.csv").as_posix(), index=False)
top5_plus_energivore.to_csv((TABLEAUX_SYNTHESE_PATH / "top5_plus_energivore.csv").as_posix(), index=False)
top5_moins_energivore.to_csv((TABLEAUX_SYNTHESE_PATH / "top5_moins_energivore.csv").as_posix(), index=False)
repartition_classe_energetique.to_csv((TABLEAUX_SYNTHESE_PATH / "repartition_classe_energetique.csv").as_posix(), index=False)
conso_mensuelle.to_csv((TABLEAUX_SYNTHESE_PATH / "tendances_mensuelles.csv").as_posix(), index=False)
conso_saison.to_csv((TABLEAUX_SYNTHESE_PATH / "tendances_saisonnalite.csv").as_posix(), index=False)
classe_energie.to_csv((TABLEAUX_SYNTHESE_PATH / "classe_energetique_relle.csv").as_posix(), index=False)

**Livrables** :
- Notebook `06_statistiques_descriptives.ipynb`
- Tableaux de synthese exportes en CSV