In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

OUTPUT_DIR = Path("../output").resolve()
TABLEAUX_SYNTHESE_PATH = OUTPUT_DIR / "tableaux_synthese"
CONSOMMATIONS_ENRICHIES_PARQUET_PATH = OUTPUT_DIR / "consommations_enrichies"
CONSOMMATIONS_ENRICHIES_CSV_PATH = OUTPUT_DIR / "consommations_enrichies.csv"

In [2]:
df = pd.read_parquet(CONSOMMATIONS_ENRICHIES_PARQUET_PATH.as_posix())
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = pd.to_datetime(df['date'])
df['date_debut'] = pd.to_datetime(df['date'])
df['date_fin'] = pd.to_datetime(df['date'])
df['type_energie'] = df['type_energie'].astype("str")

df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7492584 entries, 0 to 7492583
Data columns (total 37 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   batiment_id                    object        
 1   timestamp                      datetime64[ns]
 2   consommation                   float64       
 3   unite                          object        
 4   hour                           int32         
 5   year                           int32         
 6   month                          int32         
 7   date                           datetime64[ns]
 8   nom                            object        
 9   type                           object        
 10  commune                        object        
 11  surface_m2                     int32         
 12  annee_construction             int32         
 13  classe_energetique             object        
 14  nb_occupants_moyen             int32         
 15  intensite_energ

Unnamed: 0,batiment_id,timestamp,consommation,unite,hour,year,month,date,nom,type,...,consommation_par_occupant,consommation_par_m2,tarif,cout_journialier,cout_mensuel,cout_annuel,consommation_annuelle,IPE,consommation_moyenne_par_type,ecart_conso_moyenne_type
0,BAT0056,2024-01-13 08:00:00,5.23,m3,8,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.038741,0.00335,19.6125,815.2463,97999.046,898812.3568,3852032.01,2467.669449,116.508725,0.044889
1,BAT0001,2024-01-13 04:00:00,0.22,m3,4,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.000978,0.000114,0.825,598.2608,77480.4948,720625.5751,3087323.85,1602.971885,116.508725,0.001888
2,BAT0056,2024-01-13 17:00:00,3.42,m3,17,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.025333,0.002191,12.825,815.2463,97999.046,898812.3568,3852032.01,2467.669449,116.508725,0.029354
3,BAT0001,2024-01-13 09:00:00,2.3,m3,9,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.010222,0.001194,8.625,598.2608,77480.4948,720625.5751,3087323.85,1602.971885,116.508725,0.019741
4,BAT0056,2024-01-13 19:00:00,3.68,m3,19,2024,1,2024-01-13,Ecole Nantes 56,ecole,...,0.027259,0.002357,13.8,815.2463,97999.046,898812.3568,3852032.01,2467.669449,116.508725,0.031586
5,BAT0001,2024-01-13 10:00:00,3.81,m3,10,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.016933,0.001978,14.2875,598.2608,77480.4948,720625.5751,3087323.85,1602.971885,116.508725,0.032701
6,BAT0057,2024-01-13 07:00:00,0.82,m3,7,2024,1,2024-01-13,Mairie Nantes 57,mairie,...,0.023429,0.000728,3.075,341.0968,42563.0074,384953.8534,1768493.76,1570.598366,51.915855,0.015795
7,BAT0001,2024-01-13 13:00:00,2.68,m3,13,2024,1,2024-01-13,Ecole Paris 1,ecole,...,0.011911,0.001391,10.05,598.2608,77480.4948,720625.5751,3087323.85,1602.971885,116.508725,0.023003
8,BAT0057,2024-01-13 11:00:00,1.36,m3,11,2024,1,2024-01-13,Mairie Nantes 57,mairie,...,0.038857,0.001208,5.1,341.0968,42563.0074,384953.8534,1768493.76,1570.598366,51.915855,0.026196
9,BAT0002,2024-01-13 09:00:00,1.49,m3,9,2024,1,2024-01-13,Ecole Paris 2,ecole,...,0.003706,0.001289,5.5875,259.8602,32665.7153,299197.8351,1282608.07,1109.522552,116.508725,0.012789


### Partie 3 : Analyse exploratoire (2-3h)

**Competence evaluee : C2.3 - Analyser des donnees structurees pour repondre a un besoin metier**

#### Etape 3.1 : Statistiques descriptives
- Calculer les statistiques par type d'energie, type de batiment et commune


In [3]:
stats_type_energie = df.groupby("type_energie")["consommation"].agg([
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('min', 'min'),
    ('25%', lambda x: x.quantile(0.25)),
    ('median', 'median'),
    ('75%', lambda x: x.quantile(0.75)),
    ('max', 'max')
]).round(2)

print(f"  consommation par type_energie")
print(stats_type_energie.head(10))
print()

stats_type = df.groupby("type")["consommation"].agg([
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('min', 'min'),
    ('25%', lambda x: x.quantile(0.25)),
    ('median', 'median'),
    ('75%', lambda x: x.quantile(0.75)),
    ('max', 'max')
]).round(2)

print(f"  consommation par type")
print(stats_type.head(10))
print()

stats_commune = df.groupby("commune")["consommation"].agg([
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('min', 'min'),
    ('25%', lambda x: x.quantile(0.25)),
    ('median', 'median'),
    ('75%', lambda x: x.quantile(0.75)),
    ('max', 'max')
]).round(2)

print(f"  consommation par commune")
print(stats_type_energie.head(10))
print()

  consommation par type_energie
                count    mean     std   min    25%  median     75%      max
type_energie                                                               
eau           2497509   42.77   90.31  0.02   1.75    7.52   22.56   661.28
electricite   2497512  270.94  425.25  0.23  29.81  108.58  302.70  4231.70
gaz           2497563  404.81  637.95  0.35  44.31  160.60  449.13  6348.19

  consommation par type
               count    mean     std   min     25%  median      75%      max
type                                                                        
ecole        1385545  116.51  173.63  0.07    7.88   28.85   168.44  1469.64
gymnase      1385661  217.60  280.18  0.58   20.77   63.42   344.07  1976.89
mairie       1590724   51.92   82.08  0.02    2.62   12.00    68.14   704.57
mediatheque  1539549   91.79  125.72  0.15    6.70   27.69   139.23  1078.35
piscine      1591105  696.18  803.50  5.27  120.75  336.62  1059.09  6348.19

  consommation par comm

- Identifier les batiments les plus/moins energivores

In [10]:
batiment_conso_annuelle = (
    df
    .groupby(["batiment_id","nom","type","commune","surface_m2","annee_construction","classe_energetique","nb_occupants_moyen"])
    .agg(consommation_annuelle=("consommation_annuelle", "first"))
    .reset_index()
)

top5_plus_energivore = batiment_conso_annuelle.nlargest(5, "consommation_annuelle")
top5_moins_energivore = batiment_conso_annuelle.nsmallest(5, "consommation_annuelle")

print(f"  Top 5 des bâtiment les plus énergivores")
print(top5_plus_energivore)

print(f"  Top 5 des bâtiment les moins énergivores")
print(top5_moins_energivore)

  Top 5 des bâtiment les plus énergivores
    batiment_id                        nom     type        commune  \
4       BAT0005            Piscine Paris 5  piscine          Paris   
47      BAT0048           Piscine Lille 48  piscine          Lille   
135     BAT0136         Piscine Toulon 136  piscine         Toulon   
111     BAT0112          Piscine Reims 112  piscine          Reims   
132     BAT0133  Piscine Saint-Etienne 133  piscine  Saint-Etienne   

     surface_m2  annee_construction classe_energetique  nb_occupants_moyen  \
4          3913                1950                  G                 242   
47         3754                1956                  G                 150   
135        3926                1991                  F                 237   
111        3095                1952                  G                 399   
132        3507                2005                  F                 154   

     consommation_annuelle  
4              33308535.02  
47        

- Calculer la repartition des consommations par classe energetique DPE

In [5]:
batiment_classe_energetique_conso_annuelle = (
    df
    .groupby(["batiment_id", "classe_energetique"])
    .agg(consommation_annuelle=("consommation_annuelle", "first"))
)

repartition_classe_energetique = (
    batiment_classe_energetique_conso_annuelle
    .groupby("classe_energetique")
    .agg(
        count_batiemtn=("consommation_annuelle", "count"),
        sum_consommation_annuelle=("consommation_annuelle", "sum"),
        max_consommation_annuelle=("consommation_annuelle", "max"),
        min_consommation_annuelle=("consommation_annuelle", "min"),
        median_consommation_annuelle=("consommation_annuelle", "median"),
    )
)

repartition_classe_energetique.head(10)

Unnamed: 0_level_0,count_batiemtn,sum_consommation_annuelle,max_consommation_annuelle,min_consommation_annuelle,median_consommation_annuelle
classe_energetique,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,2,1895209.0,1314153.8,581055.19,947604.495
B,5,21823890.0,8414678.25,1404082.05,2050305.58
C,15,41300800.0,13660178.09,365255.72,1262928.4
D,23,121629800.0,16760654.7,437765.7,2678475.4
E,24,109556900.0,21748271.11,546584.11,2259290.44
F,37,298509900.0,28435077.85,797153.68,4289944.14
G,40,304347500.0,33308535.02,1012750.47,4016117.555


- Analyser l'evolution temporelle (tendances mensuelles, saisonnalite)

In [6]:
conso_mensuelle = (
    df
    .groupby(["batiment_id", "year", "month"])
    .agg(consommation_mensuelle=("consommation", "sum"))
    .reset_index()
)

conso_mensuelle.sort_values(["batiment_id", "year", "month"])
delta_conso_mensuelle = conso_mensuelle.groupby(["batiment_id", "year", "month"])['consommation_mensuelle'].agg([('delta_consommation_mensuelle', lambda x: x - x.shift(1, fill_value=0))])

conso_mensuelle = conso_mensuelle.join(delta_conso_mensuelle, ["batiment_id", "year", "month"], "left")

conso_mensuelle.head(5)


Unnamed: 0,batiment_id,year,month,consommation_mensuelle,delta_consommation_mensuelle
0,BAT0001,2023,1,366139.86,366139.86
1,BAT0001,2023,2,333364.91,333364.91
2,BAT0001,2023,3,237490.36,237490.36
3,BAT0001,2023,4,215595.87,215595.87
4,BAT0001,2023,5,239355.44,239355.44


In [7]:
conso_saison = (
    df
    .groupby(["batiment_id", "season"])
    .agg(consommation_saison=("consommation", "sum"))
    .reset_index()
)

conso_saison.sort_values(["batiment_id", "season"])
delta_conso_saison = conso_saison.groupby(["batiment_id", "season"])['consommation_saison'].agg([('delta_consommation_saison', lambda x: x - x.shift(1, fill_value=0))])

conso_saison = conso_saison.join(delta_conso_saison, ["batiment_id", "season"], "left")

conso_saison.head(5)

Unnamed: 0,batiment_id,season,consommation_saison,delta_consommation_saison
0,BAT0001,Automne,1555447.3,1555447.3
1,BAT0001,Ete,932514.8,932514.8
2,BAT0001,Hiver,2063543.71,2063543.71
3,BAT0001,Printemps,1333304.54,1333304.54
4,BAT0002,Automne,646306.18,646306.18


Les résultats obtenus ne sont pas concluants. Je ne parviens pas à utiliser la fonction `shift`.

- Comparer la consommation theorique (selon DPE) vs reelle

Classes | Consommation en kWh/m2.an
:- | :-
A | inférieure à 70 kWh/m2.an
B | entre 71 et 110 kWh/m2.an
C | entre 111 et 180 kWh/m2.an
D | entre 181 et 250 kWh/m2.an
E | entre 251 et 330 kWh/m2.an
F | entre 331 et 420 kWh/m2.an
G | supérieure à 421 kWh/m2.an

In [8]:
def trouver_classe_energetique(ipe: float) -> str:
    if ipe < 70:
        return "A"
    elif ipe < 110:
        return "B"
    elif ipe < 180:
        return "C"
    elif ipe < 250:
        return "D"
    elif ipe < 330:
        return "E"
    elif ipe < 420:
        return "F"
    return "G"

df["classe_energetique_reelle"] = df["IPE"].apply(lambda x: trouver_classe_energetique(x))

classe_energie_reelle = (
    df
    .groupby("batiment_id")
    .agg(
        classe_energetique=("classe_energetique", "first"),
        classe_energetique_reelle=("classe_energetique_reelle", "first"),
    )
    .reset_index()
)

classe_energie_reelle.head(5)


Unnamed: 0,batiment_id,classe_energetique,classe_energetique_reelle
0,BAT0001,E,G
1,BAT0002,C,G
2,BAT0003,D,G
3,BAT0004,C,G
4,BAT0005,G,G


Les résultats obtenus ne sont pas concluants. Il aurait fallu séparer les types énergétiques.

- Tableaus de synthèse exportés en CSV

In [11]:
TABLEAUX_SYNTHESE_PATH.mkdir(parents=True, exist_ok=True)

stats_type_energie.to_csv((TABLEAUX_SYNTHESE_PATH / "stats_type_energie.csv").as_posix(), index=False)
stats_type.to_csv((TABLEAUX_SYNTHESE_PATH / "stats_type.csv").as_posix(), index=False)
stats_commune.to_csv((TABLEAUX_SYNTHESE_PATH / "stats_commune.csv").as_posix(), index=False)
top5_plus_energivore.to_csv((TABLEAUX_SYNTHESE_PATH / "top5_plus_energivore.csv").as_posix(), index=False)
top5_moins_energivore.to_csv((TABLEAUX_SYNTHESE_PATH / "top5_moins_energivore.csv").as_posix(), index=False)
repartition_classe_energetique.to_csv((TABLEAUX_SYNTHESE_PATH / "repartition_classe_energetique.csv").as_posix(), index=False)
conso_mensuelle.to_csv((TABLEAUX_SYNTHESE_PATH / "tendances_mensuelles.csv").as_posix(), index=False)
conso_saison.to_csv((TABLEAUX_SYNTHESE_PATH / "tendances_saisonnalite.csv").as_posix(), index=False)
classe_energie_reelle.to_csv((TABLEAUX_SYNTHESE_PATH / "classe_energetique_relle.csv").as_posix(), index=False)

**Livrables** :
- Notebook `06_statistiques_descriptives.ipynb`
- Tableaux de synthese exportes en CSV