# Projet de MLOps
## Prédiction de la consommation annuelle d'électricité

Auteurs: Lilou Masson, Paul Hamann Cossart

In [None]:
import re
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the raw consumption export; the file is a semicolon-separated CSV.
df = pd.read_csv('DATA/consommation-quotidienne-brute.csv', sep=';')

In [3]:
# Preview the first rows to check how the columns were parsed.
df.head()

Unnamed: 0,Date - Heure,Date,Heure,Consommation brute gaz (MW PCS 0°C) - NaTran,Statut - NaTran,Consommation brute gaz (MW PCS 0°C) - Teréga,Statut - Teréga,Consommation brute gaz totale (MW PCS 0°C),Consommation brute électricité (MW) - RTE,Statut - RTE,Consommation brute totale (MW),flag_ignore
0,2025-12-31T22:00:00+00:00,31/12/2025,23:00,62510.0,Définitif,4352.0,Définitif,66862.0,,,,non
1,2025-12-31T21:00:00+00:00,31/12/2025,22:00,66697.0,Définitif,4782.0,Définitif,71479.0,,,,non
2,2025-12-31T20:00:00+00:00,31/12/2025,21:00,73103.0,Définitif,5113.0,Définitif,78216.0,,,,non
3,2025-12-31T19:00:00+00:00,31/12/2025,20:00,77996.0,Définitif,5271.0,Définitif,83267.0,,,,non
4,2025-12-31T18:00:00+00:00,31/12/2025,19:00,80381.0,Définitif,5495.0,Définitif,85876.0,,,,non


Consommation brute gaz totale : données à l'heure
Consommation brute électrique totale : données à la demi-heure

La consommation brute totale n'a pas de sens (elle additionne un volume horaire et un volume à la demi-heure) :  
on garde l'électricité et le gaz séparément.  

Pas de données d'électricité pour décembre 2025  

Sinon, pas d'autres valeurs manquantes

In [None]:
def normalize_columns(columns):
    """
    Builds short, ASCII-only snake_case names from raw column labels.

    Each label is lowercased, stripped of any parenthesised part and of
    accents, split into alphanumeric words, and every word is cut down to
    its first three characters before the pieces are joined with "_".

    Parameters:
    - columns (iterable): The column labels (strings) to normalize.

    Returns:
    - list: The normalized names, in the same order as the input.
    """
    def _shorten(label):
        # Drop "(...)" groups (e.g. units) before anything else.
        text = re.sub(r"\(.*?\)", "", label.lower())
        # NFKD splits accented letters into base letter + combining mark;
        # the ASCII round-trip then discards the marks.
        text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
        # Every run of non-alphanumeric characters becomes a word boundary.
        tokens = re.sub(r"[^a-z0-9]+", " ", text).split()
        return "_".join(token[:3] for token in tokens)

    return [_shorten(label) for label in columns]


def columns_selection(df, cols_to_keep=(1, 2, 7, 8)):
    """
    Selects specific columns from the DataFrame based on their index positions.

    Parameters:
    - df (pd.DataFrame): The input DataFrame from which to select columns.
    - cols_to_keep (iterable of int): Zero-based positions of the columns to
      keep. Defaults to (1, 2, 7, 8), the selection previously hard-coded
      here (presumably the date, hour, total gas and RTE electricity columns
      of the raw export; verify against the file layout).

    Returns:
    - pd.DataFrame: A DataFrame containing only the selected columns.

    Raises:
    - IndexError: If a requested position does not exist in `df`, which
      typically means the frame does not have the raw layout (for example it
      was already reduced by a previous call).
    """
    positions = list(cols_to_keep)
    n_cols = df.shape[1]

    # Validate up front so the error names the offending positions instead of
    # only reporting a generic out-of-bounds failure.
    out_of_range = [pos for pos in positions if not -n_cols <= pos < n_cols]
    if out_of_range:
        raise IndexError(
            f"positional indexers are out-of-bounds: positions {out_of_range} "
            f"requested but the DataFrame only has {n_cols} columns"
        )

    return df.iloc[:, positions]


def data_cleaning(df):
    """
    Cleans the raw consumption DataFrame without modifying the caller's object.

    Steps: keep the columns chosen by `columns_selection`, rename them with
    `normalize_columns`, build a `timestamp` index from the 'dat' and 'heu'
    columns, convert those two columns to date / time objects, sort by
    timestamp and drop the rows without an electricity value.

    Parameters:
    - df (pd.DataFrame): The raw DataFrame to be cleaned. It is left untouched.

    Returns:
    - pd.DataFrame: A new DataFrame indexed by `timestamp` (ascending), with
      normalized column names, 'dat' as dates, 'heu' as times, and only the
      rows where 'con_bru_ele_rte' is not missing.
    """
    # Select (by position) and copy first, then rename the copy: assigning
    # `df.columns` on the argument would rename the caller's DataFrame too.
    cleaned = columns_selection(df).copy()
    cleaned.columns = normalize_columns(cleaned.columns)

    # Parse date + hour once and derive the date and time parts from it.
    # NOTE(review): 'dat'/'heu' look like local wall-clock values (the raw
    # 'Date - Heure' column shown in the notebook carries a +00:00 offset one
    # hour earlier), so timestamps may repeat or be missing around DST
    # changes. Confirm whether a timezone-aware index is needed.
    timestamp = pd.to_datetime(
        cleaned['dat'] + ' ' + cleaned['heu'], format='%d/%m/%Y %H:%M'
    )
    cleaned['dat'] = timestamp.dt.date
    cleaned['heu'] = timestamp.dt.time
    cleaned['timestamp'] = timestamp
    cleaned = cleaned.set_index('timestamp').sort_index()

    # Keep only the timestamps that have an electricity reading.
    cleaned = cleaned.loc[cleaned['con_bru_ele_rte'].notna()]

    return cleaned

def create_dfs(df):
    """
    Splits the cleaned DataFrame into one gas frame and one electricity frame.

    Parameters:
    - df (pd.DataFrame): A DataFrame containing the 'con_bru_gaz_tot' and
      'con_bru_ele_rte' columns. It is not modified.

    Returns:
    - tuple: `(df_gaz, df_elec)` where `df_gaz` has the electricity column
      removed and only rows with a gas value, and `df_elec` has the gas
      column removed and only rows with an electricity value.
    """
    # Gas: drop the electricity column, keep rows where gas is present.
    df_gaz = df.drop(columns=['con_bru_ele_rte'])
    df_gaz = df_gaz.loc[df_gaz['con_bru_gaz_tot'].notna()]

    # Electricity: drop the gas column, keep rows where electricity is present.
    df_elec = df.drop(columns=['con_bru_gaz_tot'])
    df_elec = df_elec.loc[df_elec['con_bru_ele_rte'].notna()]

    return df_gaz, df_elec

# Run the cleaning pipeline and display the result.
# NOTE(review): rebinding `df` to the cleaned frame means this cell cannot be
# run twice in the same kernel: the cleaned frame no longer has the column
# positions 7 and 8 that columns_selection expects, which is presumably what
# produced the IndexError recorded under this cell. Reload the CSV before
# re-running.
df = data_cleaning(df)
df



IndexError: positional indexers are out-of-bounds

In [13]:
# Discard the rows that have no RTE electricity value, then display the frame.
df = df.dropna(subset=['con_bru_ele_rte'])
df

Unnamed: 0_level_0,dat,heu,con_bru_gaz_tot,con_bru_ele_rte,con_bru_tot
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-01 00:00:00,2012-01-01,00:00:00,55353.0,59610.0,114963.0
2012-01-01 00:30:00,2012-01-01,00:30:00,55398.5,58314.0,113712.5
2012-01-01 01:00:00,2012-01-01,01:00:00,55444.0,56230.0,111674.0
2012-01-01 01:30:00,2012-01-01,01:30:00,55454.5,56075.0,111529.5
2012-01-01 02:00:00,2012-01-01,02:00:00,55465.0,55531.0,110996.0
...,...,...,...,...,...
2025-11-30 21:30:00,2025-11-30,21:30:00,63371.5,59074.0,122445.5
2025-11-30 22:00:00,2025-11-30,22:00:00,61165.0,59505.0,120670.0
2025-11-30 22:30:00,2025-11-30,22:30:00,58035.5,60654.0,118689.5
2025-11-30 23:00:00,2025-11-30,23:00:00,54906.0,59992.0,114898.0


In [46]:
# Rename the columns of `df` in place to their short normalized form and show
# the result; data_cleaning applies the same renaming as part of its pipeline.
df.columns = normalize_columns(df.columns)
df

Unnamed: 0,dat_heu,dat,heu,con_bru_gaz_nat,sta_nat,con_bru_gaz_ter,sta_ter,con_bru_gaz_tot,con_bru_ele_rte,sta_rte,con_bru_tot,fla_ign
0,2025-12-31T22:00:00+00:00,31/12/2025,23:00,62510.0,Définitif,4352.0,Définitif,66862.0,,,,non
1,2025-12-31T21:00:00+00:00,31/12/2025,22:00,66697.0,Définitif,4782.0,Définitif,71479.0,,,,non
2,2025-12-31T20:00:00+00:00,31/12/2025,21:00,73103.0,Définitif,5113.0,Définitif,78216.0,,,,non
3,2025-12-31T19:00:00+00:00,31/12/2025,20:00,77996.0,Définitif,5271.0,Définitif,83267.0,,,,non
4,2025-12-31T18:00:00+00:00,31/12/2025,19:00,80381.0,Définitif,5495.0,Définitif,85876.0,,,,non
...,...,...,...,...,...,...,...,...,...,...,...,...
244723,2012-01-01T01:00:00+00:00,01/01/2012,02:00,52251.0,Définitif,3214.0,Définitif,55465.0,55531.0,Définitif,110996.0,non
244724,2012-01-01T00:30:00+00:00,01/01/2012,01:30,,,,,,56075.0,Définitif,,non
244725,2012-01-01T00:00:00+00:00,01/01/2012,01:00,52236.0,Définitif,3208.0,Définitif,55444.0,56230.0,Définitif,111674.0,non
244726,2011-12-31T23:30:00+00:00,01/01/2012,00:30,,,,,,58314.0,Définitif,,non
