# Quick check

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline

In [2]:
# Load the dataset; the "Date" column is parsed into datetimes while reading.
df = pd.read_csv(
    "final_df.csv",
    parse_dates=["Date"],
    low_memory=False,
)

In [3]:
# Overall span of the 'Date' column (earliest and latest value)

first_date = df['Date'].min()
last_date = df['Date'].max()

print(f"Total covered period : from {first_date} to {last_date}")

Total covered period : from 1990-10-27 00:00:00 to 2030-12-25 00:00:00


In [4]:
# Share of missing values in each column, expressed in percent of all rows

n = len(df)
missing_counts = df.isna().sum()          # NaN count per column
missing_pct = missing_counts.div(n).mul(100)

print(f"\nPercentage of missing values by column : {missing_pct}")


Percentage of missing values by column : Périmètre                                0.736114
Nature                                   0.736114
Date                                     0.000000
Heures                                   0.736114
Consommation                            42.048294
Prévision J-1                            0.756284
Prévision J                              0.776879
Fioul                                   42.048294
Charbon                                 42.048294
Gaz                                     42.048294
Nucléaire                               42.048294
Eolien                                  42.048294
Solaire                                 42.048294
Hydraulique                             42.048294
Pompage                                 42.048294
Bioénergies                             42.048294
Ech. physiques                          42.048294
Taux de Co2                             42.048294
Ech. comm. Angleterre                   47.393877
Ech. com

In [5]:
# Consumption period: earliest and latest 'Date' among rows that carry a
# 'Consommation' value.
#
# The 'Date' values of those rows are selected once with a boolean mask,
# instead of calling dropna() on the whole frame separately for the minimum
# and for the maximum (each call copies every column just to read one).

cons_dates = df.loc[df['Consommation'].notna(), 'Date']
min_date, max_date = cons_dates.min(), cons_dates.max()

print(f"Covered consumption period : from {min_date} to {max_date}")

Covered consumption period : from 2012-01-01 00:00:00 to 2025-04-29 00:00:00


In [6]:
# Percentage of missing values by column, restricted to the consumption period
# (divided into quarter hours)

# Bounds of the consumption period: first/last date with a 'Consommation' value.
has_cons = df['Consommation'].notna()
min_date = df.loc[has_cons, 'Date'].min()
max_date = df.loc[has_cons, 'Date'].max()

# Keep every row whose date lies inside [min_date, max_date], bounds included.
mask = df['Date'].ge(min_date) & df['Date'].le(max_date)
df_cons = df.loc[mask]

n = len(df_cons)
missing_pct = df_cons.isna().sum().div(n).mul(100)

print(f"\nPercentage of missing values by column during the consumption period : {missing_pct}")


Percentage of missing values by column during the consumption period : Périmètre                                0.000000
Nature                                   0.000000
Date                                     0.000000
Heures                                   0.000000
Consommation                            41.594555
Prévision J-1                            0.000000
Prévision J                              0.000000
Fioul                                   41.594555
Charbon                                 41.594555
Gaz                                     41.594555
Nucléaire                               41.594555
Eolien                                  41.594555
Solaire                                 41.594555
Hydraulique                             41.594555
Pompage                                 41.594555
Bioénergies                             41.594555
Ech. physiques                          41.594555
Taux de Co2                             41.594555
Ech. comm. Angleterre       

In [7]:
# Percentage of quarter-hour slots in each day with no consumption data

# Bounds of the consumption period: first/last date with a 'Consommation' value.
has_cons = df['Consommation'].notna()
min_date = df.loc[has_cons, 'Date'].min()
max_date = df.loc[has_cons, 'Date'].max()

mask = (df['Date'] >= min_date) & (df['Date'] <= max_date)
df_cons = df.loc[mask].copy()          # copy so adding 'Jour' does not touch df
df_cons['Jour'] = df_cons['Date'].dt.date

# Number of quarter-hour slots in a day (24 h * 4).
# NOTE(review): assumes every day is represented by 96 slots — confirm for
# daylight-saving change days and for the first/last day of the file.
QUARTERS_PER_DAY = 96

# Non-missing 'Consommation' readings per calendar day.
daily_count = df_cons.groupby('Jour')['Consommation'].count()

# The column is labelled "no consumption data", so report the share of slots
# WITHOUT a reading (slots minus readings), not the share of slots that have one.
daily_pct = (QUARTERS_PER_DAY - daily_count) / QUARTERS_PER_DAY * 100

summary = pd.DataFrame({
    'Percentage of quarters with no consumption datas': daily_pct
})

print(summary.head())

            Percentage of quarters with no consumption datas
Jour                                                        
2012-01-01                                              50.0
2012-01-02                                              50.0
2012-01-03                                              50.0
2012-01-04                                              50.0
2012-01-05                                              50.0


In [8]:
# Share of calendar days, within the consumption period, that have no
# consumption value at all

# Bounds of the consumption period: first/last date with a 'Consommation' value.
has_cons = df['Consommation'].notna()
min_date = df.loc[has_cons, 'Date'].min()
max_date = df.loc[has_cons, 'Date'].max()

mask = (df['Date'] >= min_date) & (df['Date'] <= max_date)
# New frame with a 'Jour' column holding the calendar day of each row.
df_cons = df.loc[mask].assign(Jour=lambda frame: frame['Date'].dt.date)

# Non-missing 'Consommation' readings per day.
valid_counts = df_cons.groupby('Jour')['Consommation'].count()

total_days = len(valid_counts)
days_no_data = valid_counts.eq(0).sum()   # days whose readings are all missing
pct_no_data = days_no_data / total_days * 100

print(f"Percentage of days with no consumption data : {pct_no_data}%")

Percentage of days with no consumption data : 0.0%


In [9]:
# Duplicates: count rows that repeat an earlier row

duplicate_mask = df.duplicated()
dups = duplicate_mask.sum()

print(f"\nNombre de lignes dupliquées : {dups}")


Nombre de lignes dupliquées : 0


In [10]:
# Consumption outliers: 5th and 95th percentiles of the non-missing values

cons = df['Consommation'].dropna()
percentiles = cons.quantile([0.05, 0.95])

# Despite their names, q1 / q3 hold the 5th and 95th percentiles, not quartiles.
q1 = percentiles.iloc[0]
q3 = percentiles.iloc[1]

print(f"5e percentile : {q1}, 95e percentile : {q3}")

5e percentile : 36003.0, 95e percentile : 74137.0
