# Etape 4 - Analyse exploratoire et statistiques

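**Objectif** : Comprendre les patterns de pollution a travers des statistiques descriptives, les depassements de seuils reglementaires, les correlations avec la meteo et la saisonnalite.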

In [15]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Both pandas display options are set to None (no cap on displayed columns,
# no fixed display width).
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Folder (relative to the notebook) used for the input CSV and the result CSVs.
OUTPUT_DIR = "../output"

## 4.1 Chargement des donnees nettoyees

In [16]:
# Load the merged pollution/weather dataset; 'datetime_hour' is parsed as datetimes.
clean_csv_path = f"{OUTPUT_DIR}/pollution_meteo_clean.csv"
df = pd.read_csv(clean_csv_path, parse_dates=['datetime_hour'])

n_rows, n_cols = df.shape
print(f"Dataset charge: {n_rows:,} lignes x {n_cols} colonnes")
df.head()

Dataset charge: 697,948 lignes x 21 colonnes


Unnamed: 0,station_id,pollutant,unit,hour,year,month,value_mean,value_min,value_max,measurement_count,station_name,city,station_type,date,datetime_hour,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition,season
0,ST0001,CO,mg/m3,1,2024,1,0.38,0.38,0.38,1,Paris-urbaine-1,Paris,urbaine,2024-01-01,2024-01-01 01:00:00,-1.0,67.1,48.9,9.1,pluvieux,Hiver
1,ST0002,SO2,ug/m3,5,2024,1,3.75,3.75,3.75,1,Paris-periurbaine-2,Paris,periurbaine,2024-01-01,2024-01-01 05:00:00,,,,,,
2,ST0003,SO2,ug/m3,22,2024,1,9.44,9.44,9.44,1,Paris-industrielle-3,Paris,industrielle,2024-01-01,2024-01-01 22:00:00,3.4,61.7,46.2,0.0,orageux,Hiver
3,ST0004,PM10,ug/m3,7,2024,1,58.05,58.05,58.05,1,Lyon-urbaine-1,Lyon,urbaine,2024-01-01,2024-01-01 07:00:00,0.8,43.0,18.7,0.0,pluvieux,Hiver
4,ST0004,SO2,ug/m3,10,2024,1,5.5,5.5,5.5,1,Lyon-urbaine-1,Lyon,urbaine,2024-01-01,2024-01-01 10:00:00,2.3,58.8,21.7,3.0,pluvieux,Hiver


In [17]:
# Add the calendar columns the analysis needs, but only when the cleaned CSV
# does not already provide them (existing columns are never overwritten).
calendar_builders = {
    'date': lambda: df['datetime_hour'].dt.date,
    'hour': lambda: df['datetime_hour'].dt.hour,
    'day_of_week': lambda: df['datetime_hour'].dt.dayofweek,
    'month': lambda: df['datetime_hour'].dt.month,
}
for column_name, build_column in calendar_builders.items():
    if column_name not in df.columns:
        df[column_name] = build_column()

# The season is derived from the month number (December to February = 'Hiver', ...).
if 'season' not in df.columns:
    df['season'] = df['month'].map({
        12: 'Hiver', 1: 'Hiver', 2: 'Hiver',
        3: 'Printemps', 4: 'Printemps', 5: 'Printemps',
        6: 'Ete', 7: 'Ete', 8: 'Ete',
        9: 'Automne', 10: 'Automne', 11: 'Automne'
    })

# Day names, indexed by pandas' dayofweek convention (0 = Monday).
day_names = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
# Map through a dict rather than indexing the list in a lambda: a missing
# weekday (NaN) then yields NaN instead of raising a TypeError, and float
# weekday codes (0.0, 1.0, ...) still resolve to the right name.
df['day_name'] = df['day_of_week'].map(dict(enumerate(day_names)))

## 4.2 Statistiques descriptives par polluant et par ville

In [18]:
# Pick the measurement column: 'value_mean' when present, otherwise 'value'.
if 'value_mean' in df.columns:
    value_col = 'value_mean'
else:
    value_col = 'value'


def _quantile_of(q):
    """Return an aggregator that computes the q-th quantile of a group."""
    return lambda values: values.quantile(q)


# Descriptive statistics per pollutant, as (output column, aggregator) pairs.
pollutant_summary_spec = [
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('min', 'min'),
    ('25%', _quantile_of(0.25)),
    ('median', 'median'),
    ('75%', _quantile_of(0.75)),
    ('max', 'max'),
]
values_by_pollutant = df.groupby('pollutant')[value_col]
stats_pollutant = values_by_pollutant.agg(pollutant_summary_spec).round(2)

print("Statistiques par polluant:")
stats_pollutant

Statistiques par polluant:


Unnamed: 0_level_0,count,mean,std,min,25%,median,75%,max
pollutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CO,116323,0.83,0.42,0.11,0.51,0.76,1.07,2.73
NO2,116270,49.76,24.85,6.33,30.75,45.56,64.14,163.69
O3,116308,82.88,41.54,10.54,51.22,75.74,106.89,272.85
PM10,116337,41.38,20.66,5.25,25.61,37.83,53.32,136.49
PM2.5,116229,24.92,12.47,3.15,15.41,22.78,32.15,81.87
SO2,116481,8.28,4.14,1.05,5.12,7.55,10.71,27.3


In [19]:
# Descriptive statistics per city, all pollutants pooled together.
# NOTE(review): pooling mixes pollutants measured on different scales
# (the sample rows show CO in mg/m3 and the others in ug/m3) -- confirm the
# pooled mean is meant only as a rough ranking.
city_summary_spec = [
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('max', 'max'),
]
city_summary = df.groupby('city')[value_col].agg(city_summary_spec).round(2)
# Highest pooled mean first.
stats_city = city_summary.sort_values(by='mean', ascending=False)

print("\nStatistiques par ville (tous polluants):")
stats_city


Statistiques par ville (tous polluants):


Unnamed: 0_level_0,count,mean,std,max
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Strasbourg,59394,37.97,37.74,271.42
Nantes,59590,37.96,37.9,271.69
Lyon,44565,34.84,34.27,236.54
Bordeaux,44537,34.8,34.02,236.1
Paris,44341,34.73,34.0,236.12
Toulouse,74261,34.21,35.74,272.24
Lille,74216,34.18,35.79,272.85
Grenoble,88938,33.68,34.76,272.63
Marseille,104027,33.53,34.36,272.74
Rouen,104079,33.4,34.3,272.54


In [20]:
# Cross table: one row per city, one column per pollutant, mean concentration in each cell.
city_pollutant_means = pd.pivot_table(
    df,
    values=value_col,
    index='city',
    columns='pollutant',
    aggfunc='mean',
)
pivot_city_pollutant = city_pollutant_means.round(2)

print("\nConcentrations moyennes par ville et polluant:")
pivot_city_pollutant


Concentrations moyennes par ville et polluant:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bordeaux,0.83,49.91,82.64,41.5,25.07,8.32
Grenoble,0.81,48.33,80.73,40.32,24.13,8.03
Lille,0.82,49.05,81.93,40.82,24.59,8.17
Lyon,0.83,50.03,83.38,41.46,24.99,8.35
Marseille,0.8,47.88,80.29,39.99,24.07,7.98
Nantes,0.91,54.43,90.57,45.17,27.35,9.08
Paris,0.83,50.21,82.65,41.47,25.09,8.37
Rouen,0.8,48.08,79.9,39.99,23.97,7.96
Strasbourg,0.91,54.76,90.81,45.22,27.27,9.1
Toulouse,0.82,48.87,81.66,40.84,24.62,8.15


## 4.3 Identification des depassements de seuils reglementaires

In [21]:
# Information thresholds per pollutant.
SEUILS_INFO = {
    'PM2.5': 25,
    'PM10': 50,
    'NO2': 200,
    'O3': 180,
    'SO2': 300,
    'CO': 10  # mg/m3
}

# Alert thresholds per pollutant.
SEUILS_ALERTE = {
    'PM2.5': 50,
    'PM10': 80,
    'NO2': 400,
    'O3': 240,
    'SO2': 500,
    'CO': 20
}

# Sentinel threshold for pollutants absent from the tables above: high enough
# that such rows are never flagged as exceedances.
SEUIL_PAR_DEFAUT = 9999

# Attach each row's thresholds with a vectorized dictionary lookup on the
# 'pollutant' column. This replaces a row-wise df.apply(axis=1), which builds
# a Series per row and is very slow on a dataset of this size.
df['seuil_info'] = df['pollutant'].map(SEUILS_INFO).fillna(SEUIL_PAR_DEFAUT)
df['seuil_alerte'] = df['pollutant'].map(SEUILS_ALERTE).fillna(SEUIL_PAR_DEFAUT)

# Boolean flags: the measured value is strictly above the threshold.
df['depassement_info'] = df[value_col] > df['seuil_info']
df['depassement_alerte'] = df[value_col] > df['seuil_alerte']

In [22]:
# Per pollutant: number of information/alert exceedances and number of measurements.
depassements = df.groupby('pollutant').agg(
    depassement_info=('depassement_info', 'sum'),
    depassement_alerte=('depassement_alerte', 'sum'),
    total_mesures=(value_col, 'count'),
)

# Share of measurements above each threshold, as a percentage rounded to 2 decimals.
measurement_totals = depassements['total_mesures']
depassements['pct_info'] = depassements['depassement_info'].div(measurement_totals).mul(100).round(2)
depassements['pct_alerte'] = depassements['depassement_alerte'].div(measurement_totals).mul(100).round(2)

print("Depassements de seuils par polluant:")
depassements

Depassements de seuils par polluant:


Unnamed: 0_level_0,depassement_info,depassement_alerte,total_mesures,pct_info,pct_alerte
pollutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CO,0,0,116323,0.0,0.0
NO2,0,0,116270,0.0,0.0
O3,2970,296,116308,2.55,0.25
PM10,33219,6430,116337,28.55,5.53
PM2.5,50233,5275,116229,43.22,4.54
SO2,0,0,116481,0.0,0.0


In [23]:
# For every (date, city, pollutant) with at least one alert-level exceedance,
# keep the highest value recorded, then rank the cases from worst to mildest.
alert_rows = df.loc[df['depassement_alerte']]
worst_per_case = alert_rows.groupby(['date', 'city', 'pollutant'])[value_col].max()
jours_alerte = worst_per_case.reset_index().sort_values(by=value_col, ascending=False)

alert_case_count = len(jours_alerte)
print(f"\nNombre de cas de depassement du seuil d'alerte: {alert_case_count}")
print("\nTop 20 des depassements les plus graves:")
jours_alerte.head(20)


Nombre de cas de depassement du seuil d'alerte: 2313

Top 20 des depassements les plus graves:


Unnamed: 0,date,city,pollutant,value_mean
25,2024-01-02,Lille,O3,272.85
834,2024-02-05,Marseille,O3,272.74
1212,2024-02-21,Lille,O3,272.73
597,2024-01-26,Grenoble,O3,272.63
465,2024-01-20,Rouen,O3,272.54
588,2024-01-25,Rouen,O3,272.5
753,2024-02-01,Toulouse,O3,272.24
1045,2024-02-14,Lille,O3,272.2
324,2024-01-14,Rouen,O3,271.77
175,2024-01-08,Lille,O3,271.76


## 4.4 Matrice de correlation polluants / meteo

In [24]:
# Reshape to one row per (datetime_hour, city) with one column per pollutant,
# averaging the values of the same pollutant within a city and hour.
df_pivot = df.pivot_table(
    values=value_col,
    index=['datetime_hour', 'city'],
    columns='pollutant',
    aggfunc='mean'
).reset_index()

# Weather columns to join.
meteo_cols = ['temperature_c', 'humidity_pct', 'wind_speed_kmh', 'precipitation_mm']
# Collapse the weather to exactly one row per (datetime_hour, city), taking the
# first non-null value of each column. drop_duplicates() did not guarantee key
# uniqueness: a key with both a filled and an all-NaN weather row (or with
# differing readings) kept several rows and silently duplicated pollutant rows
# in the merge below.
# NOTE(review): if weather readings can legitimately differ within a city and
# hour, a mean may be preferable to first() -- confirm.
df_meteo_unique = (
    df.groupby(['datetime_hour', 'city'], as_index=False)[meteo_cols].first()
)

# validate= makes the merge fail loudly if either side has duplicate keys.
df_corr = df_pivot.merge(
    df_meteo_unique,
    on=['datetime_hour', 'city'],
    how='left',
    validate='one_to_one',
)

print(f"Dataset pour correlation: {len(df_corr):,} lignes")
df_corr.head()

Dataset pour correlation: 34,560 lignes


Unnamed: 0,datetime_hour,city,CO,NO2,O3,PM10,PM2.5,SO2,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm
0,2024-01-01,Bordeaux,0.843333,91.82,86.19,66.105,21.47,6.81,11.1,64.7,1.0,4.1
1,2024-01-01,Grenoble,0.86,64.433333,109.796,44.6975,34.385,8.9325,10.7,77.4,37.4,0.0
2,2024-01-01,Lille,0.903333,42.99,86.35,37.703333,33.015,8.7625,4.9,52.7,14.1,0.0
3,2024-01-01,Lyon,0.776667,25.75,131.135,46.98,27.225,7.64,-1.3,51.9,5.1,0.0
4,2024-01-01,Marseille,1.118333,42.22,82.798571,32.17,25.65,9.648333,,,,


In [25]:
# Candidate pollutant columns; only those actually present in df_corr are kept.
pollutants = ['PM2.5', 'PM10', 'NO2', 'O3', 'SO2', 'CO']
corr_frame_columns = set(df_corr.columns)
available_pollutants = [name for name in pollutants if name in corr_frame_columns]
available_meteo = [name for name in meteo_cols if name in corr_frame_columns]

# Pollutants first, then weather variables.
corr_cols = [*available_pollutants, *available_meteo]

# Pairwise correlation matrix (DataFrame.corr defaults), rounded to 3 decimals.
corr_input = df_corr[corr_cols]
correlation_matrix = corr_input.corr().round(3)

print("Matrice de correlation:")
correlation_matrix

Matrice de correlation:


Unnamed: 0,PM2.5,PM10,NO2,O3,SO2,CO,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm
PM2.5,1.0,0.522,0.528,0.52,0.521,0.522,-0.362,0.001,-0.005,-0.006
PM10,0.522,1.0,0.523,0.525,0.517,0.519,-0.363,0.007,-0.002,0.001
NO2,0.528,0.523,1.0,0.524,0.526,0.528,-0.366,0.003,-0.001,-0.005
O3,0.52,0.525,0.524,1.0,0.522,0.528,-0.36,0.004,0.0,0.002
SO2,0.521,0.517,0.526,0.522,1.0,0.519,-0.36,0.007,-0.003,-0.001
CO,0.522,0.519,0.528,0.528,0.519,1.0,-0.359,0.007,-0.002,0.004
temperature_c,-0.362,-0.363,-0.366,-0.36,-0.36,-0.359,1.0,-0.007,-0.003,0.002
humidity_pct,0.001,0.007,0.003,0.004,0.007,0.007,-0.007,1.0,0.001,0.007
wind_speed_kmh,-0.005,-0.002,-0.001,0.0,-0.003,-0.002,-0.003,0.001,1.0,-0.006
precipitation_mm,-0.006,0.001,-0.005,0.002,-0.001,0.004,0.002,0.007,-0.006,1.0


In [26]:
def _correlation_strength(coefficient):
    """Label a coefficient by absolute size: > 0.5 'forte', > 0.3 'moderee', else 'faible'."""
    magnitude = abs(coefficient)
    if magnitude > 0.5:
        return "forte"
    if magnitude > 0.3:
        return "moderee"
    return "faible"


# Print each pollutant's correlation with every weather variable.
print("\nCorrelations polluants vs meteo:")
for pollutant in available_pollutants:
    print(f"\n{pollutant}:")
    pollutant_row = correlation_matrix.loc[pollutant]
    for meteo in available_meteo:
        corr = pollutant_row[meteo]
        strength = _correlation_strength(corr)
        print(f"  vs {meteo}: {corr:+.3f} ({strength})")


Correlations polluants vs meteo:

PM2.5:
  vs temperature_c: -0.362 (moderee)
  vs humidity_pct: +0.001 (faible)
  vs wind_speed_kmh: -0.005 (faible)
  vs precipitation_mm: -0.006 (faible)

PM10:
  vs temperature_c: -0.363 (moderee)
  vs humidity_pct: +0.007 (faible)
  vs wind_speed_kmh: -0.002 (faible)
  vs precipitation_mm: +0.001 (faible)

NO2:
  vs temperature_c: -0.366 (moderee)
  vs humidity_pct: +0.003 (faible)
  vs wind_speed_kmh: -0.001 (faible)
  vs precipitation_mm: -0.005 (faible)

O3:
  vs temperature_c: -0.360 (moderee)
  vs humidity_pct: +0.004 (faible)
  vs wind_speed_kmh: +0.000 (faible)
  vs precipitation_mm: +0.002 (faible)

SO2:
  vs temperature_c: -0.360 (moderee)
  vs humidity_pct: +0.007 (faible)
  vs wind_speed_kmh: -0.003 (faible)
  vs precipitation_mm: -0.001 (faible)

CO:
  vs temperature_c: -0.359 (moderee)
  vs humidity_pct: +0.007 (faible)
  vs wind_speed_kmh: -0.002 (faible)
  vs precipitation_mm: +0.004 (faible)


## 4.5 Analyse de la saisonnalite

In [27]:
# Mean concentration per month (rows) and pollutant (columns).
pollution_par_mois = df.groupby(['month', 'pollutant'])[value_col].mean().unstack()

# Label each row from its actual month number. Assigning the first N labels by
# position would mislabel every row as soon as the data does not start in
# January or skips a month (e.g. months 3-5 would be shown as Jan-Mar).
month_labels = ['Jan', 'Fev', 'Mar', 'Avr', 'Mai', 'Juin', 'Juil', 'Aout', 'Sep', 'Oct', 'Nov', 'Dec']
pollution_par_mois.index = [month_labels[int(month) - 1] for month in pollution_par_mois.index]

print("Concentration moyenne par mois:")
pollution_par_mois.round(2)

Concentration moyenne par mois:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
Jan,0.99,59.57,99.4,49.64,29.94,9.95
Fev,1.0,59.92,99.59,49.57,29.96,9.9
Mar,0.71,42.64,70.95,35.46,21.28,7.12
Avr,0.71,42.71,71.04,35.54,21.36,7.08
Mai,0.71,42.5,70.84,35.41,21.34,7.12


In [28]:
# Mean concentration per day of week (rows) and pollutant (columns).
pollution_par_jour = df.groupby(['day_of_week', 'pollutant'])[value_col].mean().unstack()

# Label each row from its actual weekday code (0 = first entry of day_names).
# Assigning day_names wholesale raises a length-mismatch error when a weekday
# is absent from the data; labelling by code stays correct in that case.
pollution_par_jour.index = [day_names[int(day_code)] for day_code in pollution_par_jour.index]

print("\nConcentration moyenne par jour de la semaine:")
pollution_par_jour.round(2)


Concentration moyenne par jour de la semaine:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
Lundi,0.83,50.0,83.46,41.58,25.04,8.36
Mardi,0.83,49.93,83.59,41.76,25.15,8.29
Mercredi,0.83,49.77,82.7,41.53,24.94,8.3
Jeudi,0.83,50.11,83.38,41.59,24.93,8.31
Vendredi,0.83,49.73,81.97,41.34,24.83,8.22
Samedi,0.83,49.0,82.07,41.12,24.82,8.3
Dimanche,0.83,49.69,82.9,40.71,24.68,8.18


In [29]:
# Mean concentration per hour of day (rows) and pollutant (columns).
hourly_means = df.groupby(['hour', 'pollutant'])[value_col].mean()
pollution_par_heure = hourly_means.unstack(level='pollutant')

print("\nConcentration moyenne par heure:")
pollution_par_heure.round(2)


Concentration moyenne par heure:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.79,47.61,79.83,40.08,24.18,8.0
1,0.8,47.99,79.57,39.97,24.14,7.88
2,0.56,33.74,55.86,27.87,16.74,5.6
3,0.56,33.72,55.76,27.97,16.88,5.63
4,0.56,33.38,55.3,28.1,16.68,5.64
5,0.79,47.77,80.0,40.3,24.02,7.91
6,0.8,48.05,80.12,40.12,23.97,7.95
7,1.05,62.46,104.53,51.6,31.3,10.32
8,1.04,61.55,103.78,51.63,31.23,10.34
9,1.04,62.74,103.97,51.96,31.34,10.35


In [30]:
# Mean concentration per season (rows) and pollutant (columns).
seasonal_means = df.groupby(['season', 'pollutant'])[value_col].mean()
pollution_par_saison = seasonal_means.unstack()

# Put the seasons in calendar order, keeping only those present in the data.
season_order = ['Hiver', 'Printemps', 'Ete', 'Automne']
seasons_present = set(pollution_par_saison.index)
ordered_seasons = [season for season in season_order if season in seasons_present]
pollution_par_saison = pollution_par_saison.reindex(ordered_seasons)

print("\nConcentration moyenne par saison:")
pollution_par_saison.round(2)


Concentration moyenne par saison:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Hiver,1.0,59.75,99.44,49.6,29.95,9.93
Printemps,0.71,42.66,70.96,35.5,21.34,7.11


## 4.6 Top 10 des journees les plus polluees

In [31]:
# Daily mean concentration per (date, pollutant, city).
daily_means = df.groupby(['date', 'pollutant', 'city'])[value_col].mean()
df_daily = daily_means.reset_index()

# For each pollutant, print the 10 (date, city) pairs with the highest daily mean.
for pollutant in available_pollutants:
    is_current_pollutant = df_daily['pollutant'] == pollutant
    top10 = df_daily.loc[is_current_pollutant].nlargest(10, value_col)
    print(f"\nTop 10 journees les plus polluees - {pollutant}:")
    print(top10.to_string(index=False))


Top 10 journees les plus polluees - PM2.5:
      date pollutant       city  value_mean
2024-01-24     PM2.5     Nantes   35.928824
2024-02-27     PM2.5 Strasbourg   35.874412
2024-02-06     PM2.5     Nantes   35.782817
2024-02-23     PM2.5 Strasbourg   35.730870
2024-01-13     PM2.5 Strasbourg   35.595652
2024-01-15     PM2.5     Nantes   35.589403
2024-01-28     PM2.5 Strasbourg   35.524545
2024-02-02     PM2.5     Nantes   35.334375
2024-02-13     PM2.5 Strasbourg   35.296133
2024-01-22     PM2.5     Nantes   35.223939

Top 10 journees les plus polluees - PM10:
      date pollutant       city  value_mean
2024-01-01      PM10     Nantes   59.971408
2024-01-06      PM10     Nantes   59.912537
2024-01-03      PM10     Nantes   59.876462
2024-01-23      PM10 Strasbourg   58.970000
2024-02-29      PM10 Strasbourg   58.208281
2024-01-29      PM10 Strasbourg   58.137500
2024-02-16      PM10 Strasbourg   58.098551
2024-01-25      PM10 Strasbourg   58.026053
2024-02-22      PM10     Nantes  

In [32]:
# Global pollution index: express each measurement as a fraction of its
# pollutant's information threshold, so pollutants become comparable.
df['ratio_seuil'] = df[value_col].div(df['seuil_info'])

# Daily index per city = mean of those ratios over all of that day's measurements.
indice_journalier = (
    df.groupby(['date', 'city'])['ratio_seuil']
    .mean()
    .rename('indice_pollution')
    .reset_index()
)

# The 10 (date, city) pairs with the highest index, rounded to 3 decimals for display.
top10_global = (
    indice_journalier
    .nlargest(10, 'indice_pollution')
    .assign(indice_pollution=lambda top: top['indice_pollution'].round(3))
)

print("\nTop 10 journees les plus polluees (indice global):")
top10_global


Top 10 journees les plus polluees (indice global):


Unnamed: 0,date,city,indice_pollution
425,2024-02-12,Nantes,0.629
578,2024-02-27,Strasbourg,0.627
228,2024-01-23,Strasbourg,0.618
418,2024-02-11,Strasbourg,0.615
438,2024-02-13,Strasbourg,0.613
198,2024-01-20,Strasbourg,0.613
98,2024-01-10,Strasbourg,0.608
495,2024-02-19,Nantes,0.608
408,2024-02-10,Strasbourg,0.607
135,2024-01-14,Nantes,0.606


## 4.7 Sauvegarde des resultats

In [33]:
# Write the threshold-exceedance table (index = pollutant is kept).
depassements_path = f"{OUTPUT_DIR}/depassements_seuils.csv"
depassements.to_csv(depassements_path)
print(f"Tableau des depassements sauvegarde: {depassements_path}")

# Write the correlation matrix (index = variable names is kept).
correlation_path = f"{OUTPUT_DIR}/matrice_correlation.csv"
correlation_matrix.to_csv(correlation_path)
print(f"Matrice de correlation sauvegardee: {correlation_path}")

# Write the global top 10 without its positional index.
top10_path = f"{OUTPUT_DIR}/top10_journees_polluees.csv"
top10_global.to_csv(top10_path, index=False)
print(f"Top 10 journees sauvegarde: {top10_path}")

Tableau des depassements sauvegarde: ../output/depassements_seuils.csv
Matrice de correlation sauvegardee: ../output/matrice_correlation.csv
Top 10 journees sauvegarde: ../output/top10_journees_polluees.csv


In [34]:
# Summary of the exploratory analysis: period covered, dataset size and
# total number of measurements flagged above each threshold.
first_date = df['date'].min()
last_date = df['date'].max()
city_count = df['city'].nunique()
station_count = df['station_id'].nunique()
info_exceedances = df['depassement_info'].sum()
alert_exceedances = df['depassement_alerte'].sum()

print("RESUME DE L'ANALYSE EXPLORATOIRE")
print(f"Periode analysee: {first_date} a {last_date}")
print(f"Nombre de mesures: {len(df):,}")
print(f"Nombre de villes: {city_count}")
print(f"Nombre de stations: {station_count}")
print(f"\nDepassements du seuil d'information: {info_exceedances:,}")
print(f"Depassements du seuil d'alerte: {alert_exceedances:,}")

RESUME DE L'ANALYSE EXPLORATOIRE
Periode analysee: 2024-01-01 a 2024-05-23
Nombre de mesures: 697,948
Nombre de villes: 10
Nombre de stations: 47

Depassements du seuil d'information: 86,422
Depassements du seuil d'alerte: 12,001
