# Code permettant de formater les données des stations d'AQICN

In [1]:
import pandas as pd
from datetime import datetime, timedelta

In [29]:
# Load the three yearly Paris exports (2023, 2022, 2021 — in that order).
df_paris_1, df_paris_2, df_paris_3 = map(
    pd.read_csv,
    ("station_paris_2023.csv", "station_paris_2022.csv", "station_paris_2021.csv"),
)

# Stack the three years on a fresh 0..n-1 index, discard the 'Unnamed: 0'
# column carried by the CSV files, and label every row with the station name.
df_paris = (
    pd.concat([df_paris_1, df_paris_2, df_paris_3], ignore_index=True, sort=False)
    .drop(columns=['Unnamed: 0'])
    .assign(Station='Île-de-France')
)

# Second name bound to the same frame; later cells work on this name.
df_ile_de_france = df_paris

In [31]:
# Parse 'Date' as datetimes so it can be joined against the daily calendar below.
df_ile_de_france['Date'] = pd.to_datetime(df_ile_de_france['Date'])

# Build a daily calendar covering 2021-01-01 through 2023-12-31.
date_debut = datetime(2021, 1, 1)
date_fin = datetime(2023, 12, 31)
plage_dates = pd.date_range(date_debut, date_fin)

# One-column frame holding every calendar day.
df_dates = pd.DataFrame({'Date': plage_dates})

# Left-join the measurements onto the calendar: days without a measurement
# are kept, with missing values in the measurement columns.
df_ile_de_france_final = pd.merge(df_dates, df_ile_de_france, on='Date', how='left')

# The left join also leaves 'Station' missing on days that had no measurement.
# Re-apply the label so every exported row carries its station name, including
# the rows whose 'PM 2.5' is imputed below.
df_ile_de_france_final['Station'] = 'Île-de-France'

# Remove rows that are exact duplicates of another row.
df_ile_de_france_final = df_ile_de_france_final.drop_duplicates()

# Fill missing 'PM 2.5' values with the mean of the available values.
moyenne_pm25 = df_ile_de_france_final['PM 2.5'].mean()
df_ile_de_france_final['PM 2.5'] = df_ile_de_france_final['PM 2.5'].fillna(moyenne_pm25)

print(df_ile_de_france_final)
df_ile_de_france_final.to_csv("ile_de_france_pm2.5.csv")

           Date        Station  PM 2.5
0    2021-01-01  Île-de-France    61.0
1    2021-01-02  Île-de-France    94.0
2    2021-01-03  Île-de-France   125.0
3    2021-01-04  Île-de-France   109.0
4    2021-01-05  Île-de-France    76.0
...         ...            ...     ...
1347 2023-12-27  Île-de-France    41.0
1348 2023-12-28  Île-de-France    45.0
1349 2023-12-29  Île-de-France    41.0
1350 2023-12-30  Île-de-France    34.0
1351 2023-12-31  Île-de-France    34.0

[1095 rows x 3 columns]


In [33]:
# Load the raw Brest export and display it for inspection.
df_brest = pd.read_csv(filepath_or_buffer="station_brest_2020-2023.csv")
df_brest

Unnamed: 0,date,min,max,median,q1,q3,stdev,count
0,2020-04-18T00:00:00.000Z,2.08,35.78,9.51,5.53,10.92,5.043,134
1,2020-04-19T00:00:00.000Z,0.84,257.02,10.50,2.76,14.06,17.660,534
2,2020-04-20T00:00:00.000Z,5.20,156.57,60.94,42.72,90.60,37.818,495
3,2020-04-21T00:00:00.000Z,1.81,173.72,7.48,2.90,134.04,64.415,348
4,2020-04-22T00:00:00.000Z,2.74,47.80,6.19,3.58,12.16,5.697,388
...,...,...,...,...,...,...,...,...
1107,2023-12-26T00:00:00.000Z,0.80,38.90,5.29,3.75,7.50,4.143,486
1108,2023-12-27T00:00:00.000Z,2.58,48.60,6.43,5.53,7.10,2.787,578
1109,2023-12-28T00:00:00.000Z,4.17,18.73,6.47,5.74,7.30,1.564,576
1110,2023-12-29T00:00:00.000Z,2.90,39.53,5.70,5.03,6.49,2.986,561


In [None]:
# Parse the timestamps, then keep only the calendar day as 'YYYY-MM-DD' text.
df_brest['date'] = pd.to_datetime(df_brest['date'])
df_brest['date'] = df_brest['date'].dt.strftime('%Y-%m-%d')

# Keep only 'date' and 'median': drop the other per-day statistics.
df_brest = df_brest.drop(['min', 'max', 'q1', 'q3', 'stdev', 'count'], axis=1)
# 'Jour' is not a column of the freshly loaded file, so requiring it made this
# cell fail with a KeyError on a clean run. Drop it only if it happens to exist.
df_brest = df_brest.drop(['Jour'], axis=1, errors='ignore')

In [39]:
# Label every row with the station name 'Bretagne', then display the frame.
df_brest = df_brest.assign(Station='Bretagne')
df_brest

Unnamed: 0,date,median,Station
0,2020-04-18,9.51,Bretagne
1,2020-04-19,10.50,Bretagne
2,2020-04-20,60.94,Bretagne
3,2020-04-21,7.48,Bretagne
4,2020-04-22,6.19,Bretagne
...,...,...,...
1107,2023-12-26,5.29,Bretagne
1108,2023-12-27,6.43,Bretagne
1109,2023-12-28,6.47,Bretagne
1110,2023-12-29,5.70,Bretagne


In [40]:
# Convert 'date' back to datetimes so it can be compared with the bounds below.
df_brest['date'] = pd.to_datetime(df_brest['date'])

# Keep only the rows dated within [start_date, end_date], both bounds included.
start_date = '2021-01-01'
end_date = '2023-12-31'
in_window = df_brest['date'].between(start_date, end_date)
df_brest = df_brest[in_window]
print(df_brest)

           date  median   Station
253  2021-01-01    3.97  Bretagne
254  2021-01-02    2.50  Bretagne
255  2021-01-03    2.10  Bretagne
256  2021-01-04    8.07  Bretagne
257  2021-01-05    7.18  Bretagne
...         ...     ...       ...
1107 2023-12-26    5.29  Bretagne
1108 2023-12-27    6.43  Bretagne
1109 2023-12-28    6.47  Bretagne
1110 2023-12-29    5.70  Bretagne
1111 2023-12-30    4.03  Bretagne

[859 rows x 3 columns]


In [50]:
# Daily calendar covering 2021-01-01 through 2023-12-31.
date_debut = datetime(2021, 1, 1)
date_fin = datetime(2023, 12, 31)
plage_dates = pd.date_range(start=date_debut, end=date_fin)
df_dates = pd.DataFrame(data={'date': plage_dates})

# Left-join the measurements onto the calendar (days without a measurement
# are kept with missing values), then remove exact duplicate rows.
df_brest_final = df_dates.merge(df_brest, how='left', on='date').drop_duplicates()

# Fill missing 'median' values with the mean of the available ones, label
# every row with the station name, and rename the value column for export.
moyenne_pm25 = df_brest_final['median'].mean()
df_brest_final = (
    df_brest_final
    .assign(**{
        'median': df_brest_final['median'].fillna(moyenne_pm25),
        'Station': 'Bretagne',
    })
    .rename(columns={"median": "PM 2.5"})
)
df_brest_final.to_csv("bretagne_pm2.5.csv")

In [18]:
df_nantes = pd.read_csv("station_nantes_2020_2023.csv")

# Keep only 'date' and 'median': drop the other per-day statistics.
df_nantes = df_nantes.drop(columns=['min', 'max', 'q1', 'q3', 'stdev', 'count'])

# Round-trip through 'YYYY-MM-DD' text so that only the calendar day of each
# timestamp remains, then parse the result back into datetimes.
jours_texte = pd.to_datetime(df_nantes['date']).dt.strftime('%Y-%m-%d')
df_nantes['date'] = pd.to_datetime(jours_texte)

# Keep only the rows dated within [start_date, end_date], both bounds included.
start_date = '2021-01-01'
end_date = '2023-12-31'
df_nantes = df_nantes[df_nantes['date'].between(start_date, end_date)]

# Daily calendar covering 2021-01-01 through 2023-12-31.
date_debut = datetime(2021, 1, 1)
date_fin = datetime(2023, 12, 31)
plage_dates = pd.date_range(start=date_debut, end=date_fin)
df_dates = pd.DataFrame(data={'date': plage_dates})

# Left-join the measurements onto the calendar, then remove exact duplicate rows.
df_nantes_final = df_dates.merge(df_nantes, how='left', on='date').drop_duplicates()

# Fill missing 'median' values with the mean of the available ones, label
# every row with the station name, and rename the value column for export.
moyenne_pm25 = df_nantes_final['median'].mean()
df_nantes_final = (
    df_nantes_final
    .assign(**{
        'median': df_nantes_final['median'].fillna(moyenne_pm25),
        'Station': "Pays de la Loire",
    })
    .rename(columns={"median": "PM 2.5"})
)
df_nantes_final.to_csv("pays_de_la_loire_pm2.5.csv")
df_nantes_final

Unnamed: 0,date,PM 2.5,Station
0,2021-01-01,15.63,Pays de la Loire
1,2021-01-02,15.97,Pays de la Loire
2,2021-01-03,15.93,Pays de la Loire
3,2021-01-04,22.39,Pays de la Loire
4,2021-01-05,16.37,Pays de la Loire
...,...,...,...
1090,2023-12-27,4.72,Pays de la Loire
1091,2023-12-28,2.57,Pays de la Loire
1092,2023-12-29,2.40,Pays de la Loire
1093,2023-12-30,1.60,Pays de la Loire


In [19]:
# Load the raw Rouen export and display it for inspection.
df_rouen = pd.read_csv(filepath_or_buffer="station_rouen_2020_2023.csv")
df_rouen

Unnamed: 0,date,pm25,pm10,o3,no2,so2
0,2024/1/1,25,4,30,3,
1,2024/1/2,13,10,29,4,
2,2024/1/3,29,13,33,7,
3,2024/1/4,35,9,28,10,
4,2024/1/5,25,9,20,8,
...,...,...,...,...,...,...
3319,2015/11/4,,,9,17,1
3320,2015/7/3,,,48,9,1
3321,2022/11/4,,,,10,
3322,2022/11/5,,,,5,


In [20]:
# Reload the raw file so this cell does not depend on earlier edits to df_rouen.
df_rouen = pd.read_csv("station_rouen_2020_2023.csv")
df_rouen['date'] = pd.to_datetime(df_rouen['date'])

# Keep only the rows dated within [start_date, end_date] (both bounds included)
# and discard the other pollutant columns. The column names used here start
# with a space.
start_date = '2021-01-01'
end_date = '2023-12-31'
df_rouen = (
    df_rouen[df_rouen['date'].between(start_date, end_date)]
    .drop(columns=[' pm10', ' o3', ' no2', ' so2'])
)

# Daily calendar covering 2021-01-01 through 2023-12-31.
date_debut = datetime(2021, 1, 1)
date_fin = datetime(2023, 12, 31)
plage_dates = pd.date_range(start=date_debut, end=date_fin)
df_dates = pd.DataFrame(data={'date': plage_dates})

# Left-join the measurements onto the calendar, then remove exact duplicate rows.
df_rouen_final = df_dates.merge(df_rouen, how='left', on='date').drop_duplicates()

# Convert ' pm25' to numbers (entries that cannot be parsed become missing),
# then fill every missing value with the mean of the available ones.
pm25_numeric = pd.to_numeric(df_rouen_final[' pm25'], errors='coerce')
moyenne_pm25 = pm25_numeric.mean()
df_rouen_final[' pm25'] = pm25_numeric.fillna(moyenne_pm25)

# Label every row with the station name and rename the value column for export.
df_rouen_final['Station'] = "Normandie"
df_rouen_final = df_rouen_final.rename(columns={" pm25": "PM 2.5"})
df_rouen_final.to_csv("normandie_pm2.5.csv")
df_rouen_final

Unnamed: 0,date,PM 2.5,Station
0,2021-01-01,122.0,Normandie
1,2021-01-02,124.0,Normandie
2,2021-01-03,93.0,Normandie
3,2021-01-04,72.0,Normandie
4,2021-01-05,87.0,Normandie
...,...,...,...
1090,2023-12-27,33.0,Normandie
1091,2023-12-28,32.0,Normandie
1092,2023-12-29,29.0,Normandie
1093,2023-12-30,31.0,Normandie


### Il existe trois façons différentes de formater les données, car elles n'étaient pas toutes fournies sous la même forme sur le site AQICN.