In [17]:
import numpy as np
import pandas as pd
from dateutil import parser
import pyarrow.dataset as ds

In [18]:
# Charger `weather_raw.csv` avec Pandas.
weather = pd.read_csv("data/weather_raw.csv")
weather.info()
weather.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42172 entries, 0 to 42171
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   city               42172 non-null  object 
 1   timestamp          42172 non-null  object 
 2   temperature_c      41724 non-null  object 
 3   humidity_pct       41760 non-null  float64
 4   wind_speed_kmh     42172 non-null  float64
 5   precipitation_mm   42172 non-null  float64
 6   weather_condition  41310 non-null  object 
dtypes: float64(3), object(4)
memory usage: 2.3+ MB


Unnamed: 0,city,timestamp,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition
0,Grenoble,2024-03-12 09:00:00,14.8,74.6,24.8,2.4,pluvieux
1,Strasbourg,05/04/2024 21:00,10.8,78.4,5.7,0.0,neigeux
2,Marseille,28/01/2024 23:00,13.3,60.4,45.7,3.5,pluvieux
3,Bordeaux,29/01/2024 12:00,1.3,84.1,43.3,0.0,brumeux
4,Marseille,2024-01-15 00:00:00,13.3,87.4,21.7,6.0,orageux


In [19]:
weather["temperature_c"] = weather["temperature_c"].str.replace(",", ".")
weather["temperature_c"] = weather["temperature_c"].astype(float)

# Standardiser les formats de dates.
weather["timestamp"] = weather["timestamp"].apply(lambda ts_str: parser.parse(ts_str))

weather.info()
weather.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42172 entries, 0 to 42171
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   city               42172 non-null  object        
 1   timestamp          42172 non-null  datetime64[ns]
 2   temperature_c      41724 non-null  float64       
 3   humidity_pct       41760 non-null  float64       
 4   wind_speed_kmh     42172 non-null  float64       
 5   precipitation_mm   42172 non-null  float64       
 6   weather_condition  41310 non-null  object        
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 2.3+ MB


Unnamed: 0,city,timestamp,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition
0,Grenoble,2024-03-12 09:00:00,14.8,74.6,24.8,2.4,pluvieux
1,Strasbourg,2024-05-04 21:00:00,10.8,78.4,5.7,0.0,neigeux
2,Marseille,2024-01-28 23:00:00,13.3,60.4,45.7,3.5,pluvieux
3,Bordeaux,2024-01-29 12:00:00,1.3,84.1,43.3,0.0,brumeux
4,Marseille,2024-01-15 00:00:00,13.3,87.4,21.7,6.0,orageux


In [20]:
# Identifier et traiter les valeurs manquantes :
weather.isna().sum()

city                   0
timestamp              0
temperature_c        448
humidity_pct         412
wind_speed_kmh         0
precipitation_mm       0
weather_condition    862
dtype: int64

In [21]:
#  - Interpolation linéaire pour température et humidité.
weather_cleaned = weather.copy()

weather_cleaned = weather_cleaned.sort_values(["city", "timestamp"])
weather_cleaned["temperature_c"] = weather_cleaned["temperature_c"].interpolate(method="linear")
weather_cleaned["humidity_pct"] = weather_cleaned["humidity_pct"].interpolate(method="linear")

#  - Forward fill pour les conditions météo.
weather_cleaned["weather_condition"] = weather_cleaned["weather_condition"].ffill()
weather_cleaned.isna().sum()

city                 0
timestamp            0
temperature_c        0
humidity_pct         0
wind_speed_kmh       0
precipitation_mm     0
weather_condition    0
dtype: int64

In [22]:
# Corriger les valeurs aberrantes :
#  - Températures hors [-40, 50] : remplacer par NaN puis interpoler.
weather_cleaned[(weather_cleaned["temperature_c"] < -40) | (weather_cleaned["temperature_c"] > 50)] = np.nan
weather_cleaned["temperature_c"] = weather_cleaned["temperature_c"].interpolate(method="linear")

#  - Humidité hors [0, 100] : clipper.
weather_cleaned["humidity_pct"] = weather_cleaned["humidity_pct"].clip(lower=0, upper=100)
weather_cleaned.head()

Unnamed: 0,city,timestamp,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition
6149,Bordeaux,2024-01-01 00:00:00,11.1,64.7,1.0,4.1,pluvieux
3877,Bordeaux,2024-01-01 01:00:00,8.75,68.3,44.4,0.0,ensoleille
5918,Bordeaux,2024-01-01 02:00:00,6.4,80.2,28.5,0.0,brumeux
22424,Bordeaux,2024-01-01 03:00:00,6.6,87.0,43.7,0.0,brumeux
16306,Bordeaux,2024-01-01 04:00:00,1.7,45.9,21.5,0.0,pluvieux


In [23]:
# Fusionner avec les données de pollution (jointure sur ville et heure arrondie).

# air_quality = pd.read_csv("C:/Users/Administrateur/Documents/M2i_CDSD_TDTP/spark/TP_Analyse_Qualite_Air/data/air_quality_clean.csv")
dataset = ds.dataset(
    "output/air_quality_clean",
    format="parquet",
)
air_quality = dataset.to_table().to_pandas()
# df_polluted = pd.read_parquet("output/air_quality_clean", engine="pyarrow")

stations = pd.read_csv("data/stations.csv")
air_quality_stations = air_quality.merge(stations, on="station_id", how="left")
air_quality_stations_weather = air_quality_stations.merge(weather_cleaned, on=["city", "timestamp"], how="left")
air_quality_stations_weather.to_csv("output/pollution_meteo_clean.csv", header=True, index=False)

In [24]:
# Rapport de nettoyage (avant/après par colonne)
print("\nRAPPORT")
for column in weather.columns:
    count_weather = weather[column].count()
    count_weather_cleaned = weather_cleaned[column].count()
    delta = count_weather - count_weather_cleaned
    print(f" Nombre de lignes dans la colonne {column} du dataset weather_raw : ", count_weather)
    if delta > 0:
        print(f" Nombre de lignes supprimées dans la colonne {column} : ", delta)
    else:
        print(f" Nombre de lignes ajoutées dans la colonne {column} : ", -delta)
    print(f" Nombre de lignes dans la colonne {column} du dataset weather_clean : ", count_weather_cleaned)
    print("")


RAPPORT
 Nombre de lignes dans la colonne city du dataset weather_raw :  42172
 Nombre de lignes supprimées dans la colonne city :  458
 Nombre de lignes dans la colonne city du dataset weather_clean :  41714

 Nombre de lignes dans la colonne timestamp du dataset weather_raw :  42172
 Nombre de lignes supprimées dans la colonne timestamp :  458
 Nombre de lignes dans la colonne timestamp du dataset weather_clean :  41714

 Nombre de lignes dans la colonne temperature_c du dataset weather_raw :  41724
 Nombre de lignes ajoutées dans la colonne temperature_c :  448
 Nombre de lignes dans la colonne temperature_c du dataset weather_clean :  42172

 Nombre de lignes dans la colonne humidity_pct du dataset weather_raw :  41760
 Nombre de lignes supprimées dans la colonne humidity_pct :  46
 Nombre de lignes dans la colonne humidity_pct du dataset weather_clean :  41714

 Nombre de lignes dans la colonne wind_speed_kmh du dataset weather_raw :  42172
 Nombre de lignes supprimées dans la co