In [1]:
import os
from datetime import datetime, timezone

import pandas as pd
import requests

# SECURITY: the API key should come from the OPENWEATHER_API_KEY environment
# variable. The literal below is kept only as a backward-compatible fallback;
# because it is committed to source it should be treated as leaked and rotated.
API_KEY = os.environ.get('OPENWEATHER_API_KEY') or '3aec9b159c2eb5f02e5b8311dd5ab4c9'

# HTTPS so that the API key in the query string is not sent in clear text.
BASE_URL = 'https://api.openweathermap.org/data/2.5/air_pollution'

# Latitude/longitude (decimal degrees) of each city to query.
city_coords = {
    'Los Angeles': {'lat': 34.0522, 'lon': -118.2437},
    'Paris': {'lat': 48.8566, 'lon': 2.3522},
    'Tokyo': {'lat': 35.6895, 'lon': 139.6917},
    'Antananarivo': {'lat': -18.8792, 'lon': 47.5079},
    'Nairobi': {'lat': -1.286389, 'lon': 36.817223},
    'Lima': {'lat': -12.0464, 'lon': -77.0428},
}

def fetch_pollution_data(lat, lon, api_key, timeout=10):
    """Fetch the first air-pollution record returned for a coordinate pair.

    Sends a GET request to ``BASE_URL`` with ``lat``, ``lon`` and ``appid``
    query parameters and returns the first element of the ``list`` field of
    the JSON response.

    Args:
        lat: Latitude passed as the ``lat`` query parameter.
        lon: Longitude passed as the ``lon`` query parameter.
        api_key: API key passed as the ``appid`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The first entry of the response's ``list`` field, or ``None`` when
        the request fails, the body is not valid JSON, or the payload has no
        non-empty ``list``.
    """
    query = {'lat': lat, 'lon': lon, 'appid': api_key}
    try:
        # Let requests build and encode the query string.
        response = requests.get(BASE_URL, params=query, timeout=timeout)
        response.raise_for_status()  # Raise on HTTP error statuses
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers invalid JSON bodies on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"Erreur lors de la requête pour lat: {lat}, lon: {lon} - {e}")
        return None

    # Guard against payloads that are not a mapping or whose 'list' is
    # missing, null or empty.
    records = data.get('list') if isinstance(data, dict) else None
    return records[0] if records else None

def normalize(value, min_value, max_value):
    """Min-max scale ``value`` into the range defined by the two bounds.

    Args:
        value: Number (or array-like supporting arithmetic) to scale.
        min_value: Lower bound, mapped to 0.
        max_value: Upper bound, mapped to 1.

    Returns:
        ``(value - min_value) / (max_value - min_value)``. When both bounds
        are equal the range is degenerate and ``0.0`` is returned instead of
        dividing by zero.
    """
    span = max_value - min_value
    if span == 0:
        # Constant data: every value sits at the (only) bound.
        return 0.0
    return (value - min_value) / span

pollution_data = []

# Placeholder for any field absent from an API record. Using NaN (rather than
# raising KeyError) lets one incomplete record be discarded by the later
# cleaning steps instead of aborting the whole run.
_MISSING = float('nan')

# Query every configured city and keep one flat row per successful response.
for city, coords in city_coords.items():
    data = fetch_pollution_data(coords['lat'], coords['lon'], API_KEY)
    if not data:
        # Request failed or returned no record; skip this city.
        continue

    components = data.get('components') or {}
    main = data.get('main') or {}
    pollution_data.append({
        'City': city,
        # Timezone-aware UTC timestamp of when the record was fetched
        # (datetime.utcnow() is deprecated and returns a naive value).
        'Datetime': datetime.now(timezone.utc),
        'AQI': main.get('aqi', _MISSING),
        'PM2.5': components.get('pm2_5', _MISSING),
        'PM10': components.get('pm10', _MISSING),
        'O3': components.get('o3', _MISSING),
        'NO2': components.get('no2', _MISSING),
        'SO2': components.get('so2', _MISSING),
        'CO': components.get('co', _MISSING),
    })

# Convert the collected rows to a DataFrame. The columns are given explicitly
# so the frame has the expected schema even when no city returned data
# (otherwise the column lookups below would raise KeyError).
_pollutant_columns = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
df_pollution = pd.DataFrame(
    pollution_data,
    columns=['City', 'Datetime', 'AQI'] + _pollutant_columns,
)
# Force a numeric dtype on the pollutant columns (an empty frame would
# otherwise carry object-typed columns).
df_pollution[_pollutant_columns] = df_pollution[_pollutant_columns].astype(float)

# Data cleaning
# 1. Consistency check: keep only rows where every pollutant value is
#    non-negative (rows with NaN in a pollutant column fail this test too).
_non_negative = (df_pollution[_pollutant_columns] >= 0).all(axis=1)

# 2. Drop rows with missing values, then
# 3. drop duplicate rows.
# Chained non-inplace calls avoid mutating a filtered view in place.
df_pollution = df_pollution[_non_negative].dropna().drop_duplicates()

# 4. Vérification des valeurs aberrantes
def is_outlier(series):
    """Flag values lying outside the 1.5 * IQR fences of ``series``.

    Args:
        series: pandas Series of numeric values.

    Returns:
        Boolean Series, True where a value is strictly below
        ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.
    """
    first_quartile, third_quartile = (series.quantile(q) for q in (0.25, 0.75))
    margin = 1.5 * (third_quartile - first_quartile)
    too_low = series < first_quartile - margin
    too_high = series > third_quartile + margin
    return too_low | too_high

_components = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']

# Drop every row that is an outlier in at least one pollutant column.
# .copy() gives an independent frame so the column assignments below do not
# write into a filtered view of the previous DataFrame.
outliers = df_pollution[_components].apply(is_outlier)
df_pollution = df_pollution[~outliers.any(axis=1)].copy()

# Min-max normalisation of each pollutant column
for component in _components:
    column = df_pollution[component]
    # Compute the bounds once per column instead of once per element.
    lowest, highest = column.min(), column.max()
    if highest == lowest:
        # Constant (or single-row) column: the range is zero, so map every
        # value to 0.0 rather than producing 0/0 = NaN.
        df_pollution[component] = 0.0
    else:
        df_pollution[component] = normalize(column, lowest, highest)

# Save the cleaned and transformed data
df_pollution.to_csv('pollution_data_cleaned.csv', index=False)

print("Données nettoyées, transformées et enregistrées dans pollution_data_cleaned.csv")


Données nettoyées, transformées et enregistrées dans pollution_data_cleaned.csv
