In [None]:
import glob
import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Install a blanket 'ignore' filter so no warning is shown for the rest of the session.
# NOTE(review): this also hides warnings that could matter (e.g. deprecations) — confirm intended.
warnings.filterwarnings('ignore')

# Plot configuration for better-looking charts
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
class WeatherEDA:
    """Exploratory data analysis helper for per-city weather / air-quality records.

    On construction the object tries to load every CSV found in a data folder and,
    when none can be used, falls back to synthetic sample data. The resulting frame
    is exposed as ``self.cities_data``.
    """

    # Folder scanned for processed CSV files when no explicit path is given.
    DEFAULT_DATA_PATH = 'data/processed'
    # Seed of the synthetic-data generator, so a fresh kernel reproduces the same draws.
    DEFAULT_SEED = 42
    # Day of year at which the seasonal sine crosses zero on its way up (mid-April).
    # The curve then peaks ~91 days later (mid-July) and bottoms out mid-January,
    # which agrees with the summer/winter months used by get_season().
    SEASON_PHASE_DAY = 105

    # Cities covered by the synthetic data set.
    CITIES = (
        {"name": "Paris", "lat": 48.8566, "lon": 2.3522, "country": "FR"},
        {"name": "London", "lat": 51.5074, "lon": -0.1278, "country": "GB"},
        {"name": "New York", "lat": 40.7128, "lon": -74.0060, "country": "US"},
        {"name": "Tokyo", "lat": 35.6762, "lon": 139.6503, "country": "JP"},
        {"name": "Sydney", "lat": -33.8688, "lon": 151.2093, "country": "AU"},
        {"name": "Cairo", "lat": 30.0444, "lon": 31.2357, "country": "EG"},
        {"name": "Moscow", "lat": 55.7558, "lon": 37.6176, "country": "RU"},
        {"name": "Rio de Janeiro", "lat": -22.9068, "lon": -43.1729, "country": "BR"},
        {"name": "Mumbai", "lat": 19.0760, "lon": 72.8777, "country": "IN"},
        {"name": "Cape Town", "lat": -33.9249, "lon": 18.4241, "country": "ZA"},
    )

    # Per-city parameters of the generator: mean temperature, seasonal amplitude,
    # mean humidity and mean AQI.
    CLIMATE_PROFILES = {
        "Paris": {"temp_base": 12, "temp_var": 15, "humidity": 70, "aqi": 2.1},
        "London": {"temp_base": 11, "temp_var": 12, "humidity": 75, "aqi": 1.8},
        "New York": {"temp_base": 13, "temp_var": 20, "humidity": 65, "aqi": 2.3},
        "Tokyo": {"temp_base": 16, "temp_var": 18, "humidity": 68, "aqi": 2.0},
        "Sydney": {"temp_base": 18, "temp_var": 12, "humidity": 65, "aqi": 1.5},
        "Cairo": {"temp_base": 22, "temp_var": 15, "humidity": 45, "aqi": 3.2},
        "Moscow": {"temp_base": 6, "temp_var": 25, "humidity": 72, "aqi": 2.4},
        "Rio de Janeiro": {"temp_base": 24, "temp_var": 8, "humidity": 78, "aqi": 2.7},
        "Mumbai": {"temp_base": 27, "temp_var": 6, "humidity": 80, "aqi": 3.8},
        "Cape Town": {"temp_base": 17, "temp_var": 10, "humidity": 68, "aqi": 1.9},
    }

    # Weather labels and the probability of drawing each one (same order).
    WEATHER_CONDITIONS = ("Clear", "Clouds", "Rain", "Snow", "Mist", "Thunderstorm")
    WEATHER_WEIGHTS = (0.3, 0.3, 0.2, 0.05, 0.1, 0.05)

    # Columns summarised by basic_stats() when they exist in the data.
    NUMERIC_COLUMNS = ('temperature', 'humidity', 'pressure', 'wind_speed',
                       'aqi', 'pm2_5', 'comfort_score')

    # Season labels indexed by (month % 12) // 3, i.e. Dec-Feb, Mar-May, Jun-Aug, Sep-Nov,
    # for the northern hemisphere; the southern hemisphere is shifted by two slots.
    _SEASONS_NORTH = ("Hiver", "Printemps", "Été", "Automne")

    def __init__(self, data_path=DEFAULT_DATA_PATH, seed=DEFAULT_SEED):
        """Load processed CSV data, or fall back to synthetic sample data.

        Args:
            data_path: Folder searched for ``*.csv`` files.
            seed: Seed for the synthetic fallback data; ``None`` draws fresh entropy.
        """
        try:
            self.cities_data = self.load_processed_data(data_path)
        except (FileNotFoundError, ValueError):
            print("Aucun fichier trouvé, génération de données d'exemple...")
            self.cities_data = self.generate_sample_data(seed=seed)

    def load_processed_data(self, data_path=DEFAULT_DATA_PATH):
        """Read and concatenate every CSV file found directly inside ``data_path``.

        Files are read in sorted name order so the row order of the result does not
        depend on the file system. A file that cannot be read or parsed is reported
        and skipped.

        Args:
            data_path: Folder searched for ``*.csv`` files.

        Returns:
            A single DataFrame with a fresh 0..n-1 index.

        Raises:
            FileNotFoundError: No CSV file exists in ``data_path``.
            ValueError: CSV files exist but none could be loaded.
        """
        # glob.glob returns matches in arbitrary order; sort for a stable result.
        csv_files = sorted(glob.glob(os.path.join(data_path, '*.csv')))

        if not csv_files:
            raise FileNotFoundError(f"Aucun fichier CSV trouvé dans {data_path}")

        all_dataframes = []
        for file in csv_files:
            try:
                df = pd.read_csv(file)
            except (OSError, ValueError) as e:
                # pandas' ParserError / EmptyDataError and UnicodeDecodeError are all
                # ValueError subclasses; unreadable files raise OSError.
                print(f"Erreur lors du chargement de {file}: {e}")
                continue
            all_dataframes.append(df)
            print(f"Fichier chargé: {os.path.basename(file)} - {len(df)} lignes")

        if not all_dataframes:
            raise ValueError("Aucun fichier CSV valide trouvé")

        combined_df = pd.concat(all_dataframes, ignore_index=True)
        print(f"Total des données chargées: {len(combined_df)} lignes")

        return combined_df

    @classmethod
    def _seasonal_factor(cls, date, lat):
        """Return a value in [-1, 1]: +1 at the hemisphere's warmest time of year."""
        day_of_year = date.timetuple().tm_yday
        factor = np.sin(2 * np.pi * (day_of_year - cls.SEASON_PHASE_DAY) / 365.25)
        # Seasons are opposite south of the equator.
        return -factor if lat < 0 else factor

    def generate_sample_data(self, n_days=30, seed=DEFAULT_SEED, end_date=None):
        """Generate synthetic daily weather and air-quality records for CITIES.

        One row is produced per city per day for the ``n_days`` days that precede
        ``end_date`` (``end_date`` itself is not included).

        Args:
            n_days: Number of consecutive days to generate.
            seed: Seed of the local random generator; ``None`` draws fresh entropy.
            end_date: ``datetime`` closing the period; defaults to ``datetime.now()``.

        Returns:
            A DataFrame with ``n_days * len(CITIES)`` rows.
        """
        # A local generator keeps the draws reproducible without touching the
        # global numpy random state.
        rng = np.random.default_rng(seed)
        if end_date is None:
            end_date = datetime.now()
        base_date = end_date - timedelta(days=n_days)

        records = []
        for offset in range(n_days):
            date = base_date + timedelta(days=offset)
            for city in self.CITIES:
                profile = self.CLIMATE_PROFILES[city["name"]]
                seasonal_factor = self._seasonal_factor(date, city["lat"])

                # Weather variables
                temp = profile["temp_base"] + seasonal_factor * profile["temp_var"] + rng.normal(0, 3)
                humidity = max(20, min(95, profile["humidity"] + rng.normal(0, 10)))
                pressure = 1013 + rng.normal(0, 15)
                wind_speed = rng.gamma(2, 3)
                cloudiness = max(0, min(100, rng.normal(50, 30)))
                visibility = max(1, min(20, rng.normal(15, 5)))

                # AQI and pollution components
                aqi = max(1, min(5, profile["aqi"] + rng.normal(0, 0.5)))
                pm25 = max(5, rng.gamma(profile["aqi"], 15))
                pm10 = pm25 * rng.uniform(1.2, 2.0)

                weather_main = str(rng.choice(self.WEATHER_CONDITIONS, p=self.WEATHER_WEIGHTS))
                comfort_score = self.calculate_comfort_index(temp, humidity, wind_speed)

                records.append({
                    'city': city['name'],
                    'country': city['country'],
                    'lat': city['lat'],
                    'lon': city['lon'],
                    'date': date.strftime('%Y-%m-%d'),
                    'timestamp': date.isoformat(),
                    'temperature': round(temp, 1),
                    'feels_like': round(temp + rng.normal(0, 2), 1),
                    'humidity': round(humidity, 1),
                    'pressure': round(pressure, 1),
                    'wind_speed': round(wind_speed, 1),
                    'wind_direction': int(rng.integers(0, 360)),
                    'cloudiness': round(cloudiness, 1),
                    'visibility': round(visibility, 1),
                    'weather_main': weather_main,
                    'aqi': round(aqi, 1),
                    'pm2_5': round(pm25, 1),
                    'pm10': round(pm10, 1),
                    'co': round(rng.gamma(2, 200), 1),
                    'no2': round(rng.gamma(2, 20), 1),
                    'o3': round(rng.gamma(2, 50), 1),
                    'comfort_score': round(comfort_score, 2),
                    'season': self.get_season(date, city['lat']),
                })

        return pd.DataFrame(records)

    def calculate_comfort_index(self, temp, humidity, wind_speed):
        """Return the mean of a humidity-adjusted and a wind-adjusted temperature.

        The humidity term moves ``temp`` away from 20 in proportion to how far the
        humidity is from 60; the wind term subtracts half the wind speed.
        """
        heat_index = temp + 0.5 * (humidity - 60) / 100 * (temp - 20)
        wind_chill = temp - 0.5 * wind_speed
        return (heat_index + wind_chill) / 2

    def get_season(self, date, lat):
        """Return the season name for ``date`` at latitude ``lat``.

        Latitudes >= 0 use northern-hemisphere seasons; negative latitudes use the
        opposite ones. Returns one of "Hiver", "Printemps", "Été", "Automne".
        """
        quarter = (date.month % 12) // 3  # 0: Dec-Feb, 1: Mar-May, 2: Jun-Aug, 3: Sep-Nov
        if lat < 0:
            quarter = (quarter + 2) % 4
        return self._SEASONS_NORTH[quarter]

    def basic_stats(self):
        """Print an overview of ``self.cities_data`` and return its describe() table.

        Only the columns of NUMERIC_COLUMNS that actually exist in the data are
        summarised, so CSV files with a partial schema do not raise ``KeyError``.

        Returns:
            The describe() DataFrame of the available columns (empty if none exist).
        """
        data = self.cities_data
        print("=== STATISTIQUES DESCRIPTIVES GÉNÉRALES ===")
        if 'date' in data.columns:
            print(f"Période d'analyse: {data['date'].min()} à {data['date'].max()}")
        print(f"Nombre total d'observations: {len(data)}")
        if 'city' in data.columns:
            print(f"Nombre de villes: {data['city'].nunique()}")
        if 'country' in data.columns:
            print(f"Nombre de pays: {data['country'].nunique()}")
        print("\n--- Statistiques par variable ---")

        available = [col for col in self.NUMERIC_COLUMNS if col in data.columns]
        missing = [col for col in self.NUMERIC_COLUMNS if col not in data.columns]
        if missing:
            print(f"Colonnes absentes ignorées: {', '.join(missing)}")
        if not available:
            # describe() refuses a frame without columns; return an empty table instead.
            return pd.DataFrame()

        stats_df = data[available].describe()
        print(stats_df.round(2))

        return stats_df

    def run_complete_eda(self):
        """Run every analysis step.

        Returns:
            A dict with the describe() table under ``'basic_stats'`` and the
            underlying DataFrame under ``'raw_data'``.
        """
        print("🌍 ANALYSE EXPLORATOIRE DES DONNÉES MÉTÉOROLOGIQUES")
        print("=" * 60)

        basic_results = self.basic_stats()

        print("\n✅ Analyse exploratoire terminée!")

        return {
            'basic_stats': basic_results,
            'raw_data': self.cities_data
        }


In [None]:
# Usage: build the analysis object, run the full EDA, then export the data.
if __name__ == "__main__":
    # Create and run the EDA; `results` receives the dict returned by run_complete_eda()
    eda = WeatherEDA()
    results = eda.run_complete_eda()
    
    # Save the data (optional): whatever eda.cities_data holds is written to this CSV
    print("\n💾 Sauvegarde des données...")
    eda.cities_data.to_csv('weather_data_sample.csv', index=False)
    print("Données sauvegardées dans 'weather_data_sample.csv'")