In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
# Step 1: Load dataset
# Location of the cleaned weather export, relative to this notebook's folder.
WEATHER_CSV = "../../data/cleaned_weather.csv"

# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, which
# looks like a row index written out by an earlier to_csv call; it is not used
# by the analysis below. Confirm, then consider index_col=0.
df = pd.read_csv(WEATHER_CSV, encoding='utf-8')


In [3]:
# Preview the first five rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,location_id,weather_code (wmo code),temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),apparent_temperature_mean (°C),daylight_duration (s),sunshine_duration (s),...,et0_fao_evapotranspiration (mm),latitude,longitude,elevation,year,month,day,sunrise_hour,sunset_hour,daylight_hours
0,0,1,30.1,22.6,26.0,34.5,25.0,29.0,42220.2,38905.73,...,4.61,6.924429,79.90725,4,2010,1,1,6,18,12
1,0,51,30.1,23.7,26.3,33.9,26.1,29.7,42225.71,37451.01,...,3.91,6.924429,79.90725,4,2010,2,1,6,18,12
2,0,51,29.6,23.1,26.0,34.5,26.2,29.9,42231.68,33176.43,...,3.66,6.924429,79.90725,4,2010,3,1,6,18,12
3,0,2,28.9,23.1,25.7,31.7,26.1,28.4,42238.11,38289.2,...,3.75,6.924429,79.90725,4,2010,4,1,6,18,12
4,0,1,28.1,21.3,24.6,30.0,22.9,26.2,42244.99,39113.82,...,5.0,6.924429,79.90725,4,2010,5,1,6,18,12


In [4]:
# Step 2: Clean and rename columns
# Short aliases for the verbose, unit-suffixed source headers used downstream.
COLUMN_ALIASES = {
    'temperature_2m_mean (°C)': 'temperature',
    'rain_sum (mm)': 'rainfall',
    'wind_speed_10m_max (km/h)': 'wind_speed',
    'precipitation_sum (mm)': 'precipitation',
}

# Trim stray whitespace from every header first so the alias keys can match,
# then apply the aliases; keys absent from the frame are ignored by rename().
df = df.rename(columns=str.strip).rename(columns=COLUMN_ALIASES)

In [5]:
# Step 3: Create datetime column
# Assemble one timestamp per row from the year/month/day component columns.
# With errors='coerce' an impossible combination (e.g. month 13) becomes NaT
# instead of raising, so count and report those rows rather than letting them
# vanish silently from the time-indexed plots further down.
# NOTE(review): in the df.head() preview, `month` runs 1..5 while `day` stays
# at 1 and daylight_duration changes by only a few seconds per row, which
# suggests the month/day columns may be swapped upstream -- confirm against the
# raw data before trusting these dates.
df['date'] = pd.to_datetime(df[['year', 'month', 'day']], errors='coerce')

n_invalid_dates = int(df['date'].isna().sum())
if n_invalid_dates:
    print(
        f"Warning: {n_invalid_dates} of {len(df)} rows have an invalid "
        "year/month/day combination; their 'date' was set to NaT."
    )

In [7]:
# Step 4: Define variables and create folders for plots
# Variables the analysis would like to cover; only those actually present in
# the frame are kept, and anything skipped is reported so a failed rename or a
# changed source header does not go unnoticed.
requested_variables = ['temperature', 'rainfall', 'wind_speed', 'precipitation']
variables = [v for v in requested_variables if v in df.columns]
missing_variables = [v for v in requested_variables if v not in df.columns]
if missing_variables:
    print(f"Warning: columns not found in the dataset and skipped: {missing_variables}")

# One output folder per plot family; exist_ok makes re-running the cell safe.
folders = ['eda_plots/time_series', 'eda_plots/correlation', 'eda_plots/seasonality_trends', 'eda_plots/outliers']
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# Distinct location ids to iterate over. A NaN id never compares equal to
# itself, so it could only ever select an empty subset -- drop it up front.
locations = df['location_id'].dropna().unique()

# Collects one summary-statistics frame per successfully processed location.
summary_list = []

In [8]:
print("\nPerforming EDA per location:")

# Start from an empty list so that re-running this cell does not append a
# second copy of every location's statistics to results from an earlier run.
summary_list = []

for loc in tqdm(locations, desc="Processing locations"):
    try:
        # Rows for this location only, ordered by date and indexed by it so the
        # series plot against time. Each step returns a new frame, leaving
        # `df` untouched.
        city_df = (
            df[df['location_id'] == loc]
            .sort_values('date')
            .set_index('date')
        )

        # Time series plots for each variable
        for var in variables:
            plt.figure(figsize=(12, 4))
            plt.plot(city_df[var])
            plt.title(f'{var.capitalize()} over time - Location {loc}')
            plt.xlabel("Date")
            plt.ylabel(var.capitalize())
            plt.tight_layout()
            plt.savefig(f"eda_plots/time_series/{loc}_{var}.png")
            plt.close()

        # Correlation heatmap (colour scale pinned to the full [-1, 1] range so
        # heatmaps are comparable across locations)
        corr = city_df[variables].corr()
        plt.figure(figsize=(6, 5))
        sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title(f'Correlation - Location {loc}')
        plt.tight_layout()
        plt.savefig(f"eda_plots/correlation/{loc}_correlation.png")
        plt.close()

        # Rolling mean and std for temperature. Skipped when the column is
        # absent so the outlier plot and summary below are still produced.
        # NOTE(review): window=30 counts rows, which equals 30 days only if the
        # series is daily with no gaps -- confirm for this dataset.
        if 'temperature' in city_df.columns:
            rolling_mean = city_df['temperature'].rolling(window=30).mean()
            rolling_std = city_df['temperature'].rolling(window=30).std()
            plt.figure(figsize=(12, 5))
            plt.plot(city_df['temperature'], label='Temperature')
            plt.plot(rolling_mean, label='30-day Mean')
            plt.plot(rolling_std, label='30-day Std')
            plt.legend()
            plt.title(f'Temperature Seasonality - Location {loc}')
            plt.tight_layout()
            plt.savefig(f"eda_plots/seasonality_trends/{loc}_temperature_trend.png")
            plt.close()

        # Outlier detection boxplot
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=city_df[variables])
        plt.title(f'Outlier Boxplot - Location {loc}')
        plt.tight_layout()
        plt.savefig(f"eda_plots/outliers/{loc}_outliers_boxplot.png")
        plt.close()

        # Summary statistics: describe() transposed to one row per variable,
        # plus the per-variable null count and the owning location id.
        stats = city_df[variables].describe().T
        stats['missing_values'] = city_df[variables].isnull().sum()
        stats['location_id'] = loc
        summary_list.append(stats.reset_index())

    except Exception as e:
        # Best effort: report the failure and carry on with the next location.
        print(f"Error processing location {loc}: {e}")
    finally:
        # If a step failed between plt.figure() and plt.close(), the figure is
        # still registered with pyplot; close whatever is left so failures do
        # not accumulate open figures across iterations.
        plt.close('all')


Performing EDA per location:


Processing locations: 100%|██████████| 27/27 [00:46<00:00,  1.71s/it]


In [9]:
# Save summary statistics to CSV
if not summary_list:
    # Nothing was collected by the per-location loop, so there is nothing to write.
    print("\nNo valid data processed. Nothing saved.")
else:
    # Stack the per-location frames into one table and write it out without
    # the positional row index.
    summary_csv_path = "eda_plots/summary_statistics_all_locations.csv"
    all_summary_df = pd.concat(summary_list, ignore_index=True)
    all_summary_df.to_csv(summary_csv_path, index=False)
    print("\nCity-wise EDA complete. All plots saved in 'eda_plots/' folder.")
    print("Summary statistics saved to 'eda_plots/summary_statistics_all_locations.csv'")


City-wise EDA complete. All plots saved in 'eda_plots/' folder.
Summary statistics saved to 'eda_plots/summary_statistics_all_locations.csv'
