In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
# Read the cleaned weather CSV; the 'date' column is parsed into datetimes
csv_path = 'cleaned_weather_data.csv'
df = pd.read_csv(csv_path, parse_dates=['date'])

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,date,location_id,temperature,rainfall,wind_speed,precipitation
0,2010-01-01,0,26.0,0.0,12.2,0.0
1,2010-01-02,0,26.4,15.7,11.6,15.7
2,2010-01-03,0,28.0,0.0,13.0,0.0
3,2010-01-04,0,27.5,7.4,13.6,7.4
4,2010-01-05,0,27.7,9.6,10.3,9.6


In [4]:
# Column names of the measurements examined by every EDA step below
variables = [
    'temperature',
    'rainfall',
    'wind_speed',
    'precipitation',
]

In [5]:
# One output directory per plot category; existing directories are left as-is
plot_dirs = [
    'eda_plots/time_series',
    'eda_plots/correlation',
    'eda_plots/seasonality_trends',
    'eda_plots/outliers',
]
for plot_dir in plot_dirs:
    os.makedirs(plot_dir, exist_ok=True)

In [6]:
# Accumulator for the per-location summary tables
summary_list = list()

# Distinct values of the 'location_id' column; one EDA pass is run per value
locations = df['location_id'].unique()

In [7]:
# Loop over each location using progress bar
#
# For every location this cell writes time-series plots, a correlation
# heatmap, a rolling temperature trend plot and a boxplot under 'eda_plots/',
# and appends one summary-statistics table to `summary_list`.

# Start from empty accumulators so that re-running this cell does not append
# duplicate rows on top of the results of an earlier run.
summary_list = []
failed_locations = []

for loc in tqdm(locations, desc="Performing EDA per location"):
    try:
        # tqdm.write prints without splitting the progress bar across lines
        tqdm.write(f"\nProcessing Location: {loc}")

        # Rows of this location indexed by date, in chronological order so the
        # line plots and the row-based rolling windows follow time.
        # (mergesort is stable: rows sharing a date keep their relative order.)
        city_df = (
            df[df['location_id'] == loc]
            .set_index('date')
            .sort_index(kind='mergesort')
        )

        # ---- 1. Time Series Plots ----
        for var in variables:
            fig, ax = plt.subplots(figsize=(12, 4))
            ax.plot(city_df[var])
            ax.set_title(f"{var.capitalize()} over time - Location: {loc}")
            ax.set_xlabel('Date')
            ax.set_ylabel(var.capitalize())
            ax.grid(True)
            fig.tight_layout()
            fig.savefig(f"eda_plots/time_series/{loc}_{var}.png")
            plt.close(fig)

        # ---- 2. Correlation Matrix ----
        fig, ax = plt.subplots(figsize=(8, 6))
        corr = city_df[variables].corr()
        sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
        ax.set_title(f'Correlation Matrix - Location: {loc}')
        fig.tight_layout()
        fig.savefig(f"eda_plots/correlation/{loc}_correlation.png")
        plt.close(fig)

        # ---- 3. Summary Statistics + Missing Value Count ----
        # describe().T has one row per variable; the null counts align on that index
        summary_stats = city_df[variables].describe().T
        summary_stats['missing_values'] = city_df[variables].isnull().sum()
        summary_stats['location'] = loc
        summary_list.append(summary_stats.reset_index())

        # Optional: Warn if missing values found
        if summary_stats['missing_values'].sum() > 0:
            tqdm.write(f"Missing values found for location {loc}")

        # ---- 4. Rolling Trend Analysis (Temperature) ----
        # NOTE(review): the window counts rows, so it equals 30 days only if
        # the series is daily without gaps - confirm against the data.
        window = 30
        rolling_mean = city_df['temperature'].rolling(window=window).mean()
        rolling_std = city_df['temperature'].rolling(window=window).std()

        fig, ax = plt.subplots(figsize=(12, 5))
        ax.plot(city_df['temperature'], label='Temperature')
        ax.plot(rolling_mean, label=f'{window}-day Rolling Mean')
        ax.plot(rolling_std, label=f'{window}-day Rolling Std')
        ax.set_title(f'Temperature Trend & Seasonality - Location: {loc}')
        ax.legend()
        fig.tight_layout()
        fig.savefig(f"eda_plots/seasonality_trends/{loc}_temperature_trend.png")
        plt.close(fig)

        # ---- 5. Outlier Detection (Boxplot) ----
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.boxplot(data=city_df[variables], ax=ax)
        ax.set_title(f'Outlier Detection (Boxplot) - Location: {loc}')
        fig.tight_layout()
        fig.savefig(f"eda_plots/outliers/{loc}_outliers_boxplot.png")
        plt.close(fig)

    except Exception as e:
        # Best effort: one failing location must not stop the others, but the
        # failure is recorded so it can be reported after the loop.
        failed_locations.append(loc)
        tqdm.write(f"Error processing location {loc}: {e}")
    finally:
        # An exception between creating and closing a figure would otherwise
        # leave that figure open for the rest of the session.
        plt.close('all')

if failed_locations:
    failed_text = ', '.join(str(failed) for failed in failed_locations)
    print(f"EDA failed for {len(failed_locations)} location(s): {failed_text}")

Performing EDA per location:   0%|          | 0/27 [00:00<?, ?it/s]


Processing Location: 0


Performing EDA per location:   4%|▎         | 1/27 [00:01<00:36,  1.39s/it]


Processing Location: 1


Performing EDA per location:   7%|▋         | 2/27 [00:02<00:33,  1.33s/it]


Processing Location: 2


Performing EDA per location:  11%|█         | 3/27 [00:03<00:31,  1.31s/it]


Processing Location: 3


Performing EDA per location:  15%|█▍        | 4/27 [00:05<00:31,  1.38s/it]


Processing Location: 4


Performing EDA per location:  19%|█▊        | 5/27 [00:06<00:30,  1.37s/it]


Processing Location: 5


Performing EDA per location:  22%|██▏       | 6/27 [00:08<00:30,  1.46s/it]


Processing Location: 6


Performing EDA per location:  26%|██▌       | 7/27 [00:10<00:30,  1.50s/it]


Processing Location: 7


Performing EDA per location:  30%|██▉       | 8/27 [00:11<00:27,  1.43s/it]


Processing Location: 8


Performing EDA per location:  33%|███▎      | 9/27 [00:12<00:25,  1.39s/it]


Processing Location: 9


Performing EDA per location:  37%|███▋      | 10/27 [00:14<00:24,  1.42s/it]


Processing Location: 10


Performing EDA per location:  41%|████      | 11/27 [00:15<00:22,  1.39s/it]


Processing Location: 11


Performing EDA per location:  44%|████▍     | 12/27 [00:16<00:20,  1.34s/it]


Processing Location: 12


Performing EDA per location:  48%|████▊     | 13/27 [00:18<00:19,  1.41s/it]


Processing Location: 13


Performing EDA per location:  52%|█████▏    | 14/27 [00:19<00:17,  1.36s/it]


Processing Location: 14


Performing EDA per location:  56%|█████▌    | 15/27 [00:20<00:16,  1.35s/it]


Processing Location: 15


Performing EDA per location:  59%|█████▉    | 16/27 [00:22<00:14,  1.33s/it]


Processing Location: 16


Performing EDA per location:  63%|██████▎   | 17/27 [00:23<00:13,  1.33s/it]


Processing Location: 17


Performing EDA per location:  67%|██████▋   | 18/27 [00:24<00:11,  1.31s/it]


Processing Location: 18


Performing EDA per location:  70%|███████   | 19/27 [00:26<00:10,  1.35s/it]


Processing Location: 19


Performing EDA per location:  74%|███████▍  | 20/27 [00:27<00:09,  1.37s/it]


Processing Location: 20


Performing EDA per location:  78%|███████▊  | 21/27 [00:28<00:08,  1.34s/it]


Processing Location: 21


Performing EDA per location:  81%|████████▏ | 22/27 [00:30<00:06,  1.32s/it]


Processing Location: 22


Performing EDA per location:  85%|████████▌ | 23/27 [00:31<00:05,  1.31s/it]


Processing Location: 23


Performing EDA per location:  89%|████████▉ | 24/27 [00:32<00:03,  1.31s/it]


Processing Location: 24


Performing EDA per location:  93%|█████████▎| 25/27 [00:34<00:02,  1.32s/it]


Processing Location: 25


Performing EDA per location:  96%|█████████▋| 26/27 [00:35<00:01,  1.33s/it]


Processing Location: 26


Performing EDA per location: 100%|██████████| 27/27 [00:37<00:00,  1.37s/it]


In [8]:
# ---- 6. Combine & Save Summary Stats for All Locations ----
# `summary_list` holds one summary table per successfully processed location.
# pd.concat raises ValueError on an empty list, which happens when every
# location failed inside the loop, so that case is handled explicitly.
summary_path = 'eda_plots/summary_statistics_all_locations.csv'

if summary_list:
    all_summary_df = pd.concat(summary_list, ignore_index=True)
    all_summary_df.to_csv(summary_path, index=False)

    print("\nEDA complete!")
    print("All plots saved inside the 'eda_plots' folder.")
    print(f"Summary statistics saved to: '{summary_path}'")
else:
    # Keep the name defined so later cells fail on content, not on a NameError
    all_summary_df = pd.DataFrame()
    print("\nNo location was processed successfully; summary statistics were not saved.")



EDA complete!
All plots saved inside the 'eda_plots' folder.
Summary statistics saved to: 'eda_plots/summary_statistics_all_locations.csv'
