# Weather Trend Forecasting – Exploratory Data Analysis

**PM Accelerator** | Data Science Assessment

---

This notebook performs comprehensive EDA on the Global Weather Repository dataset, including:
1. Data loading & inspection
2. Missing value analysis
3. Temperature & precipitation distributions
4. Time-series visualisations for major cities
5. Correlation heatmap
6. Anomaly detection (STL + Isolation Forest)
7. Spatial temperature map
8. Monthly climate comparison by continent
9. Air quality vs weather correlation

In [None]:
# Put the parent of the current working directory at the front of the module
# search path so the project-local imports further down this cell can resolve.
# NOTE(review): presumably '..' is the repository root when the notebook is run
# from its own folder -- confirm.
import sys, os
sys.path.insert(0, os.path.abspath('..'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide seaborn theme: white grid background, muted palette, fonts
# scaled up by 10 %.
sns.set_theme(style='whitegrid', palette='muted', font_scale=1.1)
# Render matplotlib figures inline in the cell output.
%matplotlib inline

from main import _country_to_continent
from src.anomalies import stl_anomaly_detection, isolation_forest_anomalies
from src.cleaning import run_cleaning, CLEAN_PATH

## 1 – Load Cleaned Data

In [None]:
# Read the cleaned dataset from CLEAN_PATH when that file already exists;
# otherwise fall back to run_cleaning() to obtain the frame.
df = pd.read_parquet(CLEAN_PATH) if CLEAN_PATH.exists() else run_cleaning()

print(f'Shape: {df.shape}')
df.head()

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments pandas reports the numeric columns.
df.describe()

## 2 – Missing Values

In [None]:
# Count nulls per column and keep only the columns that actually contain some,
# ordered from most to fewest missing values.
null_counts = df.isnull().sum()
missing = null_counts.loc[null_counts > 0].sort_values(ascending=False)

if missing.empty:
    print('No missing values!')
else:
    # One bar per affected column.
    fig, ax = plt.subplots(figsize=(10, 5))
    missing.plot.bar(ax=ax, color='salmon')
    ax.set(title='Missing Values per Column', ylabel='Count')
    fig.tight_layout()
    plt.show()

## 3 – Temperature & Precipitation Distributions

In [None]:
# One entry per panel, left to right: (column, bar colour, title, x-axis label).
hist_panels = [
    ('temperature_celsius', 'steelblue', 'Temperature (°C) Distribution', 'Temperature (°C)'),
    ('precip_mm', 'teal', 'Precipitation (mm) Distribution', 'Precipitation (mm)'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for panel_ax, (hist_col, hist_colour, hist_title, hist_xlabel) in zip(axes, hist_panels):
    # Nulls are dropped before binning; both panels use 60 bins.
    df[hist_col].dropna().hist(bins=60, ax=panel_ax, color=hist_colour, edgecolor='white')
    panel_ax.set_title(hist_title)
    panel_ax.set_xlabel(hist_xlabel)
fig.tight_layout()
plt.show()

## 4 – Time Series for Major Cities

In [None]:
# Cities to overlay on a single chart. Each name is compared for exact equality
# with the `location_name` column.
MAJOR_CITIES = ['London', 'New York', 'Tokyo', 'Sydney', 'Cairo']

fig, ax = plt.subplots(figsize=(14, 6))
missing_cities = []
for city in MAJOR_CITIES:
    sub = df[df['location_name'] == city].sort_values('date')
    if sub.empty:
        # Record the miss instead of dropping the city silently, so the reader
        # can tell "not in the dataset" apart from "forgot to plot it".
        missing_cities.append(city)
        continue
    ax.plot(sub['date'], sub['temperature_celsius'], label=city, alpha=0.8)
ax.set_title('Daily Temperature – Major Cities')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature (°C)')
# ax.legend() warns and draws nothing when no labelled line exists, so only
# request a legend once at least one city has actually been plotted.
if ax.get_legend_handles_labels()[0]:
    ax.legend()
plt.tight_layout()
plt.show()

if missing_cities:
    print(f"No rows found for: {', '.join(missing_cities)}")

## 5 – Correlation Heatmap

In [None]:
# Weather variables to correlate. Any name that is not a numeric column of
# `df` is skipped rather than raising.
candidate_cols = [
    'temperature_celsius', 'feels_like_celsius', 'humidity',
    'precip_mm', 'wind_kph', 'pressure_mb', 'cloud',
    'visibility_km', 'uv_index', 'gust_kph',
]
numeric = df.select_dtypes(include='number')
keep_cols = [col for col in candidate_cols if col in numeric.columns]
corr = numeric.loc[:, keep_cols].corr()

# Annotated matrix of pairwise correlations (DataFrame.corr defaults).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
plt.show()

## 6 – STL Anomaly Detection

In [None]:
# Build a date-indexed temperature series for one city and hand it to the
# project's STL helper (called with period=7).
city = 'London'
city_rows = df['location_name'] == city
sub = df[city_rows].sort_values('date').set_index('date')
ts = sub['temperature_celsius'].dropna()

stl_df = stl_anomaly_detection(ts, period=7)

# Four stacked panels on a shared x-axis: observed, trend, seasonal, residual.
fig, axes = plt.subplots(4, 1, figsize=(14, 10), sharex=True)
observed_ax, trend_ax, seasonal_ax, resid_ax = axes

observed_ax.plot(ts.index, ts.values, color='steelblue')
observed_ax.set_ylabel('Observed')
observed_ax.set_title(f'STL Decomposition – {city}')

for component_ax, component, line_colour, axis_label in (
    (trend_ax, 'trend', 'orange', 'Trend'),
    (seasonal_ax, 'seasonal', 'green', 'Seasonal'),
):
    component_ax.plot(stl_df.index, stl_df[component], color=line_colour)
    component_ax.set_ylabel(axis_label)

# Residuals, with the rows whose `anomaly` flag is set highlighted in red.
resid_ax.plot(stl_df.index, stl_df['resid'], color='grey', alpha=0.6)
anom_idx = stl_df[stl_df['anomaly']].index
resid_ax.scatter(anom_idx, stl_df.loc[anom_idx, 'resid'], color='red', zorder=5, label='Anomaly')
resid_ax.set_ylabel('Residual')
resid_ax.legend()

fig.tight_layout()
plt.show()

print(f'STL anomalies detected: {stl_df["anomaly"].sum()}')

## 7 – Isolation Forest Anomalies

In [None]:
# Flag anomalous rows with the project's Isolation Forest helper and store the
# result as a new column on `df`.
#
# The flag column written by a previous run of this cell is removed from the
# detector's input, so re-running the cell hands the helper the same columns as
# the first run (errors='ignore' makes the drop a no-op on a fresh kernel).
# NOTE(review): whether isolation_forest_anomalies actually reads that column is
# not visible from this notebook; the drop is harmless if it does not.
iso_input = df.drop(columns=['iso_forest_anomaly'], errors='ignore')
df['iso_forest_anomaly'] = isolation_forest_anomalies(iso_input)
n_anom = df['iso_forest_anomaly'].sum()
print(f'Isolation Forest anomalies: {n_anom} / {len(df)} ({100*n_anom/len(df):.2f}%)')

## 8 – Spatial Temperature Map

In [None]:
# Snapshot of the most recent date in the data, one row per location
# (drop_duplicates keeps the first row it meets for each location_name).
latest_date = df['date'].max()
on_latest_date = df['date'] == latest_date
snap = df.loc[on_latest_date].drop_duplicates(subset='location_name')

# Longitude/latitude scatter coloured by temperature.
fig, ax = plt.subplots(figsize=(14, 7))
sc = ax.scatter(
    snap['longitude'],
    snap['latitude'],
    c=snap['temperature_celsius'],
    cmap='RdYlBu_r',
    s=20,
    alpha=0.7,
)
fig.colorbar(sc, ax=ax, label='Temperature (°C)')
ax.set(
    xlabel='Longitude',
    ylabel='Latitude',
    title=f'Global Temperature Map – {latest_date.date()}',
)
fig.tight_layout()
plt.show()

## 9 – Monthly Climate Comparison by Continent

In [None]:
# Lightweight continent mapping: tag every row with a continent looked up from
# its country (countries missing from the mapping become 'Other') and with the
# calendar month number taken from `date`.
continent_map = _country_to_continent()
df['continent'] = df['country'].map(continent_map).fillna('Other')
df['month_num'] = df['date'].dt.month

# Mean temperature per (continent, month).
monthly = df.groupby(['continent', 'month_num'])['temperature_celsius'].mean().reset_index()

# One line per continent, drawn in the order the continents appear in `monthly`.
fig, ax = plt.subplots(figsize=(12, 6))
for cont, cont_rows in monthly.groupby('continent', sort=False):
    ax.plot(cont_rows['month_num'], cont_rows['temperature_celsius'], marker='o', label=cont)
ax.set_xlabel('Month')
ax.set_ylabel('Avg Temperature (°C)')
ax.set_title('Monthly Average Temperature by Continent')
ax.set_xticks(range(1, 13))
ax.legend(fontsize=8)
plt.tight_layout()
plt.show()

## 10 – Air Quality vs Weather Correlation

In [None]:
# Correlate every `air_quality*` column with a handful of core weather variables.
aq_cols = [c for c in df.columns if c.startswith('air_quality')]
weather_cols = ['temperature_celsius', 'humidity', 'wind_kph', 'precip_mm']
combined = [c for c in aq_cols + weather_cols if c in df.columns]

# Keep only numeric columns before correlating, as the first correlation
# heatmap in this notebook does: with pandas >= 2.0 DataFrame.corr() no longer
# drops non-numeric columns on its own and raises on them instead.
corr = df[combined].select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Air Quality vs Weather – Correlation')
plt.tight_layout()
plt.show()