In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# === Load & prepare data ===
df = pd.read_csv("DATA_CSV/india_2000_2024_daily_weather.csv")
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['city', 'date'])

# Next-day targets
df['target_temp_max'] = df.groupby('city')['temperature_2m_max'].shift(-1)
df['target_wind_max'] = df.groupby('city')['wind_speed_10m_max'].shift(-1)
df['target_rain_sum'] = df.groupby('city')['rain_sum'].shift(-1)
df['target_weather_code'] = df.groupby('city')['weather_code'].shift(-1)

feature_cols = [
    'temperature_2m_max','temperature_2m_min',
    'apparent_temperature_max','apparent_temperature_min',
    'precipitation_sum','rain_sum','weather_code',
    'wind_speed_10m_max','wind_gusts_10m_max','wind_direction_10m_dominant'
]

# weather code map
weather_code_map = {
    0: 'Clear sky',
    1: 'Mainly clear',
    2: 'Partly cloudy',
    3: 'Overcast',
    45: 'Fog',
    48: 'Depositing rime fog',
    51: 'Drizzle: light',
    53: 'Drizzle: moderate',
    55: 'Drizzle: dense',
    56: 'Freezing drizzle: light',
    57: 'Freezing drizzle: dense',
    61: 'Rain: slight',
    63: 'Rain: moderate',
    65: 'Rain: heavy',
    66: 'Freezing rain: light',
    67: 'Freezing rain: heavy',
    71: 'Snow fall: slight',
    73: 'Snow fall: moderate',
    75: 'Snow fall: heavy',
    77: 'Snow grains',
    80: 'Rain showers: slight',
    81: 'Rain showers: moderate',
    82: 'Rain showers: violent',
    85: 'Snow showers: slight',
    86: 'Snow showers: heavy',
    95: 'Thunderstorm: slight/moderate',
    96: 'Thunderstorm with slight hail',
    99: 'Thunderstorm with heavy hail'
}

X = df[feature_cols].copy()
y_reg = df[['target_temp_max','target_wind_max','target_rain_sum']].copy()
y_cls = df['target_weather_code'].copy()

# Drop NaNs
mask = ~y_reg.isna().any(axis=1) & ~y_cls.isna()
X = X[mask].copy()
y_reg = y_reg[mask].copy()
y_cls = y_cls[mask].copy()

# Encode weather_code if string
if X['weather_code'].dtype == 'object':
    le = LabelEncoder()
    X['weather_code'] = le.fit_transform(X['weather_code'])

# If classification target string, encode (not needed if numeric)
if y_cls.dtype == 'object':
    le_cls = LabelEncoder()
    y_cls = le_cls.fit_transform(y_cls)

# Split
X_train, X_test, y_reg_train, y_reg_test, y_cls_train, y_cls_test = train_test_split(
    X, y_reg, y_cls, test_size=0.3, random_state=42
)

print("X_train:", X_train.shape)
print("y_reg_train:", y_reg_train.shape)
print("y_cls_train:", y_cls_train.shape)

# === 1. Multi-output regression (temp, wind, rain) ===
base_model = Ridge(alpha=1.0)
multi_reg = make_pipeline(StandardScaler(), MultiOutputRegressor(base_model))
multi_reg.fit(X_train, y_reg_train)
print("Regression R² per target:", multi_reg.score(X_test, y_reg_test))

# === 2. Classification (weather code) ===
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, multi_class='auto'))
clf.fit(X_train, y_cls_train)

y_cls_pred = clf.predict(X_test)
print("Classification accuracy:", accuracy_score(y_cls_test, y_cls_pred))
print("\nClassification report:\n", classification_report(y_cls_test, y_cls_pred))
print("Confusion matrix:\n", confusion_matrix(y_cls_test, y_cls_pred))

# === Predict on a new sample ===
new_data = pd.DataFrame([{
    'temperature_2m_max': 33,
    'temperature_2m_min': 26,
    'apparent_temperature_max': 35,
    'apparent_temperature_min': 27,
    'precipitation_sum': 0,
    'rain_sum': 0,
    'weather_code': 10,  # numeric
    'wind_speed_10m_max': 10,
    'wind_gusts_10m_max': 15,
    'wind_direction_10m_dominant': 180
}])

# Regression predictions
pred_reg = multi_reg.predict(new_data)[0]
print("\nPredicted next-day max temp:", pred_reg[0])
print("Predicted next-day wind speed:", pred_reg[1])
print("Predicted next-day rain sum:", pred_reg[2])

# Classification prediction (weather code)
pred_cls = clf.predict(new_data)[0]

# Map code to description if exists
desc = weather_code_map.get(pred_cls, f"Unknown code {pred_cls}")
print("Predicted next-day weather code:", pred_cls, "-", desc)


X_train: (63917, 10)
y_reg_train: (63917, 3)
y_cls_train: (63917,)
Regression R² per target: 0.6280532397379112




Classification accuracy: 0.3779797758551455

Classification report:
               precision    recall  f1-score   support

         0.0       0.47      0.63      0.54      4721
         1.0       0.17      0.00      0.01      2463
         2.0       0.00      0.00      0.00      2411
         3.0       0.36      0.58      0.44      6373
        51.0       0.32      0.53      0.40      3788
        53.0       0.37      0.03      0.05      2037
        55.0       0.00      0.00      0.00       731
        61.0       0.26      0.01      0.03      2057
        63.0       0.38      0.62      0.47      2468
        65.0       0.33      0.01      0.01       344

    accuracy                           0.38     27393
   macro avg       0.27      0.24      0.19     27393
weighted avg       0.31      0.38      0.30     27393

Confusion matrix:
 [[2995   10    0 1676   37    0    0    0    3    0]
 [ 775    7    2 1465  196    0    0    1   17    0]
 [ 675   10    0 1496  211    0    0    1   18 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
