In [None]:
# ==========================
# 1. Import libraries
# ==========================
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, f1_score, classification_report, confusion_matrix
)

# ==========================
# 2. Load & preprocess data
# ==========================
import pandas as pd

# Read the raw weather observations
df = pd.read_csv("../data/weatherHistory.csv")

# Parse the timestamps as timezone-aware UTC; values that cannot be parsed
# become NaT (errors='coerce') instead of raising
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'], errors='coerce', utc=True)

# Optional: convert to a local timezone before dropping the tz info, e.g.
# df['Formatted Date'] = df['Formatted Date'].dt.tz_convert('Asia/Kolkata')

# Remove timezone information so downstream code works with naive datetimes
df['Formatted Date'] = df['Formatted Date'].dt.tz_localize(None)

# The lag features and "next hour" targets built later use positional shift(),
# which is only meaningful when rows are in chronological order. Enforce that
# order here instead of relying on the row order of the CSV. The stable sort
# keeps the file order for equal timestamps, and NaT rows are placed last.
df = df.sort_values('Formatted Date', kind='stable').reset_index(drop=True)

# Calendar components used for the year-based split and the cyclical encodings
df['year'] = df['Formatted Date'].dt.year
df['month'] = df['Formatted Date'].dt.month
df['day'] = df['Formatted Date'].dt.day
df['hour'] = df['Formatted Date'].dt.hour

# Synthetic (rule-based) thunderstorm flag: 1 when humidity is above 0.85,
# wind speed is above 25 km/h and pressure is below 1005 millibars at the same
# time, otherwise 0
is_humid = df['Humidity'] > 0.85
is_windy = df['Wind Speed (km/h)'] > 25
is_low_pressure = df['Pressure (millibars)'] < 1005
df['thunderstorm'] = np.where(is_humid & is_windy & is_low_pressure, 1, 0)

# Drop unused columns; errors='ignore' tolerates any that are not present
df = df.drop(columns=['Summary', 'Daily Summary', 'Loud Cover'], errors='ignore')

# Handle categorical column
if 'Precip Type' in df.columns:
    # Impute missing values with the most frequent category. The result is
    # assigned back to the frame: calling fillna(inplace=True) on a column
    # selection is chained assignment, which pandas deprecates and which does
    # not modify the frame when Copy-on-Write is enabled.
    precip_modes = df['Precip Type'].mode()
    if not precip_modes.empty:  # mode() is empty when every value is missing
        df['Precip Type'] = df['Precip Type'].fillna(precip_modes.iloc[0])
    # One-hot encode; drop_first=True omits the first category's column
    df = pd.get_dummies(df, columns=['Precip Type'], drop_first=True)

# ==========================
# 3. Train/Val/Test split
# ==========================
# Split by calendar year: [2006, 2012] -> train, (2012, 2014] -> validation,
# (2014, 2016] -> test. Each part is copied so columns added later stay local
# to that part.
year_col = df['year']
train_df = df[year_col.ge(2006) & year_col.le(2012)].copy()
val_df = df[year_col.gt(2012) & year_col.le(2014)].copy()
test_df = df[year_col.gt(2014) & year_col.le(2016)].copy()

# ==========================
# 4. Target Engineering (next hour)
# ==========================
# Targets are the value one row ahead (shift(-1)), computed inside each split
# so a target never comes from a different split. The last row of every split
# has no successor and therefore gets NaN.
next_hour_targets = {
    'Wind_Speed_next_hour': 'Wind Speed (km/h)',
    'Thunderstorm_next_hour': 'thunderstorm',
}
for split in (train_df, val_df, test_df):
    for target_col, source_col in next_hour_targets.items():
        split[target_col] = split[source_col].shift(-1)

# ==========================
# 5. Feature Engineering
# ==========================
def add_features(df):
    """Return a copy of ``df`` with lag, rolling-mean and cyclical time features.

    The input frame is left untouched; all new columns are added to a copy.

    Args:
        df: DataFrame containing the columns 'Wind Speed (km/h)',
            'Temperature (C)', 'Humidity', 'Pressure (millibars)',
            'thunderstorm', 'hour' and 'month'. Rows are expected to be in
            chronological order, because lags are taken by row position.

    Returns:
        A new DataFrame with the added feature columns, with every row that
        contains a NaN in any column removed (this includes the leading rows
        whose lag / rolling windows are incomplete) and a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame is never modified as a side effect
    out = df.copy()

    # Previous-row (previous hour) values
    lag_sources = {
        'Wind_Speed_prev_1h': 'Wind Speed (km/h)',
        'Temperature_prev_1h': 'Temperature (C)',
        'Humidity_prev_1h': 'Humidity',
        'Pressure_prev_1h': 'Pressure (millibars)',
        'Thunderstorm_prev_1h': 'thunderstorm',
    }
    for lag_col, source_col in lag_sources.items():
        out[lag_col] = out[source_col].shift(1)

    # 3-row rolling means over *past* values only: shifting first keeps the
    # current row's own value out of its window
    out['Wind_Speed_roll3h'] = out['Wind Speed (km/h)'].shift(1).rolling(3).mean()
    out['Temperature_roll3h'] = out['Temperature (C)'].shift(1).rolling(3).mean()

    # Cyclical encoding so that hour 23 / hour 0 and month 12 / month 1 end up
    # close to each other
    hour_angle = 2 * np.pi * out['hour'] / 24
    month_angle = 2 * np.pi * out['month'] / 12
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)

    return out.dropna().reset_index(drop=True)

# Run the same feature pipeline on each split independently
train_df, val_df, test_df = [
    add_features(part) for part in (train_df, val_df, test_df)
]

# ==========================
# 6. Feature columns
# ==========================
# Numerical inputs, grouped by origin. Keep the overall order stable: the same
# list is used to build every training and validation matrix.
# NOTE(review): the current 'Wind Speed (km/h)' is not among the features,
# only its lag and rolling mean are — confirm this is intentional.
current_observations = [
    'Temperature (C)', 'Apparent Temperature (C)', 'Humidity',
    'Pressure (millibars)', 'Visibility (km)', 'Wind Bearing (degrees)',
]
lag_features = [
    'Wind_Speed_prev_1h', 'Temperature_prev_1h', 'Humidity_prev_1h',
    'Pressure_prev_1h', 'Thunderstorm_prev_1h',
]
rolling_features = ['Wind_Speed_roll3h', 'Temperature_roll3h']
cyclical_features = ['hour_sin', 'hour_cos', 'month_sin', 'month_cos']
numerical_features = (
    current_observations + lag_features + rolling_features + cyclical_features
)

# One-hot columns are picked up by their "Precip Type_" name prefix
categorical_features = list(
    filter(lambda name: name.startswith("Precip Type_"), df.columns)
)
feature_columns = numerical_features + categorical_features

# ==========================
# 7. Models
# ==========================
# Hyperparameters shared by both random forests
rf_params = dict(
    n_estimators=150,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

# Regressor for next-hour wind speed
model_wind = RandomForestRegressor(**rf_params)

# Classifier for the next-hour thunderstorm flag; class_weight='balanced' is
# the only setting that differs from the regressor
model_thunder = RandomForestClassifier(class_weight='balanced', **rf_params)

# ==========================
# 8. Train
# ==========================
# Both models are fit on the same training features, each with its own target
X_wind = train_df[feature_columns]
X_thunder = train_df[feature_columns]
y_wind = train_df['Wind_Speed_next_hour']
y_thunder = train_df['Thunderstorm_next_hour']

model_wind.fit(X=X_wind, y=y_wind)
model_thunder.fit(X=X_thunder, y=y_thunder)

# ==========================
# 9. Evaluation Functions
# ==========================
def evaluate_regression(y_true, y_pred, label="Regression"):
    """Print and return MAE, RMSE and R² for a set of regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        label: Heading printed above the metrics.

    Returns:
        dict with the keys ``"MAE"``, ``"RMSE"`` and ``"R2"``.
    """
    print(f"\n{label} Results:")

    metrics = {
        "MAE": mean_absolute_error(y_true, y_pred),
        # RMSE is derived by hand as the square root of the MSE
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R2": r2_score(y_true, y_pred),
    }

    for caption, key in (("MAE:", "MAE"), ("RMSE:", "RMSE"), ("R²:", "R2")):
        print(caption, metrics[key])

    return metrics

def evaluate_classification(y_true, y_pred, label="Classification"):
    """Print and return classification metrics for a set of predictions.

    Prints accuracy, F1 score, the confusion matrix and the full
    classification report. The headline metrics are also returned (mirroring
    ``evaluate_regression``) so that callers which store the result get usable
    values instead of ``None``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        label: Heading printed above the metrics.

    Returns:
        dict with the keys ``"Accuracy"``, ``"F1"`` and ``"Confusion Matrix"``.
    """
    print(f"\n{label} Results:")
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", matrix)
    print("Classification Report:\n", classification_report(y_true, y_pred))

    return {"Accuracy": accuracy, "F1": f1, "Confusion Matrix": matrix}

# ==========================
# 10. Validate
# ==========================
# Validation inputs and targets for both tasks
X_val_wind = val_df[feature_columns]
X_val_thunder = val_df[feature_columns]
y_val_wind = val_df['Wind_Speed_next_hour']
y_val_thunder = val_df['Thunderstorm_next_hour']

# Wind speed: predict, then report the regression metrics
pred_wind = model_wind.predict(X_val_wind)
error_percentage_wind = evaluate_regression(
    y_true=y_val_wind,
    y_pred=pred_wind,
    label="Wind Speed Prediction",
)

# Thunderstorm: predict, then report the classification metrics
pred_thunder = model_thunder.predict(X_val_thunder)
error_percentage_thunder = evaluate_classification(
    y_true=y_val_thunder,
    y_pred=pred_thunder,
    label="Thunderstorm Prediction",
)



In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the training data only; the validation split is
# not resampled
smote = SMOTE(random_state=42)
X_thunder_res, y_thunder_res = smote.fit_resample(X_thunder, y_thunder)

# Class counts before and after resampling
for caption, labels in (("Before SMOTE:", y_thunder), ("After SMOTE:", y_thunder_res)):
    print(caption, np.bincount(labels.astype(int)))

# Retrain the classifier on the resampled data; this replaces the earlier
# model_thunder and uses no class_weight
balanced_rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "max_features": 'sqrt',
    "random_state": 42,
    "n_jobs": -1,
}
model_thunder = RandomForestClassifier(**balanced_rf_params)

model_thunder.fit(X_thunder_res, y_thunder_res)


In [None]:
# Score the current model_thunder on the validation split
pred_thunder = model_thunder.predict(X_val_thunder)
evaluate_classification(
    y_true=y_val_thunder,
    y_pred=pred_thunder,
    label="Thunderstorm Prediction (Balanced)",
)


In [None]:
from xgboost import XGBRegressor, XGBClassifier


In [None]:
# XGBoost regressor for next-hour wind speed
xgb_wind_params = dict(
    n_estimators=500,      # number of boosting rounds
    learning_rate=0.05,    # shrinkage step
    max_depth=6,           # tree depth
    subsample=0.8,         # sample ratio per tree
    colsample_bytree=0.8,  # feature sampling per tree
    random_state=42,
    n_jobs=-1,
)
model_wind_xgb = XGBRegressor(**xgb_wind_params)
model_wind_xgb.fit(X_wind, y_wind)

# Score on the validation split
pred_wind_xgb = model_wind_xgb.predict(X_val_wind)

evaluate_regression(
    y_true=y_val_wind,
    y_pred=pred_wind_xgb,
    label="Wind Speed Prediction (XGBoost)",
)


In [None]:
# Handle imbalance with scale_pos_weight = (#negatives / #positives).
# Count each class explicitly so that a training target missing one of the
# classes fails with a clear message instead of an opaque KeyError.
n_negative = int((y_thunder == 0).sum())
n_positive = int((y_thunder == 1).sum())
if n_negative == 0 or n_positive == 0:
    raise ValueError(
        "y_thunder must contain both classes to compute scale_pos_weight "
        f"(negatives={n_negative}, positives={n_positive})"
    )
scale_pos_weight = n_negative / n_positive

model_thunder_xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,  # handle imbalance
    random_state=42,
    n_jobs=-1,
    # NOTE(review): use_label_encoder looks deprecated in recent XGBoost
    # releases — confirm against the installed version before removing it
    use_label_encoder=False,
    eval_metric="logloss"
)

model_thunder_xgb.fit(X_thunder, y_thunder)

# Score on the validation split
pred_thunder_xgb = model_thunder_xgb.predict(X_val_thunder)

evaluate_classification(y_val_thunder, pred_thunder_xgb, label="Thunderstorm Prediction (XGBoost)")


In [None]:
import joblib

# Persist the two trained XGBoost models (wind-speed regressor and
# thunderstorm classifier) under relative file names
joblib.dump(value=model_wind_xgb, filename="wind_speed_1h_model.pkl")
joblib.dump(value=model_thunder_xgb, filename="thunderstorm_1h_model.pkl")

# To load the models again later:
# model_wind_loaded = joblib.load("wind_speed_1h_model.pkl")
# model_thunder_loaded = joblib.load("thunderstorm_1h_model.pkl")


In [None]:
# Display the feature columns of the matrix the thunderstorm models were fit on
X_thunder.columns

In [None]:
# Stack the validation and test splits (validation rows first, renumbered
# index) and write them to one CSV without the index column
export_data = pd.concat(objs=[val_df, test_df], ignore_index=True)
export_data.to_csv(path_or_buf="test.csv", index=False)