# Project Weather Analysis & Forecast - Bengaluru

## Historical Data (2010-2021)

In [None]:
# ============================
# 📌 1️⃣ Imports
# ============================
import pandas as pd
import requests
from google.colab import files

# ============================
# 📌 2️⃣ Daily fetch
# ============================
# Coordinates of the location to query (Bengaluru, per the notebook title).
latitude = 12.9716
longitude = 77.5946

# Inclusive date range requested from the archive API.
start_date = "2010-01-01"
end_date = "2021-12-31"

# Daily variables to request, comma-joined for the query string.
daily_vars = ",".join([
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "precipitation_sum",
    "windspeed_10m_max",
    "windgusts_10m_max",
    "weathercode",
    "shortwave_radiation_sum"
])

url_daily = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={latitude}&longitude={longitude}"
    f"&start_date={start_date}&end_date={end_date}"
    f"&daily={daily_vars}&timezone=auto"
)

# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports a failed request as an HTTP error instead of a
# confusing KeyError on 'daily' two lines further down.
response_daily = requests.get(url_daily, timeout=120)
response_daily.raise_for_status()
daily_data = response_daily.json()
daily_df = pd.DataFrame(daily_data['daily'])
# The response names the day column 'time'; the rest of the cell uses 'date'.
daily_df.rename(columns={'time': 'date'}, inplace=True)

print("✅ Daily weather data shape:", daily_df.shape)

# ============================
# 📌 3️⃣ Hourly fetch + mean
# ============================
# Hourly variables that are averaged to one value per day below.
hourly_vars = ",".join([
    "cloudcover",
    "relativehumidity_2m"
])

url_hourly = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={latitude}&longitude={longitude}"
    f"&start_date={start_date}&end_date={end_date}"
    f"&hourly={hourly_vars}&timezone=auto"
)

# Timeout + status check: fail loudly on a bad response instead of hanging or
# raising a KeyError on 'hourly'.
response_hourly = requests.get(url_hourly, timeout=120)
response_hourly.raise_for_status()
hourly_data = response_hourly.json()
hourly_df = pd.DataFrame(hourly_data['hourly'])
hourly_df['time'] = pd.to_datetime(hourly_df['time'])
# Keep the day key as a 'YYYY-MM-DD' string. daily_df['date'] is left as the
# strings taken from the API response, and datetime.date objects never compare
# equal to strings, so a .dt.date key makes the merge below match nothing and
# leaves cloudcover_mean / humidity_2m_mean entirely NaN.
hourly_df['date'] = hourly_df['time'].dt.strftime('%Y-%m-%d')

# Group to daily mean
hourly_daily_mean = hourly_df.groupby('date').agg({
    'cloudcover': 'mean',
    'relativehumidity_2m': 'mean'
}).reset_index()

hourly_daily_mean.rename(columns={
    'cloudcover': 'cloudcover_mean',
    'relativehumidity_2m': 'humidity_2m_mean'
}, inplace=True)

print("✅ Hourly aggregated shape:", hourly_daily_mean.shape)

# ============================
# 📌 4️⃣ Merge daily + hourly mean
# ============================
merged = daily_df.merge(hourly_daily_mean, how='left', on='date')

# A left merge never fails on unmatched keys, so check explicitly that the
# hourly means actually landed on the daily rows.
if merged[['cloudcover_mean', 'humidity_2m_mean']].isna().all().all():
    print("⚠️ Merge produced no cloudcover/humidity values — check the 'date' key types.")

# ============================
# 📌 5️⃣ Map WeatherSimple
# ============================
def map_weather_simple(code):
    """Collapse a weather code into a coarse class.

    Returns 0 for codes 0-3 (clear / partly cloudy), 1 for codes 51/53/55
    (drizzle) and 2 for everything else: the rain codes 61/63/65 as well as
    any code not listed, which falls back to the rain class.
    """
    clear_codes = (0, 1, 2, 3)
    drizzle_codes = (51, 53, 55)

    if code in clear_codes:
        return 0
    if code in drizzle_codes:
        return 1
    # Rain codes and the fallback share the same class.
    return 2

# Label every day with its coarse WeatherSimple class.
weather_classes = merged['weathercode'].apply(map_weather_simple)
merged['WeatherSimple'] = weather_classes

# ============================
# 📌 6️⃣ Final columns order
# ============================
# Export layout: date, temperatures, wind, precipitation/radiation, the
# hourly-derived means, then the raw code and its simplified class.
final_cols = (
    ['date']
    + ['temperature_2m_min', 'temperature_2m_max', 'temperature_2m_mean']
    + ['windspeed_10m_max', 'windgusts_10m_max']
    + ['precipitation_sum', 'shortwave_radiation_sum']
    + ['cloudcover_mean', 'humidity_2m_mean']
    + ['weathercode', 'WeatherSimple']
)
merged = merged.loc[:, final_cols]

print("✅ Final shape:", merged.shape)
print(merged.head())

# ============================
# 📌 7️⃣ Save & Download
# ============================
# Write the CSV next to the notebook, then hand it to the browser download.
csv_name = "bangalore_weather_2010_2021_final.csv"
merged.to_csv(csv_name, index=False)
files.download(csv_name)

print("✅ Done! Final historical with WeatherSimple ready for Power BI 🚀")


✅ Daily weather data shape: (4383, 9)
✅ Hourly aggregated shape: (4383, 3)
✅ Final shape: (4383, 12)
         date  temperature_2m_min  temperature_2m_max  temperature_2m_mean  \
0  2010-01-01                16.7                26.3                 20.8   
1  2010-01-02                15.4                25.9                 20.1   
2  2010-01-03                16.2                25.4                 19.7   
3  2010-01-04                13.8                26.8                 19.6   
4  2010-01-05                13.2                27.4                 20.1   

   windspeed_10m_max  windgusts_10m_max  precipitation_sum  \
0               15.0               34.6                0.0   
1               13.0               31.0                0.0   
2               10.2               23.4                0.0   
3               12.1               29.5                0.0   
4               13.8               28.8                0.0   

   shortwave_radiation_sum  cloudcover_mean  humidity_2m_

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Done! Final historical with WeatherSimple ready for Power BI 🚀


## Actual Data fetch 2022-2025 Jan

In [None]:
# ============================
# 📌 1️⃣ Imports
# ============================
import pandas as pd
import requests
from google.colab import files

# ============================
# 📌 2️⃣ Daily fetch
# ============================
# Coordinates of the location to query (Bengaluru, per the notebook title).
latitude = 12.9716
longitude = 77.5946

# Inclusive date range of the "actual" comparison window.
start_date = "2022-01-01"
end_date = "2025-01-31"

# Daily variables to request, comma-joined for the query string.
daily_vars = ",".join([
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "precipitation_sum",
    "windspeed_10m_max",
    "windgusts_10m_max",
    "weathercode",
    "shortwave_radiation_sum"
])

url_daily = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={latitude}&longitude={longitude}"
    f"&start_date={start_date}&end_date={end_date}"
    f"&daily={daily_vars}&timezone=auto"
)

# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports a failed request as an HTTP error instead of a
# confusing KeyError on 'daily' two lines further down.
response_daily = requests.get(url_daily, timeout=120)
response_daily.raise_for_status()
daily_data = response_daily.json()
daily_df = pd.DataFrame(daily_data['daily'])
# The response names the day column 'time'; the rest of the cell uses 'date'.
daily_df.rename(columns={'time': 'date'}, inplace=True)

print("✅ Daily weather data shape:", daily_df.shape)

# ============================
# 📌 3️⃣ Hourly fetch + mean
# ============================
# Hourly variables that are averaged to one value per day below.
hourly_vars = ",".join([
    "cloudcover",
    "relativehumidity_2m"
])

url_hourly = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={latitude}&longitude={longitude}"
    f"&start_date={start_date}&end_date={end_date}"
    f"&hourly={hourly_vars}&timezone=auto"
)

# Timeout + status check: fail loudly on a bad response instead of hanging or
# raising a KeyError on 'hourly'.
response_hourly = requests.get(url_hourly, timeout=120)
response_hourly.raise_for_status()
hourly_data = response_hourly.json()
hourly_df = pd.DataFrame(hourly_data['hourly'])
hourly_df['time'] = pd.to_datetime(hourly_df['time'])
# Keep the day key as a 'YYYY-MM-DD' string. daily_df['date'] is left as the
# strings taken from the API response, and datetime.date objects never compare
# equal to strings, so a .dt.date key makes the merge below match nothing and
# leaves cloudcover_mean / humidity_2m_mean entirely NaN.
hourly_df['date'] = hourly_df['time'].dt.strftime('%Y-%m-%d')

# Group to daily mean
hourly_daily_mean = hourly_df.groupby('date').agg({
    'cloudcover': 'mean',
    'relativehumidity_2m': 'mean'
}).reset_index()

hourly_daily_mean.rename(columns={
    'cloudcover': 'cloudcover_mean',
    'relativehumidity_2m': 'humidity_2m_mean'
}, inplace=True)

print("✅ Hourly aggregated shape:", hourly_daily_mean.shape)

# ============================
# 📌 4️⃣ Merge daily + hourly mean
# ============================
merged = daily_df.merge(hourly_daily_mean, how='left', on='date')

# A left merge never fails on unmatched keys, so check explicitly that the
# hourly means actually landed on the daily rows.
if merged[['cloudcover_mean', 'humidity_2m_mean']].isna().all().all():
    print("⚠️ Merge produced no cloudcover/humidity values — check the 'date' key types.")

# ============================
# 📌 5️⃣ Map WeatherSimple
# ============================
def map_weather_simple(code):
    """Collapse a weather code into a coarse class.

    Returns 0 for codes 0-3 (clear / partly cloudy), 1 for codes 51/53/55
    (drizzle) and 2 for everything else: the rain codes 61/63/65 as well as
    any code not listed, which falls back to the rain class.
    """
    clear_codes = (0, 1, 2, 3)
    drizzle_codes = (51, 53, 55)

    if code in clear_codes:
        return 0
    if code in drizzle_codes:
        return 1
    # Rain codes and the fallback share the same class.
    return 2

# Label every day with its coarse WeatherSimple class.
weather_classes = merged['weathercode'].apply(map_weather_simple)
merged['WeatherSimple'] = weather_classes

# ============================
# 📌 6️⃣ Final columns order
# ============================
# Export layout: date, temperatures, wind, precipitation/radiation, the
# hourly-derived means, then the raw code and its simplified class.
final_cols = (
    ['date']
    + ['temperature_2m_min', 'temperature_2m_max', 'temperature_2m_mean']
    + ['windspeed_10m_max', 'windgusts_10m_max']
    + ['precipitation_sum', 'shortwave_radiation_sum']
    + ['cloudcover_mean', 'humidity_2m_mean']
    + ['weathercode', 'WeatherSimple']
)
merged = merged.loc[:, final_cols]

print("✅ Final shape:", merged.shape)
print(merged.head())

# ============================
# 📌 7️⃣ Save & Download
# ============================
# Write the CSV next to the notebook, then hand it to the browser download.
csv_name = "bangalore_weather_2022_2025_final.csv"
merged.to_csv(csv_name, index=False)
files.download(csv_name)

print("✅ Done! Final historical with WeatherSimple ready for Power BI 🚀")


✅ Daily weather data shape: (1127, 9)
✅ Hourly aggregated shape: (1127, 3)
✅ Final shape: (1127, 12)
         date  temperature_2m_min  temperature_2m_max  temperature_2m_mean  \
0  2022-01-01                18.1                25.8                 20.8   
1  2022-01-02                16.1                27.0                 20.7   
2  2022-01-03                15.6                26.9                 20.4   
3  2022-01-04                16.3                26.4                 20.3   
4  2022-01-05                14.8                26.4                 20.2   

   windspeed_10m_max  windgusts_10m_max  precipitation_sum  \
0               22.1               40.3                2.3   
1               18.8               37.1                0.0   
2               16.6               29.5                0.0   
3               16.2               30.6                0.0   
4               15.9               31.3                0.0   

   shortwave_radiation_sum  cloudcover_mean  humidity_2m_

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Done! Final historical with WeatherSimple ready for Power BI 🚀


## Training & Prediction 2022-2025 Jan

In [None]:
# ================================================
# 📌 Bangalore Weather Prediction Script
# - Predicts 2022-01-01 to 2025-01-31
# ================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from google.colab import files

# ================================================
# 1️⃣ Upload final cleaned historical data
# ================================================
import io  # local import: only needed here to read the uploaded bytes

uploaded = files.upload()
# files.upload() returns a dict of {filename: file bytes}. Parse those bytes
# directly: Colab saves a repeated upload under a suffixed name such as
# "... (6).csv", so reading the fixed filename would pick up an older copy
# left on disk rather than the file that was just uploaded.
if uploaded:
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    # Nothing was uploaded: fall back to a copy already present on disk.
    df = pd.read_csv('bangalore_weather_2010_2021_final.csv')
# info() prints its report itself and returns None, so it is not wrapped in print().
df.info()

# ================================================
# 2️⃣ Feature engineering
# ================================================
df['date'] = pd.to_datetime(df['date'])
df['dayofyear'] = df['date'].dt.dayofyear
df['year'] = df['date'].dt.year

# Calendar features available for any future date.
features = ['dayofyear', 'year']

# Temperature targets, predicted from the calendar features alone.
xgb_targets = [
    "temperature_2m_min",
    "temperature_2m_max",
    "temperature_2m_mean"
]

# Targets predicted from the calendar features plus the temperatures.
rf_targets = [
    "windspeed_10m_max",
    "windgusts_10m_max",
    "shortwave_radiation_sum",
    "precipitation_sum",
    "cloudcover_mean",
    "humidity_2m_mean"
]

# .copy() makes train_df an independent frame; a WeatherSimple column is added
# to it later, which on a bare slice of df raises SettingWithCopyWarning.
train_df = df[df['year'] < 2022].copy()
predict_dates = pd.date_range(start='2022-01-01', end='2025-01-31', freq='D')
predict_df = pd.DataFrame({'date': predict_dates})
predict_df['dayofyear'] = predict_df['date'].dt.dayofyear
predict_df['year'] = predict_df['date'].dt.year

# Accumulates one entry per predicted column, starting with the dates.
predictions = {'date': predict_df['date']}

# ================================================
# 3️⃣ XGBoost regression targets
# ================================================
# The calendar features are identical for every target, so slice them once
# instead of once per loop iteration.
X_train = train_df[features]
X_future = predict_df[features]

for target in xgb_targets:
    y_train = train_df[target]

    model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_future)
    predictions[target] = preds

    # These scores are computed on the same rows the model was fitted on, so
    # they describe fit quality, not forecast skill; the labels say "Train"
    # to make that explicit.
    train_preds = model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    r2 = r2_score(y_train, train_preds)
    print(f'🎯 Target: {target} (XGBoost)\n   ✅ Train RMSE: {rmse:.2f}\n   ✅ Train R²: {r2:.2f}')

# ================================================
# 4️⃣ Random Forest regression targets
# ================================================
# Feature list and future feature matrix do not depend on the target, so they
# are built once before the loop.
rf_features = features + [
    "temperature_2m_min", "temperature_2m_max", "temperature_2m_mean"
]

# The models are fitted on the temperatures stored in train_df but predict
# from the temperatures estimated in the previous step.
X_future_rf = predict_df.copy()
for t in xgb_targets:
    X_future_rf[t] = predictions[t]
X_future_rf = X_future_rf[rf_features]

for target in rf_targets:
    # Drop any NaNs so RF doesn't fail
    train_valid = train_df[rf_features + [target]].dropna()

    if train_valid.empty:
        print(f"⚠️ Skipping target {target} (RF) — no valid rows after dropna.")
        continue

    X_train_rf = train_valid[rf_features]
    y_train_rf = train_valid[target]

    model_rf = RandomForestRegressor(n_estimators=200, random_state=42)
    model_rf.fit(X_train_rf, y_train_rf)

    preds_rf = model_rf.predict(X_future_rf)
    predictions[target] = preds_rf

    # Scored on the fitting rows themselves, hence labelled "Train".
    train_preds_rf = model_rf.predict(X_train_rf)
    rmse = np.sqrt(mean_squared_error(y_train_rf, train_preds_rf))
    r2 = r2_score(y_train_rf, train_preds_rf)
    print(f'🎯 Target: {target} (RF)\n   ✅ Train RMSE: {rmse:.2f}\n   ✅ Train R²: {r2:.2f}')

# ================================================
# 5️⃣ WeatherSimple classifier
# ================================================
# Classifier inputs: the calendar features plus three predicted quantities.
clf_features = [
    *features,
    "temperature_2m_max",
    "windgusts_10m_max",
    "shortwave_radiation_sum",
]

# Add only if available: these targets are skipped by the regression step
# when they have no valid training rows, so they may be missing here.
for optional_feature in ("cloudcover_mean", "humidity_2m_mean"):
    if optional_feature in predictions:
        clf_features.append(optional_feature)

# Build training target
def map_weather_simple(code):
    """Collapse a weather code into a coarse class.

    Returns 0 for codes 0-3 (clear / partly cloudy), 1 for codes 51/53/55
    (drizzle) and 2 for everything else: the rain codes 61/63/65 as well as
    any code not listed, which falls back to the rain class.
    """
    clear_codes = (0, 1, 2, 3)
    drizzle_codes = (51, 53, 55)

    if code in clear_codes:
        return 0
    if code in drizzle_codes:
        return 1
    # Rain codes and the fallback share the same class.
    return 2

# Training labels: the coarse class derived from the raw weather code.
train_df['WeatherSimple'] = train_df['weathercode'].apply(map_weather_simple)
X_cls, y_cls = train_df[clf_features], train_df['WeatherSimple']

# Random 80/20 split; the held-out 20% is only used for the report below.
split = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = split

clf = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
clf.fit(X_train_cls, y_train_cls)

y_pred_cls = clf.predict(X_test_cls)
holdout_accuracy = accuracy_score(y_test_cls, y_pred_cls)
print("🎯 WeatherSimple classifier results:")
print("   ✅ Accuracy:", holdout_accuracy)
print(classification_report(y_test_cls, y_pred_cls))

# Predict future: copy every available predicted column onto predict_df so
# the classifier can read its features from there.
for t in (*xgb_targets, *rf_targets):
    if t not in predictions:
        continue
    predict_df[t] = predictions[t]

y_future_pred = clf.predict(predict_df[clf_features])
predictions['WeatherSimple'] = y_future_pred

# ================================================
# 6️⃣ Save final predictions for Power BI
# ================================================
# Column order of the exported prediction file.
final_columns = [
    'date',
    'temperature_2m_min',
    'temperature_2m_max',
    'temperature_2m_mean',
    'windspeed_10m_max',
    'windgusts_10m_max',
    'shortwave_radiation_sum',
    'precipitation_sum',
    'WeatherSimple'
]

# insert(-1, ...) places each optional column just before 'WeatherSimple',
# and only when that target was actually predicted.
if 'cloudcover_mean' in predictions:
    final_columns.insert(-1, 'cloudcover_mean')
if 'humidity_2m_mean' in predictions:
    final_columns.insert(-1, 'humidity_2m_mean')

predicted_df = pd.DataFrame({col: predictions[col] for col in final_columns})

# Use one path for both writing and downloading. Previously the file was
# written to /content/ but downloaded by bare name, which only works while
# the working directory happens to be /content.
output_path = '/content/bangalore_weather_predicted_2022_2025.csv'
predicted_df.to_csv(output_path, index=False)
print('✅ Saved predictions to bangalore_weather_predicted_2022_2025.csv')

files.download(output_path)

Saving bangalore_weather_2010_2021_final.csv to bangalore_weather_2010_2021_final (6).csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4383 entries, 0 to 4382
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   date                     4383 non-null   object 
 1   temperature_2m_min       4383 non-null   float64
 2   temperature_2m_max       4383 non-null   float64
 3   temperature_2m_mean      4383 non-null   float64
 4   windspeed_10m_max        4383 non-null   float64
 5   windgusts_10m_max        4383 non-null   float64
 6   precipitation_sum        4383 non-null   float64
 7   shortwave_radiation_sum  4383 non-null   float64
 8   cloudcover_mean          0 non-null      float64
 9   humidity_2m_mean         0 non-null      float64
 10  weathercode              4383 non-null   int64  
 11  WeatherSimple            4383 non-null   int64  
dtypes: float64(9), int64(2), object(1)
memory 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Phase 2 - today +6days

### Historical Training Data 2012-01-01 to 2023-12-31

In [None]:
# ============================
# 📌 1️⃣ Imports
# ============================
import pandas as pd
import requests
from google.colab import files
from datetime import date, timedelta

# ============================
# 📌 2️⃣ Daily fetch
# ============================
# Coordinates of the location to query (Bengaluru, per the notebook title).
latitude = 12.9716
longitude = 77.5946

# Inclusive date range requested from the archive API.
start_date = "2012-01-01"
end_date = "2023-12-31"

# Daily variables to request, comma-joined for the query string.
daily_vars = ",".join([
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "precipitation_sum",
    "windspeed_10m_max",
    "windgusts_10m_max",
    "weathercode",
    "shortwave_radiation_sum"
])

url_daily = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={latitude}&longitude={longitude}"
    f"&start_date={start_date}&end_date={end_date}"
    f"&daily={daily_vars}&timezone=auto"
)

# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports a failed request as an HTTP error instead of a
# confusing KeyError on 'daily' two lines further down.
response_daily = requests.get(url_daily, timeout=120)
response_daily.raise_for_status()
daily_data = response_daily.json()
daily_df = pd.DataFrame(daily_data['daily'])
# The response names the day column 'time'; the rest of the cell uses 'date'.
daily_df.rename(columns={'time': 'date'}, inplace=True)

print("✅ Daily weather data shape:", daily_df.shape)

# ============================
# 📌 3️⃣ Hourly fetch + mean
# ============================
# Hourly variables that are averaged to one value per day below.
hourly_vars = ",".join([
    "cloudcover",
    "relativehumidity_2m"
])

url_hourly = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={latitude}&longitude={longitude}"
    f"&start_date={start_date}&end_date={end_date}"
    f"&hourly={hourly_vars}&timezone=auto"
)

# Timeout + status check: fail loudly on a bad response instead of hanging or
# raising a KeyError on 'hourly'.
response_hourly = requests.get(url_hourly, timeout=120)
response_hourly.raise_for_status()
hourly_data = response_hourly.json()
hourly_df = pd.DataFrame(hourly_data['hourly'])
hourly_df['time'] = pd.to_datetime(hourly_df['time'])
# Keep the day key as a 'YYYY-MM-DD' string. daily_df['date'] is left as the
# strings taken from the API response, and datetime.date objects never compare
# equal to strings, so a .dt.date key makes the merge below match nothing and
# leaves cloudcover_mean / humidity_2m_mean entirely NaN.
hourly_df['date'] = hourly_df['time'].dt.strftime('%Y-%m-%d')

# Group to daily mean
hourly_daily_mean = hourly_df.groupby('date').agg({
    'cloudcover': 'mean',
    'relativehumidity_2m': 'mean'
}).reset_index()

hourly_daily_mean.rename(columns={
    'cloudcover': 'cloudcover_mean',
    'relativehumidity_2m': 'humidity_2m_mean'
}, inplace=True)

print("✅ Hourly aggregated shape:", hourly_daily_mean.shape)

# ============================
# 📌 4️⃣ Merge daily + hourly mean
# ============================
merged = daily_df.merge(hourly_daily_mean, how='left', on='date')

# A left merge never fails on unmatched keys, so check explicitly that the
# hourly means actually landed on the daily rows.
if merged[['cloudcover_mean', 'humidity_2m_mean']].isna().all().all():
    print("⚠️ Merge produced no cloudcover/humidity values — check the 'date' key types.")

# ============================
# 📌 5️⃣ Map WeatherSimple
# ============================
def map_weather_simple(code):
    """Collapse a weather code into a coarse class.

    Returns 0 for codes 0-3 (clear / partly cloudy), 1 for codes 51/53/55
    (drizzle) and 2 for everything else: the rain codes 61/63/65 as well as
    any code not listed, which falls back to the rain class.
    """
    clear_codes = (0, 1, 2, 3)
    drizzle_codes = (51, 53, 55)

    if code in clear_codes:
        return 0
    if code in drizzle_codes:
        return 1
    # Rain codes and the fallback share the same class.
    return 2

# Label every day with its coarse WeatherSimple class.
weather_classes = merged['weathercode'].apply(map_weather_simple)
merged['WeatherSimple'] = weather_classes

# ============================
# 📌 6️⃣ Final columns order
# ============================
# Export layout: date, temperatures, wind, precipitation/radiation, the
# hourly-derived means, then the raw code and its simplified class.
final_cols = (
    ['date']
    + ['temperature_2m_min', 'temperature_2m_max', 'temperature_2m_mean']
    + ['windspeed_10m_max', 'windgusts_10m_max']
    + ['precipitation_sum', 'shortwave_radiation_sum']
    + ['cloudcover_mean', 'humidity_2m_mean']
    + ['weathercode', 'WeatherSimple']
)
merged = merged.loc[:, final_cols]

print("✅ Final shape:", merged.shape)
print(merged.head())

# ============================
# 📌 7️⃣ Save & Download
# ============================

# NOTE(review): the name keeps a literal "2010" prefix although start_date in
# this cell is 2012-01-01; the training cell reads this exact file name, so
# rename both together if it is ever changed.
file_name = "bangalore_weather_2010_" + f"{end_date}" + "_final.csv"
merged.to_csv(file_name, index=False)
files.download(file_name)

print("✅ Done! Final historical with WeatherSimple ready for Power BI 🚀")


✅ Daily weather data shape: (4383, 9)
✅ Hourly aggregated shape: (4383, 3)
✅ Final shape: (4383, 12)
         date  temperature_2m_min  temperature_2m_max  temperature_2m_mean  \
0  2012-01-01                16.1                26.2                 20.4   
1  2012-01-02                17.6                26.9                 21.2   
2  2012-01-03                16.5                27.7                 21.2   
3  2012-01-04                18.2                28.1                 22.2   
4  2012-01-05                18.2                28.2                 22.0   

   windspeed_10m_max  windgusts_10m_max  precipitation_sum  \
0               17.0               40.3                0.0   
1               11.0               29.5                0.0   
2               11.9               23.8                0.0   
3               13.3               28.8                0.0   
4               11.9               21.6                0.0   

   shortwave_radiation_sum  cloudcover_mean  humidity_2m_

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Done! Final historical with WeatherSimple ready for Power BI 🚀


### Training & Prediction - Today + 6 Days (Run Daily)

In [None]:
# ================================================
# 📌 Bangalore Weather Prediction Script
# - Predicts today +6 days everyday
# ================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from google.colab import files
from datetime import datetime, timedelta

# ================================================
# 1️⃣ Upload final cleaned historical data
# ================================================
import io  # local import: only needed here to read the uploaded bytes
from datetime import timezone  # local import: used for the IST "today" below

uploaded = files.upload()
# files.upload() returns a dict of {filename: file bytes}. Parse those bytes
# directly: Colab saves a repeated upload under a suffixed name such as
# "... (5).csv", and the uploaded file may not be called what this cell
# expects, so reading a fixed filename can load a stale copy from disk.
if uploaded:
    train_df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    # Nothing was uploaded: fall back to a copy already present on disk.
    train_df = pd.read_csv('bangalore_weather_2010_2023-12-31_final.csv')
# info() prints its report itself and returns None, so it is not wrapped in print().
train_df.info()

# ================================================
# 2️⃣ Feature engineering
# ================================================
train_df['date'] = pd.to_datetime(train_df['date'])
train_df['dayofyear'] = train_df['date'].dt.dayofyear
train_df['year'] = train_df['date'].dt.year

# Calendar features available for any future date.
features = ['dayofyear', 'year']

# Temperature targets, predicted from the calendar features alone.
xgb_targets = [
    "temperature_2m_min",
    "temperature_2m_max",
    "temperature_2m_mean"
]

# Targets predicted from the calendar features plus the temperatures.
rf_targets = [
    "windspeed_10m_max",
    "windgusts_10m_max",
    "shortwave_radiation_sum",
    "precipitation_sum",
    "cloudcover_mean",
    "humidity_2m_mean"
]


# Take "today" in India Standard Time (fixed UTC+05:30) rather than in the
# runtime's own clock, so the 7-day window starts on Bengaluru's current date
# even when the notebook host runs in another timezone.
today = datetime.now(timezone(timedelta(hours=5, minutes=30))).date()
predict_dates = pd.date_range(
    start=today,
    end=today + timedelta(days=6),
    freq='D'
)
predict_df = pd.DataFrame({'date': predict_dates})
predict_df['dayofyear'] = predict_df['date'].dt.dayofyear
predict_df['year'] = predict_df['date'].dt.year

# Accumulates one entry per predicted column, starting with the dates.
predictions = {'date': predict_df['date']}

# ================================================
# 3️⃣ Random Forest regression targets
# ================================================
# Temperature targets. The list is named xgb_targets, but this cell fits a
# RandomForestRegressor for each of them.
# The calendar features are identical for every target, so slice them once
# instead of once per loop iteration.
X_train = train_df[features]
X_future = predict_df[features]

for target in xgb_targets:
    y_train = train_df[target]

    model = RandomForestRegressor(
        n_estimators=500,       # More trees for stability
        max_depth=None,         # Let trees grow fully
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42
    )
    model.fit(X_train, y_train)

    preds = model.predict(X_future)
    predictions[target] = preds

    # These scores are computed on the same rows the model was fitted on, so
    # they describe fit quality, not forecast skill; the labels say "Train"
    # to make that explicit.
    train_preds = model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    r2 = r2_score(y_train, train_preds)
    print(f'🎯 Target: {target} (RandomForest)\n   ✅ Train RMSE: {rmse:.2f}\n   ✅ Train R²: {r2:.2f}')

# ================================================
# 4️⃣ Random Forest regression targets
# ================================================
# Feature list and future feature matrix do not depend on the target, so they
# are built once before the loop.
rf_features = features + [
    "temperature_2m_min", "temperature_2m_max", "temperature_2m_mean"
]

# The models are fitted on the temperatures stored in train_df but predict
# from the temperatures estimated in the previous step.
X_future_rf = predict_df.copy()
for t in xgb_targets:
    X_future_rf[t] = predictions[t]
X_future_rf = X_future_rf[rf_features]

for target in rf_targets:
    # Drop any NaNs so RF doesn't fail
    train_valid = train_df[rf_features + [target]].dropna()

    if train_valid.empty:
        print(f"⚠️ Skipping target {target} (RF) — no valid rows after dropna.")
        continue

    X_train_rf = train_valid[rf_features]
    y_train_rf = train_valid[target]

    model_rf = RandomForestRegressor(n_estimators=200, random_state=42)
    model_rf.fit(X_train_rf, y_train_rf)

    preds_rf = model_rf.predict(X_future_rf)
    predictions[target] = preds_rf

    # Scored on the fitting rows themselves, hence labelled "Train".
    train_preds_rf = model_rf.predict(X_train_rf)
    rmse = np.sqrt(mean_squared_error(y_train_rf, train_preds_rf))
    r2 = r2_score(y_train_rf, train_preds_rf)
    print(f'🎯 Target: {target} (RF)\n   ✅ Train RMSE: {rmse:.2f}\n   ✅ Train R²: {r2:.2f}')

# ================================================
# 5️⃣ WeatherSimple classifier
# ================================================
# Classifier inputs: the calendar features plus three predicted quantities.
clf_features = [
    *features,
    "temperature_2m_max",
    "windgusts_10m_max",
    "shortwave_radiation_sum",
]

# Add only if available: these targets are skipped by the regression step
# when they have no valid training rows, so they may be missing here.
for optional_feature in ("cloudcover_mean", "humidity_2m_mean"):
    if optional_feature in predictions:
        clf_features.append(optional_feature)

# Build training target
def map_weather_simple(code):
    """Collapse a weather code into a coarse class.

    Returns 0 for codes 0-3 (clear / partly cloudy), 1 for codes 51/53/55
    (drizzle) and 2 for everything else: the rain codes 61/63/65 as well as
    any code not listed, which falls back to the rain class.
    """
    clear_codes = (0, 1, 2, 3)
    drizzle_codes = (51, 53, 55)

    if code in clear_codes:
        return 0
    if code in drizzle_codes:
        return 1
    # Rain codes and the fallback share the same class.
    return 2

# Training labels: the coarse class derived from the raw weather code.
train_df['WeatherSimple'] = train_df['weathercode'].apply(map_weather_simple)
X_cls, y_cls = train_df[clf_features], train_df['WeatherSimple']

# Random 80/20 split; the held-out 20% is only used for the report below.
split = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = split

clf = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
clf.fit(X_train_cls, y_train_cls)

y_pred_cls = clf.predict(X_test_cls)
holdout_accuracy = accuracy_score(y_test_cls, y_pred_cls)
print("🎯 WeatherSimple classifier results:")
print("   ✅ Accuracy:", holdout_accuracy)
print(classification_report(y_test_cls, y_pred_cls))

# Predict future: copy every available predicted column onto predict_df so
# the classifier can read its features from there.
for t in (*xgb_targets, *rf_targets):
    if t not in predictions:
        continue
    predict_df[t] = predictions[t]

y_future_pred = clf.predict(predict_df[clf_features])
predictions['WeatherSimple'] = y_future_pred

# ================================================
# 6️⃣ Save final predictions for Power BI
# ================================================
# Column order of the exported prediction file.
final_columns = [
    'date',
    'temperature_2m_min',
    'temperature_2m_max',
    'temperature_2m_mean',
    'windspeed_10m_max',
    'windgusts_10m_max',
    'shortwave_radiation_sum',
    'precipitation_sum',
    'WeatherSimple'
]

# insert(-1, ...) places each optional column just before 'WeatherSimple',
# and only when that target was actually predicted.
if 'cloudcover_mean' in predictions:
    final_columns.insert(-1, 'cloudcover_mean')
if 'humidity_2m_mean' in predictions:
    final_columns.insert(-1, 'humidity_2m_mean')

predicted_df = pd.DataFrame({col: predictions[col] for col in final_columns})

# Use one path for both writing and downloading. Previously the file was
# written to /content/ but downloaded by bare name, which only works while
# the working directory happens to be /content.
output_path = '/content/bangalore_weather_predicted_2025.csv'
predicted_df.to_csv(output_path, index=False)
print('✅ Saved predictions to bangalore_weather_predicted_2025.csv')

files.download(output_path)

Saving bangalore_weather_2010_2025-07-31_final.csv to bangalore_weather_2010_2025-07-31_final (5).csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4383 entries, 0 to 4382
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   date                     4383 non-null   object 
 1   temperature_2m_min       4383 non-null   float64
 2   temperature_2m_max       4383 non-null   float64
 3   temperature_2m_mean      4383 non-null   float64
 4   windspeed_10m_max        4383 non-null   float64
 5   windgusts_10m_max        4383 non-null   float64
 6   precipitation_sum        4383 non-null   float64
 7   shortwave_radiation_sum  4383 non-null   float64
 8   cloudcover_mean          0 non-null      float64
 9   humidity_2m_mean         0 non-null      float64
 10  weathercode              4383 non-null   int64  
 11  WeatherSimple            4383 non-null   int64  
dtypes: float64(9), int64(2), objec

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>