In [25]:
import pandas as pd
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [26]:
# Load raw dataset
cleaned_data_path = "../../data/cleaned_weather.csv"
df = pd.read_csv(cleaned_data_path)

 

# 🆕 Create a new binary target column: RainToday (1 = rain occurred, 0 = no rain)
df['RainToday'] = df['rain_sum (mm)'].apply(lambda x: 1 if x > 0 else 0)

# Show column names and first few rows
print("🧾 Raw Columns:", df.columns.tolist())
df.head()


🧾 Raw Columns: ['location_id', 'weather_code (wmo code)', 'temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'temperature_2m_mean (°C)', 'apparent_temperature_max (°C)', 'apparent_temperature_min (°C)', 'apparent_temperature_mean (°C)', 'daylight_duration (s)', 'sunshine_duration (s)', 'precipitation_sum (mm)', 'rain_sum (mm)', 'precipitation_hours (h)', 'wind_speed_10m_max (km/h)', 'wind_gusts_10m_max (km/h)', 'wind_direction_10m_dominant (°)', 'shortwave_radiation_sum (MJ/m²)', 'et0_fao_evapotranspiration (mm)', 'latitude', 'longitude', 'elevation', 'year', 'month', 'day', 'sunrise_hour', 'sunset_hour', 'daylight_hours', 'RainToday']


Unnamed: 0,location_id,weather_code (wmo code),temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),apparent_temperature_mean (°C),daylight_duration (s),sunshine_duration (s),...,latitude,longitude,elevation,year,month,day,sunrise_hour,sunset_hour,daylight_hours,RainToday
0,0,1,30.1,22.6,26.0,34.5,25.0,29.0,42220.2,38905.73,...,6.924429,79.90725,4,2010,1,1,6,18,12,0
1,0,51,30.1,23.7,26.3,33.9,26.1,29.7,42225.71,37451.01,...,6.924429,79.90725,4,2010,2,1,6,18,12,1
2,0,51,29.6,23.1,26.0,34.5,26.2,29.9,42231.68,33176.43,...,6.924429,79.90725,4,2010,3,1,6,18,12,1
3,0,2,28.9,23.1,25.7,31.7,26.1,28.4,42238.11,38289.2,...,6.924429,79.90725,4,2010,4,1,6,18,12,0
4,0,1,28.1,21.3,24.6,30.0,22.9,26.2,42244.99,39113.82,...,6.924429,79.90725,4,2010,5,1,6,18,12,0


In [27]:
# 🧹 Basic cleaning: Drop rows with missing values
df = df.dropna()

# 🔄 (Optional) Convert categorical to numeric if needed
# Example: df['weather'] = df['weather'].map({'sunny': 0, 'rainy': 1})

# Example: If your target is 'RainTomorrow' with Yes/No
# df['RainTomorrow'] = df['RainTomorrow'].map({'Yes': 1, 'No': 0})

# ✅ Print cleaned data info
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142371 entries, 0 to 142370
Data columns (total 28 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   location_id                      142371 non-null  int64  
 1   weather_code (wmo code)          142371 non-null  int64  
 2   temperature_2m_max (°C)          142371 non-null  float64
 3   temperature_2m_min (°C)          142371 non-null  float64
 4   temperature_2m_mean (°C)         142371 non-null  float64
 5   apparent_temperature_max (°C)    142371 non-null  float64
 6   apparent_temperature_min (°C)    142371 non-null  float64
 7   apparent_temperature_mean (°C)   142371 non-null  float64
 8   daylight_duration (s)            142371 non-null  float64
 9   sunshine_duration (s)            142371 non-null  float64
 10  precipitation_sum (mm)           142371 non-null  float64
 11  rain_sum (mm)                    142371 non-null  float64
 12  pr

In [28]:
# 🔁 Define target and features



# ✅ Now set it as the target column
target_column = "RainToday"


X = df.drop(columns=[target_column])
y = df[target_column]

# Split into train/test sets (optional but good practice)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [29]:
y = df['RainToday']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
# Drop non-numeric or non-useful columns
columns_to_drop = ['time', 'city', 'country', 'latitude', 'longitude', 'sunrise', 'sunset']



# Optional: Check if all columns in X are numeric
print("📊 Remaining features:", X.columns.tolist())
print("🧪 Data types:\n", X.dtypes)

# Proceed to train/test split and model training
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Model Accuracy: {accuracy:.2%}")


📊 Remaining features: ['location_id', 'weather_code (wmo code)', 'temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'temperature_2m_mean (°C)', 'apparent_temperature_max (°C)', 'apparent_temperature_min (°C)', 'apparent_temperature_mean (°C)', 'daylight_duration (s)', 'sunshine_duration (s)', 'precipitation_sum (mm)', 'rain_sum (mm)', 'precipitation_hours (h)', 'wind_speed_10m_max (km/h)', 'wind_gusts_10m_max (km/h)', 'wind_direction_10m_dominant (°)', 'shortwave_radiation_sum (MJ/m²)', 'et0_fao_evapotranspiration (mm)', 'latitude', 'longitude', 'elevation', 'year', 'month', 'day', 'sunrise_hour', 'sunset_hour', 'daylight_hours']
🧪 Data types:
 location_id                          int64
weather_code (wmo code)              int64
temperature_2m_max (°C)            float64
temperature_2m_min (°C)            float64
temperature_2m_mean (°C)           float64
apparent_temperature_max (°C)      float64
apparent_temperature_min (°C)      float64
apparent_temperature_mean (°C)     float64


In [31]:
# Save model to models/random_forest.pkl
model_output_path = "../../models/random_forest.pkl"
os.makedirs(os.path.dirname(model_output_path), exist_ok=True)

with open(model_output_path, "wb") as f:
    pickle.dump(model, f)

print(f"💾 Model saved to {model_output_path}")


💾 Model saved to ../../models/random_forest.pkl


In [32]:
# Predict on a few rows
sample = X_test.iloc[:5]
predictions = model.predict(sample)

print("🔮 Predictions for sample:", predictions)


🔮 Predictions for sample: [0 1 1 1 1]
