In [26]:
import pandas as pd
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [27]:
# Load raw dataset
cleaned_data_path = "../FC212025 udayanga/cleaned_data.csv"
df = pd.read_csv(cleaned_data_path)

 
# 🆕 Create a new binary target column: RainToday (1 = rain occurred, 0 = no rain)
df['RainToday'] = df['rain_sum'].apply(lambda x: 1 if x > 0 else 0)

# Show column names and first few rows
print("🧾 Raw Columns:", df.columns.tolist())
df.head()


🧾 Raw Columns: ['time', 'weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'shortwave_radiation_sum', 'precipitation_sum', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'winddirection_10m_dominant', 'et0_fao_evapotranspiration', 'latitude', 'longitude', 'elevation', 'country', 'city', 'RainToday']


Unnamed: 0,time,weathercode,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,...,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,et0_fao_evapotranspiration,latitude,longitude,elevation,country,city,RainToday
0,1/1/2010,2,30.0,22.7,26.1,34.4,25.2,29.2,06:22:00,18:05:00,...,11.7,27.4,20,4.58,7.0,79.899994,16,Sri Lanka,Colombo,0
1,1/2/2010,51,29.9,23.5,26.2,33.8,26.2,29.8,06:22:00,18:06:00,...,13.0,27.0,24,3.84,7.0,79.899994,16,Sri Lanka,Colombo,1
2,1/3/2010,51,29.5,23.2,26.0,34.3,26.3,29.9,06:23:00,18:06:00,...,12.3,27.4,16,3.65,7.0,79.899994,16,Sri Lanka,Colombo,1
3,1/4/2010,2,28.9,21.9,25.3,31.6,23.4,27.8,06:23:00,18:07:00,...,17.0,34.6,356,3.79,7.0,79.899994,16,Sri Lanka,Colombo,0
4,1/5/2010,1,28.1,21.3,24.5,30.1,23.1,26.1,06:23:00,18:07:00,...,18.7,37.1,355,4.97,7.0,79.899994,16,Sri Lanka,Colombo,0


In [28]:
# 🧹 Basic cleaning: Drop rows with missing values
df = df.dropna()

# 🔄 (Optional) Convert categorical to numeric if needed
# Example: df['weather'] = df['weather'].map({'sunny': 0, 'rainy': 1})

# Example: If your target is 'RainTomorrow' with Yes/No
# df['RainTomorrow'] = df['RainTomorrow'].map({'Yes': 1, 'No': 0})

# ✅ Print cleaned data info
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147480 entries, 0 to 147479
Data columns (total 25 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   time                        147480 non-null  object 
 1   weathercode                 147480 non-null  int64  
 2   temperature_2m_max          147480 non-null  float64
 3   temperature_2m_min          147480 non-null  float64
 4   temperature_2m_mean         147480 non-null  float64
 5   apparent_temperature_max    147480 non-null  float64
 6   apparent_temperature_min    147480 non-null  float64
 7   apparent_temperature_mean   147480 non-null  float64
 8   sunrise                     147480 non-null  object 
 9   sunset                      147480 non-null  object 
 10  shortwave_radiation_sum     147480 non-null  float64
 11  precipitation_sum           147480 non-null  float64
 12  rain_sum                    147480 non-null  float64
 13  snowfall_sum  

In [29]:
# 🔁 Define target and features



# ✅ Now set it as the target column
target_column = "RainToday"


X = df.drop(columns=[target_column])
y = df[target_column]

# Split into train/test sets (optional but good practice)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
# Drop columns that are not useful for training or are not numeric
X = df.drop(columns=['RainToday', 'time', 'country', 'city', 'latitude', 'longitude'])

y = df['RainToday']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
# Drop non-numeric or non-useful columns
columns_to_drop = ['time', 'city', 'country', 'latitude', 'longitude', 'sunrise', 'sunset']

# Define X and y
X = df.drop(columns=columns_to_drop + ['RainToday'])  # drop target & useless cols
y = df['RainToday']

# Optional: Check if all columns in X are numeric
print("📊 Remaining features:", X.columns.tolist())
print("🧪 Data types:\n", X.dtypes)

# Proceed to train/test split and model training
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Model Accuracy: {accuracy:.2%}")


📊 Remaining features: ['weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'shortwave_radiation_sum', 'precipitation_sum', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'winddirection_10m_dominant', 'et0_fao_evapotranspiration', 'elevation']
🧪 Data types:
 weathercode                     int64
temperature_2m_max            float64
temperature_2m_min            float64
temperature_2m_mean           float64
apparent_temperature_max      float64
apparent_temperature_min      float64
apparent_temperature_mean     float64
shortwave_radiation_sum       float64
precipitation_sum             float64
rain_sum                      float64
snowfall_sum                    int64
precipitation_hours             int64
windspeed_10m_max             float64
windgusts_10m_max             float64
winddirection_10m_dominant      int64
et0_fao_eva

In [35]:
# Save model to models/random_forest.pkl
model_output_path = "../../models/random_forest.pkl"
os.makedirs(os.path.dirname(model_output_path), exist_ok=True)

with open(model_output_path, "wb") as f:
    pickle.dump(model, f)

print(f"💾 Model saved to {model_output_path}")


💾 Model saved to ../../models/random_forest.pkl


In [33]:
# Predict on a few rows
sample = X_test.iloc[:5]
predictions = model.predict(sample)

print("🔮 Predictions for sample:", predictions)


🔮 Predictions for sample: [0 1 0 1 1]
