In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [9]:
# Load the daily Toronto weather export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The recorded run emitted a warning pointing at
# this read_csv call (presumably a mixed-type DtypeWarning -- confirm).
df = pd.read_csv("weatherstats_toronto_daily.csv", low_memory=False)

# Strip whitespace from column names so later lookups by name are exact
df.columns = df.columns.str.strip()

# Print column names to confirm
print("Columns in CSV:", df.columns.tolist())

Columns in CSV: ['date', 'max_temperature', 'avg_hourly_temperature', 'avg_temperature', 'min_temperature', 'max_humidex', 'min_windchill', 'max_relative_humidity', 'avg_hourly_relative_humidity', 'avg_relative_humidity', 'min_relative_humidity', 'max_dew_point', 'avg_hourly_dew_point', 'avg_dew_point', 'min_dew_point', 'max_wind_speed', 'avg_hourly_wind_speed', 'avg_wind_speed', 'min_wind_speed', 'max_wind_gust', 'wind_gust_dir_10s', 'max_pressure_sea', 'avg_hourly_pressure_sea', 'avg_pressure_sea', 'min_pressure_sea', 'max_pressure_station', 'avg_hourly_pressure_station', 'avg_pressure_station', 'min_pressure_station', 'max_visibility', 'avg_hourly_visibility', 'avg_visibility', 'min_visibility', 'max_health_index', 'avg_hourly_health_index', 'avg_health_index', 'min_health_index', 'heatdegdays', 'cooldegdays', 'growdegdays_5', 'growdegdays_7', 'growdegdays_10', 'precipitation', 'rain', 'snow', 'snow_on_ground', 'sunrise_hhmm', 'sunrise_unixtime', 'sunrise_f', 'sunset_hhmm', 'sunset_

  df = pd.read_csv("weatherstats_toronto_daily.csv")


In [6]:
# Demo-only placeholder label: assigns each row a random class in {0, 1, 2}
# when the frame has no 'target' column yet.
# A fixed seed keeps the placeholder labels identical across kernel restarts.
# Replace this logic with your real categorization
if 'target' not in df.columns:
    rng = np.random.default_rng(42)
    # integers(0, 3) draws uniformly from 0, 1, 2 (upper bound is exclusive)
    df['target'] = rng.integers(0, 3, size=len(df))

print(df['target'].value_counts())

target
1    10736
0    10704
2    10596
Name: count, dtype: int64


In [10]:
# Label each day by precipitation type.
# Encoding: 0 = no rain/snow, 1 = rain, 2 = snow.
# Snow wins when a day has both rain > 0 and snow > 0, because the snow mask
# is applied last. Missing rain/snow values compare as False and stay at 0.
had_rain = df['rain'].gt(0)
had_snow = df['snow'].gt(0)
df['target'] = had_rain.astype('int64').mask(had_snow, 2)

print(df['target'].value_counts())

target
0    19624
1     8447
2     3965
Name: count, dtype: int64


In [11]:
# Make sure a 'target' label column exists.
# An existing 'target' is kept as-is; a 'count' column is used only as a
# fallback, so it can never silently overwrite labels that were already built.
if 'target' not in df.columns:
    if 'count' not in df.columns:
        raise KeyError("No target column found. Make sure your CSV has 'count' or 'target' column.")
    df['target'] = df['count']

# Check distribution
print(df['target'].value_counts())

target
0    19624
1     8447
2     3965
Name: count, dtype: int64


In [12]:
# Non-feature columns: the date stamp and the sunrise/sunset fields
non_feature_cols = (
    'date',
    'sunrise_hhmm', 'sunrise_unixtime', 'sunrise_f',
    'sunset_hhmm', 'sunset_unixtime', 'sunset_f',
)

# Keep only the names actually present, so the drop below cannot fail on them
drop_cols = [name for name in non_feature_cols if name in df.columns]

# Feature matrix is everything except the dropped columns and the label
X = df.drop(columns=[*drop_cols, 'target'])
y = df['target']

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (32036, 65)
Target shape: (32036,)


In [13]:
# Fill numeric NaNs with the per-column median.
# numeric_only=True keeps the median well-defined if a non-numeric column is present.
# NOTE(review): the medians are computed over all rows, including rows that the
# later train_test_split call assigns to the test set.
column_medians = X.median(numeric_only=True)
X = X.fillna(column_medians)

# A column with no observed values has a NaN median, so fillna leaves it
# untouched. Report the total and name the affected columns so leftover NaNs
# are not silently carried forward.
remaining_na = X.isna().sum()
print("Missing values left:", remaining_na.sum())
unfilled_cols = remaining_na[remaining_na > 0].index.tolist()
if unfilled_cols:
    print("Columns still containing NaN (no usable median):", unfilled_cols)

Missing values left: 160180


In [15]:
# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions the same in both parts; random_state fixes the shuffle.
# NOTE(review): the rows appear to be daily observations, so a random split
# mixes neighbouring days across train and test -- consider whether a
# chronological split is more appropriate.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))

Training samples: 25628
Testing samples: 6408


In [16]:
import numpy as np

# Sanity check on the training features: total NaN cells, total infinite cells,
# and the names of the columns that contain at least one NaN.
nan_mask = np.isnan(X_train)
inf_mask = np.isinf(X_train)
print("NaN count:", nan_mask.sum().sum())
print("Inf count:", inf_mask.sum().sum())

print("Columns with NaN:")
has_nan = X_train.isna().any(axis=0)
print(X_train.columns[has_nan])

NaN count: 128140
Inf count: 0
Columns with NaN:
Index(['solar_radiation', 'max_cloud_cover_4', 'avg_hourly_cloud_cover_4',
       'avg_cloud_cover_4', 'min_cloud_cover_4'],
      dtype='object')


In [17]:
# Columns the preceding NaN check listed as still containing missing values
cols_to_drop = [
    'solar_radiation',
    'max_cloud_cover_4',
    'avg_hourly_cloud_cover_4',
    'avg_cloud_cover_4',
    'min_cloud_cover_4'
]

# Only drop columns that exist, so re-running this cell (or running it on a
# frame where they were already removed) does not raise KeyError
cols_to_drop = [c for c in cols_to_drop if c in X_train.columns]

# Apply the same drop to both parts so train and test keep identical columns
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

In [18]:
import numpy as np

# Re-run the NaN / Inf sanity check on the training features
for label, cell_check in (("NaN count:", np.isnan), ("Inf count:", np.isinf)):
    # Each check returns a boolean frame; summing twice counts the True cells
    print(label, cell_check(X_train).sum().sum())

print("Columns with NaN:")
nan_columns = X_train.columns[X_train.isna().any()]
print(nan_columns)

NaN count: 0
Inf count: 0
Columns with NaN:
Index([], dtype='object')


In [19]:
# Standardize the features: learn the scaling from the training part only,
# then apply that same fitted scaler to both parts.
# NOTE(review): X_train still contains 'rain', 'snow' and 'precipitation' here;
# the scaler and both scaled arrays are rebuilt after those columns are dropped.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# The label is derived from 'rain' and 'snow', so those columns (together with
# 'precipitation') are removed from the features. Names that are not present
# are skipped.
leakage_cols = [
    col for col in ('rain', 'snow', 'precipitation')
    if col in X_train.columns
]

X_train = X_train.drop(leakage_cols, axis="columns")
X_test = X_test.drop(leakage_cols, axis="columns")

In [21]:
from sklearn.preprocessing import StandardScaler

# Refit the scaler on the current training features, then transform both
# parts with the training statistics.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings:
#   n_estimators  - number of trees
#   class_weight  - 'balanced' reweights classes inversely to their frequency
#   n_jobs        - -1 uses all available cores
#   random_state  - fixed seed for repeatable fits
rf_params = {
    "n_estimators": 300,
    "class_weight": "balanced",
    "n_jobs": -1,
    "random_state": 42,
}
model = RandomForestClassifier(**rf_params)

# Train on the standardized training features
model.fit(X_train_scaled, y_train)

In [23]:
# Predict class labels for the held-out rows, using the scaled test features
y_pred = model.predict(X_test_scaled)

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the held-out predictions: overall accuracy, the confusion matrix
# (rows = true class, columns = predicted class), and per-class
# precision / recall / F1.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n0 = Dry 1 = Rain 2 = Snow\n", test_report)

Accuracy: 0.7624843945068664

Confusion Matrix:
 [[3255  442  228]
 [ 569 1063   58]
 [ 183   42  568]]

Classification Report:
0 = Dry 1 = Rain 2 = Snow
               precision    recall  f1-score   support

           0       0.81      0.83      0.82      3925
           1       0.69      0.63      0.66      1690
           2       0.67      0.72      0.69       793

    accuracy                           0.76      6408
   macro avg       0.72      0.72      0.72      6408
weighted avg       0.76      0.76      0.76      6408

