In [40]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Load and preprocess data
raw_df = pd.read_csv('weatherAUS.csv')
raw_df.dropna(subset=['RainToday', 'RainTomorrow'], inplace=True)

# Create training, validation, and test sets
year = pd.to_datetime(raw_df.Date).dt.year
train_df, val_df, test_df = raw_df[year < 2015], raw_df[year == 2015], raw_df[year > 2015]

# Create inputs and targets
input_cols = list(train_df.columns)[1:-1]
target_col = 'RainTomorrow'
train_inputs, train_targets = train_df[input_cols], train_df[target_col]
val_inputs, val_targets = val_df[input_cols], val_df[target_col]
test_inputs, test_targets = test_df[input_cols], test_df[target_col]

# Identify numeric and categorical columns
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()[:-1]
categorical_cols = train_inputs.select_dtypes(include='object').columns.tolist()

# Create preprocessing pipelines for both numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a pipeline that includes preprocessing and the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42))
])

# Train the model
model_pipeline.fit(train_inputs, train_targets)

# Helper function to predict, compute accuracy & plot confusion matrix
def predict_and_plot(inputs, targets, name=''):
    preds = model_pipeline.predict(inputs)
    accuracy = accuracy_score(targets, preds)
    print(f"Accuracy {name}: {round(accuracy * 100,2)}%")
    return preds

# Evaluate on validation and test set
val_preds = predict_and_plot(val_inputs, val_targets, 'Validation')
test_preds = predict_and_plot(test_inputs, test_targets, 'Test')

# Save the trained model
joblib.dump(model_pipeline, 'aussie_rain_pipeline.joblib')

Accuracy Validation: 84.1%
Accuracy Test: 82.79%


['aussie_rain_pipeline.joblib']

In [66]:
test_inputs.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,25590.0,25619.0,25710.0,10682.0,7989.0,24308.0,25600.0,24610.0,25457.0,23986.0,23056.0,23049.0,14579.0,12571.0,25650.0,24153.0
mean,12.851344,23.869402,2.405053,6.146855,7.670009,39.316151,13.718867,18.232751,70.045646,52.078046,1017.240081,1014.828097,4.950065,4.933418,17.489283,22.171958
std,6.476907,7.311422,8.381634,4.963576,3.776338,13.497042,8.640277,8.530014,18.661433,20.762575,7.098359,7.016125,2.886161,2.767427,6.590811,7.095891
min,-7.8,-4.8,0.0,0.0,0.0,7.0,0.0,0.0,4.0,3.0,982.0,977.1,0.0,0.0,-7.2,-5.4
25%,8.1,18.3,0.0,3.0,5.0,30.0,7.0,13.0,58.0,37.0,1012.6,1010.0,2.0,2.0,12.7,16.9
50%,12.7,23.4,0.0,5.4,8.5,37.0,13.0,17.0,71.0,53.0,1017.2,1014.9,6.0,6.0,17.2,21.8
75%,17.6,29.1,0.8,8.0,10.7,46.0,19.0,24.0,84.0,66.0,1021.8,1019.5,8.0,7.0,22.1,27.1
max,31.4,47.3,225.0,145.0,14.1,120.0,130.0,65.0,100.0,100.0,1040.3,1036.5,8.0,8.0,37.6,46.7


In [68]:
categorical_cols = train_inputs.select_dtypes(include='object').columns.tolist()

In [70]:
categorical_cols

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [86]:
train_inputs.Location.unique()

array(['Albury', 'BadgerysCreek', 'Cobar', 'CoffsHarbour', 'Moree',
       'Newcastle', 'NorahHead', 'NorfolkIsland', 'Penrith', 'Richmond',
       'Sydney', 'SydneyAirport', 'WaggaWagga', 'Williamtown',
       'Wollongong', 'Canberra', 'Tuggeranong', 'MountGinini', 'Ballarat',
       'Bendigo', 'Sale', 'MelbourneAirport', 'Melbourne', 'Mildura',
       'Nhil', 'Portland', 'Watsonia', 'Dartmoor', 'Brisbane', 'Cairns',
       'GoldCoast', 'Townsville', 'Adelaide', 'MountGambier', 'Nuriootpa',
       'Woomera', 'Albany', 'Witchcliffe', 'PearceRAAF', 'PerthAirport',
       'Perth', 'SalmonGums', 'Walpole', 'Hobart', 'Launceston',
       'AliceSprings', 'Darwin', 'Katherine', 'Uluru'], dtype=object)