### Building a Rainfall Prediction Classifier

In [4]:
#import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


### Load the data set

In [5]:
# Remote location of the weatherAUS-2.csv file used by this notebook
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/_0eYOqji3unP1tDNKWZMjg/weatherAUS-2.csv"

# Read the CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer=url)
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [7]:
# Non-null count per column; columns with lower counts contain missing values
df.count()

Date             145460
Location         145460
MinTemp          143975
MaxTemp          144199
Rainfall         142199
Evaporation       82670
Sunshine          75625
WindGustDir      135134
WindGustSpeed    135197
WindDir9am       134894
WindDir3pm       141232
WindSpeed9am     143693
WindSpeed3pm     142398
Humidity9am      142806
Humidity3pm      140953
Pressure9am      130395
Pressure3pm      130432
Cloud9am          89572
Cloud3pm          86102
Temp9am          143693
Temp3pm          141851
RainToday        142199
RainTomorrow     142193
dtype: int64

### Drop all rows with missing values


In [8]:
# Discard every row that has at least one missing value (the dropna defaults,
# spelled out), then summarise what is left
df = df.dropna(axis=0, how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56420 entries, 6049 to 142302
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           56420 non-null  object 
 1   Location       56420 non-null  object 
 2   MinTemp        56420 non-null  float64
 3   MaxTemp        56420 non-null  float64
 4   Rainfall       56420 non-null  float64
 5   Evaporation    56420 non-null  float64
 6   Sunshine       56420 non-null  float64
 7   WindGustDir    56420 non-null  object 
 8   WindGustSpeed  56420 non-null  float64
 9   WindDir9am     56420 non-null  object 
 10  WindDir3pm     56420 non-null  object 
 11  WindSpeed9am   56420 non-null  float64
 12  WindSpeed3pm   56420 non-null  float64
 13  Humidity9am    56420 non-null  float64
 14  Humidity3pm    56420 non-null  float64
 15  Pressure9am    56420 non-null  float64
 16  Pressure3pm    56420 non-null  float64
 17  Cloud9am       56420 non-null  float64
 18  Cl

In [9]:
# Show the column labels of the current DataFrame
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

### Features that would be inefficient for predicting tomorrow's rainfall

In [45]:
# Summarise the features with the most missing values.
# The non-null counts are copied from the `df.count()` output of the raw data;
# the percentages quoted in the comments are relative to the 145,460 rows that
# `df.count()` reports for the fully populated Date/Location columns.
inneficient_data = {
    "Feature": ["Evaporation", "Sunshine", "Cloud9am", "Cloud3pm", "Pressure9am", "Pressure3pm"],
    "Non-null Count": [82670, 75625, 89572, 86102, 130395, 130432],
    "Comment": [
        "Over 40% missing — inefficient",                # 43.2% missing
        "Nearly 50% missing — inefficient",              # 48.0% missing
        "Over 38% missing — potentially inefficient",    # 38.4% missing
        "Nearly 41% missing — potentially inefficient",  # 40.8% missing (not over 41%)
        "~15,000 missing — depends on context",          # 15,065 missing
        "~15,000 missing — depends on context"           # 15,028 missing
    ]
}

# Convert the dictionary into a DataFrame
inneficient_data_table = pd.DataFrame(inneficient_data)

# Display the DataFrame
inneficient_data_table

Unnamed: 0,Feature,Non-null Count,Comment
0,Evaporation,82670,Over 40% missing — inefficient
1,Sunshine,75625,Nearly 50% missing — inefficient
2,Cloud9am,89572,Over 38% missing — potentially inefficient
3,Cloud3pm,86102,Over 41% missing — potentially inefficient
4,Pressure9am,130395,"~15,000 missing — depends on context"
5,Pressure3pm,130432,"~15,000 missing — depends on context"


### Create a function to map dates to seasons

In [None]:
def date_to_season(date):
    """Return the season label for the month of ``date``.

    Mapping used: December-February -> 'Summer', March-May -> 'Autumn',
    June-August -> 'Winter', September-November -> 'Spring'.

    Parameters
    ----------
    date : any object exposing a ``month`` attribute
        For example ``datetime.date`` or ``pandas.Timestamp``.

    Returns
    -------
    str or None
        The season label, or ``None`` when ``month`` matches none of 1-12.
    """
    seasons = (
        ('Summer', (12, 1, 2)),
        ('Autumn', (3, 4, 5)),
        ('Winter', (6, 7, 8)),
        ('Spring', (9, 10, 11)),
    )
    month = date.month
    for label, months in seasons:
        if month in months:
            return label
    return None

In [49]:
# Print the column labels as a plain Python list
print(list(df.columns))


['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainYesterday', 'RainYesterday']


### Define the feature and target dataframes


### Resolve the confusion in the rain column naming

In [52]:
# pandas is already imported as `pd` in the import cell at the top of the notebook.

# Reload the dataset so this cell always starts from the untouched CSV,
# whatever earlier cells did to `df`
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/_0eYOqji3unP1tDNKWZMjg/weatherAUS-2.csv"

# Load, drop rows with missing values, and fix the rain column naming confusion.
# Original columns: 'RainToday' (current day), 'RainTomorrow' (next day)
# We rename them to make it absolutely clear:
df = (
    pd.read_csv(url)
    .dropna()
    .rename(columns={
        'RainToday': 'RainCurrentDay',  # Whether it rained on the recorded date
        'RainTomorrow': 'RainNextDay'   # Whether it rained the following day (target)
    })
)

# Verify our columns
print("Current columns in dataset:")
print(df.columns.tolist())

# Show the distribution of our target variable (RainNextDay)
print("\nClass distribution for RainNextDay (target variable):")
print(df['RainNextDay'].value_counts())

# Show percentages
print("\nClass percentages:")
print(df['RainNextDay'].value_counts(normalize=True).apply(lambda x: f"{x*100:.1f}%"))

Current columns in dataset:
['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainCurrentDay', 'RainNextDay']

Class distribution for RainNextDay (target variable):
No     43993
Yes    12427
Name: RainNextDay, dtype: int64

Class percentages:
No     78.0%
Yes    22.0%
Name: RainNextDay, dtype: object


Across all locations in our cleaned dataset, it rains on approximately 22.0% of days (the "Yes" class of RainNextDay). If we assumed it would never rain (always predicting "No"), we would be 78.0% accurate—but this ignores all rainy days, making the model useless for predicting rain. Our dataset is therefore imbalanced (78.0% "No" vs. 22.0% "Yes"), so accuracy alone is a misleading measure of model quality.

### Split data into train and test sets

In [57]:
# Target variable: whether it rains on the following day
y = df['RainNextDay']

# Features: every column except the target, the date, the same-day rain flag and the location
excluded_columns = ['RainNextDay', 'Date', 'RainCurrentDay', 'Location']
X = df.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Verify splits: print the class shares of each split
for heading, labels in (
    ("\nTraining set class distribution:", y_train),
    ("\nTest set class distribution:", y_test),
):
    print(heading)
    print(labels.value_counts(normalize=True).apply(lambda x: f"{x*100:.1f}%"))


Training set class distribution:
No     78.0%
Yes    22.0%
Name: RainNextDay, dtype: object

Test set class distribution:
No     78.0%
Yes    22.0%
Name: RainNextDay, dtype: object


### Define preprocessing transformers for numerical and categorical features

### Automatically detect numerical and categorical columns and assign them to separate numeric and categorical features

In [58]:
# Columns with a numeric dtype; both spellings of the name refer to the same list
numerical_features = X_train.select_dtypes(include=['number']).columns.tolist()
numeric_features = numerical_features

# Columns holding strings or pandas categoricals
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

### Define separate transformers for both feature types and combine them into a single preprocessing transformer

In [59]:
# Numeric columns: standardise with StandardScaler
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode; handle_unknown='ignore' avoids errors on
# categories that were not seen while fitting
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

### Combine transformers into a single preprocessing column transformer

In [62]:
# SimpleImputer is imported in the import cell at the top of the notebook.

# Numeric branch: fill missing values with the column median, then standardise
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fitting are ignored)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

### Create a pipeline by combining the preprocessing with a Random Forest classifier

In [63]:
# Send each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Random forest with class weights balanced against the class imbalance,
# a fixed seed, and all available cores
rf_classifier = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Full model: preprocessing followed by the classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

### Define parameter grid


In [64]:
# Hyperparameters to search; the `classifier__` prefix routes each value to the
# pipeline step named 'classifier'
param_grid = dict(
    classifier__n_estimators=[50, 100],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

### Perform grid search cross-validation and fit the best model to the training data

### Define the stratified cross-validation splitter

In [None]:
# Stratified 5-fold splitter. Shuffling without a seed yields different folds
# (and possibly a different selected model) on every run, so the seed is fixed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### Instantiate and fit GridSearchCV on the pipeline
    

In [None]:
# Exhaustive search over `param_grid`, scored by accuracy on the `cv` folds;
# verbose=2 logs progress for each fit
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    verbose=2,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)



### Display the model's estimated score


In [None]:
# Score the fitted search object on the held-out test split
# (the search was configured with scoring='accuracy')
test_score = grid_search.score(X_test, y_test)

# Report the score rounded to two decimals
score_message = "Test set score: {:.2f}".format(test_score)
print(score_message)