## Imports

In [None]:
import keras.metrics as mt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import LSTM, Dense
from keras.models import Sequential
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

## Dataset Preprocessing

In [None]:
# Load the raw dataset (145,460 rows according to the original author's note).
df = pd.read_csv("weatherAUS.csv")

# Drop the attributes that are not used as model inputs.
unused_columns = ["Date", "Location", "WindGustDir", "WindDir9am", "WindDir3pm", "RainToday"]
df = df.drop(columns=unused_columns)

# Keep only rows that have a value in every remaining column
# (58,090 rows, ~40% of the raw dataset, according to the original note).
df = df.dropna()

# Balance the classes by undersampling the majority class ("No") down to the
# size of the minority class ("Yes"). The sample is seeded so that the written
# file, and therefore every score computed from it, is reproducible between runs.
df_yes = df[df["RainTomorrow"] == "Yes"]  # all minority-class rows are kept
df_no = df[df["RainTomorrow"] == "No"].sample(n=len(df_yes), random_state=42)

# Concatenate both classes and restore the original row order by index.
df = pd.concat([df_no, df_yes]).sort_index()

# Persist the pre-processed dataset so later cells can start from this file.
df.to_csv("readyAUS.csv", index=False, encoding="utf-8")

## Dataset Normalization

In [None]:
# Load the balanced, pre-processed dataset.
df = pd.read_csv("readyAUS.csv")

# Separate the class label from the features and encode it numerically:
# "No" -> 0, "Yes" -> 1.
y = df.pop("RainTomorrow")
y = (y == "Yes").to_numpy().astype(int)
X = df  # remaining columns are the (unscaled) features

# Split train:test = 70:30 BEFORE scaling, so the scaler never sees test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the min-max scaler on the training split only and apply the same
# transformation to both splits. Training features end up in [0, 1]; test
# features may fall slightly outside that range when a test value exceeds the
# training min/max, which is the expected behaviour for unseen data.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Decision Tree

In [None]:
# Gini-based decision tree, limited to depth 6 with at least 150 samples per
# leaf and 13 candidate features per split; seeded for repeatable fits.
dt = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_leaf=150,
    max_features=13,
    random_state=42,
)
dt = dt.fit(X_train, y_train)

# Predict the class of every held-out sample.
y_pred = dt.predict(X_test)

# classification evaluation: print each banner followed by its score,
# rounded to five decimals.
for banner, score_fn in (
    ("=========== F1 SCORE ===========", f1_score),
    ("======== ACCURACY SCORE ========", accuracy_score),
    ("======= PRECISION SCORE ========", precision_score),
    ("========= RECALL SCORE =========", recall_score),
):
    print(banner)
    print(score_fn(y_test, y_pred).round(5))
print("================================\n")

# Confusion matrix of true vs. predicted labels, drawn with the magma colormap.
c_m = confusion_matrix(y_test, y_pred)
dis_cm = ConfusionMatrixDisplay(
    confusion_matrix=c_m,
    display_labels=["NotRain", "Rain"],
)
dis_cm.plot(cmap=plt.cm.magma)
# The trailing semicolon keeps the notebook from echoing the returned Text object.
dis_cm.ax_.set_title(
    "\nAustralia Weather Dataset Confusion Matrix\n(Decision Tree)\n",
    fontsize=12.5,
);
# Uncomment to save the figure to disk:
# dis_cm.ax_.figure.savefig("decision_tree.png", bbox_inches="tight")

## Random Forest

In [None]:
# Forest of 250 entropy-based trees, each limited to depth 7 with at least
# 100 samples per leaf and 11 candidate features per split; seeded.
rf_params = {
    "n_estimators": 250,
    "criterion": "entropy",
    "max_depth": 7,
    "min_samples_leaf": 100,
    "max_features": 11,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Predict the class of every held-out sample.
y_pred = rf.predict(X_test)

# classification evaluation: banners and scorers are paired up and printed
# in order, each score rounded to five decimals.
banners = (
    "=========== F1 SCORE ===========",
    "======== ACCURACY SCORE ========",
    "======= PRECISION SCORE ========",
    "========= RECALL SCORE =========",
)
scorers = (f1_score, accuracy_score, precision_score, recall_score)
for banner, scorer in zip(banners, scorers):
    print(banner)
    print(scorer(y_test, y_pred).round(5))
print("================================\n")

# Confusion matrix of true vs. predicted labels, drawn with the magma colormap.
c_m = confusion_matrix(y_test, y_pred)
dis_cm = ConfusionMatrixDisplay(confusion_matrix=c_m, display_labels=["NotRain", "Rain"])
dis_cm.plot(cmap=plt.cm.magma)
# The trailing semicolon keeps the notebook from echoing the returned Text object.
dis_cm.ax_.set_title(
    "\nAustralia Weather Dataset Confusion Matrix\n(Random Forest)\n",
    fontsize=12.5,
);
# Uncomment to save the figure to disk:
# dis_cm.ax_.figure.savefig("random_forest.png", bbox_inches="tight")

## Logistic Regression (Sklearn)

In [None]:
# Logistic regression fitted with the SAG solver (at most 250 iterations,
# stopping tolerance 0.005, seeded).
# The former `multi_class="ovr"` argument is intentionally omitted: it is
# deprecated in recent scikit-learn releases, and for a binary target the
# default setting already fits the same single one-vs-rest problem.
logreg = LogisticRegression(max_iter=250, solver="sag", tol=0.005, random_state=42)

logreg.fit(X_train, y_train)

# Predict the class of every held-out sample.
y_pred = logreg.predict(X_test)

# classification evaluation (each score rounded to five decimals)
print("=========== F1 SCORE ===========")
print(f1_score(y_test, y_pred).round(5))
print("======== ACCURACY SCORE ========")
print(accuracy_score(y_test, y_pred).round(5))
print("======= PRECISION SCORE ========")
print(precision_score(y_test, y_pred).round(5))
print("========= RECALL SCORE =========")
print(recall_score(y_test, y_pred).round(5))
print("================================\n")

# Confusion matrix of true vs. predicted labels, drawn with the magma colormap.
c_m = confusion_matrix(y_test, y_pred)
dis_cm = ConfusionMatrixDisplay(confusion_matrix=c_m, display_labels=["NotRain", "Rain"])

dis_cm.plot(cmap=plt.cm.magma)
# The trailing semicolon keeps the notebook from echoing the returned Text object.
dis_cm.ax_.set_title("\nAustralia Weather Dataset Confusion Matrix\n(Logistic Regression (Sklearn))\n", fontsize=12.5);
# Uncomment to save the figure to disk:
# dis_cm.ax_.figure.savefig("logit_sklearn.png", bbox_inches="tight")

## Logistic Regression (Scratch Code)

In [None]:
# Build design matrices with a leading intercept (bias) column of ones for the
# train and test datasets. They get their own names so that X_train / X_test
# are NOT overwritten: re-running this cell no longer stacks an extra column of
# ones each time, and cells that run afterwards still see the plain features.
X_train_b = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test_b = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

# define algorithm (mini-batch gradient descent)
theta = np.zeros(X_train_b.shape[1])  # one weight per column, all starting at zero
alpha = 0.0005  # learning rate
epochs = 10000  # number of parameter updates (one mini-batch per update)
batch_size = max(1, int(0.1 * X_train_b.shape[0]))  # 10% of the training samples, at least 1
rng = np.random.default_rng(42)  # seeded so the fitted theta is reproducible


def gradient(X, err):
    """Return the (unnormalised) log-loss gradient X^T @ err for one batch."""
    return X.T @ err


def sigmoid(x):
    """Return the element-wise logistic function 1 / (1 + exp(-x))."""
    return 1 / (1 + np.exp(-x))


def h_theta(X, theta):
    """Return the hypothesis sigmoid(X @ theta), i.e. the predicted probabilities."""
    return sigmoid(X @ theta)


def output(y_pred):
    """Round predicted probabilities to the nearest class label (0.0 or 1.0)."""
    return np.round(y_pred)


# Each update uses one contiguous window of `batch_size` rows starting at a
# random offset; the +1 makes the last possible window reachable and keeps the
# range non-empty when the batch covers the whole training set.
last_start = X_train_b.shape[0] - batch_size
for _ in range(epochs):
    ix = rng.integers(0, last_start + 1)  # random batch starting point
    batch_X = X_train_b[ix:ix + batch_size]
    batch_y = y_train[ix:ix + batch_size]

    err = h_theta(batch_X, theta) - batch_y  # prediction error with the current theta
    grad = gradient(batch_X, err)
    theta = theta - alpha * grad  # gradient-descent step

# Final predictions on the test set, using the last theta found.
y_pred = output(h_theta(X_test_b, theta))

# classification evaluation: print each banner followed by its score,
# rounded to five decimals.
report = {
    "=========== F1 SCORE ===========": f1_score,
    "======== ACCURACY SCORE ========": accuracy_score,
    "======= PRECISION SCORE ========": precision_score,
    "========= RECALL SCORE =========": recall_score,
}
for banner, scorer in report.items():
    print(banner)
    print(scorer(y_test, y_pred).round(5))
print("================================\n")

# Confusion matrix of true vs. predicted labels, drawn with the magma colormap.
c_m = confusion_matrix(y_test, y_pred)
dis_cm = ConfusionMatrixDisplay(
    confusion_matrix=c_m,
    display_labels=["NotRain", "Rain"],
)
dis_cm.plot(cmap=plt.cm.magma)
# The trailing semicolon keeps the notebook from echoing the returned Text object.
dis_cm.ax_.set_title(
    "\nAustralia Weather Dataset Confusion Matrix\n(Logistic Regression (Scratch))\n",
    fontsize=12.5,
);
# Uncomment to save the figure to disk:
# dis_cm.ax_.figure.savefig("logit_scratch.png", bbox_inches="tight")

## Long Short Term Memory (LSTM) (TensorFlow)

In [None]:
# Add a trailing axis of length 1 so each sample has shape (n_columns, 1):
# every column of the 2-D input becomes one timestep carrying a single value.
X_train_lstm = np.expand_dims(X_train, axis=-1)
X_test_lstm = np.expand_dims(X_test, axis=-1)

# One 64-unit LSTM layer followed by a single sigmoid output unit.
model_lstm = Sequential([
    LSTM(units=64, input_shape=(X_train_lstm.shape[1], 1)),
    Dense(1, activation="sigmoid"),
])
model_lstm.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=[mt.Recall(), mt.Precision(), mt.BinaryAccuracy()],
)

# Train for 20 epochs in batches of 16; the test split is passed as
# validation data, so its metrics are reported after every epoch.
model_lstm.fit(
    X_train_lstm,
    y_train,
    epochs=20,
    batch_size=16,
    validation_data=(X_test_lstm, y_test),
)

# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels.
test_probabilities = model_lstm.predict(X_test_lstm)
y_pred_lstm = (test_probabilities > 0.5).astype(int)

# Evaluate LSTM model: print each banner followed by its score,
# rounded to five decimals.
for banner, scorer in (
    ("=========== F1 SCORE ===========", f1_score),
    ("======== ACCURACY SCORE ========", accuracy_score),
    ("======= PRECISION SCORE ========", precision_score),
    ("========= RECALL SCORE =========", recall_score),
):
    print(banner)
    print(scorer(y_test, y_pred_lstm).round(5))

# Confusion Matrix for LSTM, drawn with the magma colormap.
cm_lstm = confusion_matrix(y_test, y_pred_lstm)
disp_lstm = ConfusionMatrixDisplay(
    confusion_matrix=cm_lstm,
    display_labels=["NotRain", "Rain"],
)
disp_lstm.plot(cmap=plt.cm.magma)
# The trailing semicolon keeps the notebook from echoing the returned Text object.
plt.title("\nAustralia Weather Dataset Confusion Matrix\n(LSTM)\n", fontsize=12.5);
# Uncomment to save the figure to disk:
# plt.savefig("lstm.png", bbox_inches="tight")