# Setup

In [1]:
# Import modules
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from imblearn.over_sampling import ADASYN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score

In [2]:
# Create/Map directories
# All paths are anchored at the current working directory. "data" is an input
# directory and is only mapped; "figure" and "result" are output directories
# and are created when missing.
data_path = os.path.join(os.getcwd(), "data")
figure_path = os.path.join(os.getcwd(), "figure")
result_path = os.path.join(os.getcwd(), "result")
for output_dir in (figure_path, result_path):
    # exist_ok=True avoids the check-then-create race of os.path.exists + os.mkdir
    os.makedirs(output_dir, exist_ok=True)

# Import

In [None]:
# Load every sheet of the workbook (sheet_name=None returns one DataFrame per
# sheet, keyed by sheet name) and stack the sheets into a single frame.
workbook_file = os.path.join(data_path, "full.xlsx")
sheet_to_df_map = pd.read_excel(workbook_file, sheet_name=None)
full = pd.concat(list(sheet_to_df_map.values()), axis=0, ignore_index=True)

# Load the recategorised data set used in the modelling section.
recategorised_file = os.path.join(data_path, "recategorised_data.csv")
recategorised_data = pd.read_csv(recategorised_file)

# Tidy

In [None]:
# Entero readings at or above this value are labelled as the positive class.
ENTERO_THRESHOLD = 280

# Drop incomplete rows *before* labelling: the original `full.dropna()` discarded
# its result, so rows with a missing Entero were kept and silently labelled 0
# (NaN >= threshold evaluates to False). The raw Entero column is removed once
# the binary Class label has been derived from it.
full = (
    full.dropna()
    .assign(Class=lambda df: np.where(df["Entero"] >= ENTERO_THRESHOLD, 1, 0))
    .drop(columns=["Entero"])
)

# Remove the columns that are not used for modelling and parse DATE.
# pd.to_datetime replaces astype('datetime64'): the unit-less dtype string is
# rejected by recent pandas versions.
recategorised_data = (
    recategorised_data
    .drop(columns=["Entero", "RainWA", "BeachName", "Wspeed", "Wdirection", "BeachDirection"])
    .assign(DATE=lambda df: pd.to_datetime(df["DATE"]))
)

# Move the label column "Entero_level" to the last position; every other
# column keeps its relative order.
other_columns = [c for c in recategorised_data.columns if c != "Entero_level"]
recategorised_data = recategorised_data.reindex(columns=other_columns + ["Entero_level"])

# Exploration

In [None]:
# Raw data (preview of the first rows only, instead of rendering the whole frame)
full.head()

In [None]:
# Stats: descriptive statistics of `full`, exported as CSV into the figure directory
stats_file = os.path.join(figure_path, "stats.csv")
figure = full.describe()
figure.to_csv(stats_file)

In [None]:
# Scatter plots: pairwise plots of all columns of `full`, coloured by Class
pair_grid = sns.pairplot(full, hue="Class")
figure = pair_grid.figure
figure.set_size_inches(16, 10)
scatter_file = os.path.join(figure_path, "scatter_plots")
figure.savefig(scatter_file, bbox_inches="tight")

In [None]:
# Correlation heatmap: pairwise correlations between the columns of `full`
correlations = full.corr()
heatmap_axes = sns.heatmap(correlations)
figure = heatmap_axes.get_figure()
figure.set_size_inches(16, 10)
heatmap_file = os.path.join(figure_path, "correlation_heatmap")
figure.savefig(heatmap_file, bbox_inches="tight")

# Modelling

## Setup

In [None]:
# Summary of the modelling data as printed by DataFrame.info().
# (A stray bare `confusion_matrix` expression was removed: it only made the
# cell display the function object.)
recategorised_data.info()

In [None]:
# Set random state (seeds NumPy's global random generator)
RANDOM_STATE = 1234
np.random.seed(RANDOM_STATE)

# Order data by date so that positional splits of the frame follow time
recategorised_data = recategorised_data.sort_values(by=['DATE'])
# NOTE(review): the original cell also called `recategorised_data.drop("DATE", axis=1)`
# without assigning the result, so DATE was never actually removed. That dead
# statement is dropped here and DATE stays in the frame. `extract` skips the
# first column - presumably DATE; confirm the column order before dropping DATE
# for real, otherwise a genuine feature would be skipped instead.

# Extract features and label
def extract(dataframe):
    """Split a frame into a feature matrix and a label vector.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose last column is the label. The first column is skipped
        (presumably the DATE column - confirm against the column order).

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: ``X`` holds every column except the first and the last,
        ``y`` holds the last column.
    """
    feature_part = dataframe.iloc[:, 1:-1]
    label_part = dataframe.iloc[:, -1]
    # np.array builds fresh arrays, so callers can overwrite slices of X
    # in place without touching the source frame.
    return np.array(feature_part), np.array(label_part)

# Hyperparameter candidates to tune (ten values, 0..9)
n = range(10)

# Outer splitter: time-series split with 4 splits
ts = TimeSeriesSplit(n_splits=4)

# Inner splitter: 5-fold cross-validation
kf = KFold(n_splits=5)

# Result buffer, one cell per (time-series split, CV fold, hyperparameter candidate)
result_shape = (ts.n_splits, kf.n_splits, len(n))
a = np.zeros(result_shape)

## KNN

In [None]:
# Nested evaluation of KNN.
# Outer loop: split the date-ordered data chronologically into train/test folds.
# Inner loop: K-fold cross-validate each outer train fold and store, for every
# candidate number of neighbours, the recall obtained on the validation part.
for ts_idx, (train_index, test_index) in enumerate(ts.split(recategorised_data)):
    # `test` is not used in this cell; it is kept in case a later evaluation step needs it.
    train, test = recategorised_data.iloc[train_index, :], recategorised_data.iloc[test_index, :]

    # Cross-validate the outer train fold to score the hyperparameter candidates.
    # The inner indices get their own name so they no longer shadow `train_index`.
    for kf_idx, (fit_index, validation_index) in enumerate(kf.split(train)):
        X_train, y_train = extract(train.iloc[fit_index, :])
        X_valdn, y_valdn = extract(train.iloc[validation_index, :])

        # Cast to float so the in-place assignments below cannot truncate the
        # normalised values when the feature columns are integer-typed.
        X_train, X_valdn = X_train.astype(float), X_valdn.astype(float)

        # Normalise the first three feature columns of the train and validation parts.
        # Normalizer rescales every row to unit norm independently, so fit() learns
        # nothing from X_train and no information leaks between the two parts.
        transfromer = preprocessing.Normalizer().fit(X_train[:, 0:3])
        X_train[:, 0:3] = transfromer.transform(X_train[:, 0:3])
        X_valdn[:, 0:3] = transfromer.transform(X_valdn[:, 0:3])

        # Oversample the minority class of the train part only, with ADASYN.
        # The explicit seed keeps the result independent of how much of NumPy's
        # global random state earlier cells have consumed.
        adasyn = ADASYN(sampling_strategy="minority", n_neighbors=1, random_state=RANDOM_STATE)
        X_train, y_train = adasyn.fit_resample(X_train, y_train)

        # Candidate i is evaluated as a KNN with i+1 neighbours (n_neighbors must be >= 1)
        for i in n:
            knn = KNeighborsClassifier(n_neighbors=i+1)
            knn.fit(X_train, y_train)
            a[ts_idx, kf_idx, i] = recall_score(y_valdn, knn.predict(X_valdn))

# Average the recall over all outer splits and inner folds, then report the best candidate
hyperparameters = np.mean(a, axis=(0,1))
print("The best KNN is with %s neighbour(s)" %(np.argmax(hyperparameters)+1))