# Setup

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from imblearn.over_sampling import ADASYN
from sklearn.metrics import recall_score

In [None]:
# Map directories (all relative to the current working directory)
data_path = os.path.join(os.getcwd(), "data")
plot_path = os.path.join(os.getcwd(), "plot")
result_path = os.path.join(os.getcwd(), "result")

# The notebook writes into plot/ and result/ further down; create them up
# front so a fresh checkout does not fail on the first save. data/ is left
# alone on purpose: a missing input folder should surface as a read error.
for output_dir in (plot_path, result_path):
    os.makedirs(output_dir, exist_ok=True)

# Import

In [None]:
# Read all required sheets from the workbook in one call. Passing a list as
# sheet_name makes read_excel return a dict keyed by sheet name, so the file
# is opened and parsed once instead of once per sheet.
workbook_path = os.path.join(data_path, "full.xlsx")
sheet_names = ["Clarks", "Narrow Neck", "Judges Bay", "Weymouth", "Milford"]
sheets = pd.read_excel(workbook_path, sheet_name=sheet_names)

# Keep the per-sheet frames available under their original names.
clarks = sheets["Clarks"]
narrow_neck = sheets["Narrow Neck"]
judges_bay = sheets["Judges Bay"]
weymouth = sheets["Weymouth"]
milford = sheets["Milford"]

# Stack the sheets row-wise (same order as sheet_names) with a fresh 0..N-1 index.
full = pd.concat([sheets[name] for name in sheet_names], axis=0, ignore_index=True)

# Tidy

In [None]:
# Drop rows with missing values. dropna() returns a new frame rather than
# modifying in place, so the result has to be assigned back; otherwise the
# incomplete rows stay in `full` (and a missing "Entero" would be labelled 0
# below, because NaN >= 280 evaluates to False).
full = full.dropna()

# Binary label: 1 when "Entero" is at or above 280, otherwise 0.
full["Class"] = np.where(full["Entero"] >= 280, 1, 0)

# Remove the raw "Entero" column now that the label has been derived from it.
full = full.drop(["Entero"], axis=1)

# Exploration

In [None]:
# Raw data
# Left as a bare expression so the notebook renders the combined table as the
# cell output.
full

In [None]:
# Stats
# Descriptive statistics of `full`, saved as stats.csv in the result folder.
stats_file = os.path.join(result_path, "stats.csv")
figure = full.describe()
figure.to_csv(stats_file)

In [None]:
# Scatter plots
# Pairwise plot grid of `full`, coloured by the "Class" label, saved to the
# plot folder.
pair_grid = sns.pairplot(data=full, hue="Class")
figure = pair_grid.figure
figure.set_size_inches(16, 10)
scatter_file = os.path.join(plot_path, "scatter_plots")
figure.savefig(scatter_file, bbox_inches="tight")

In [None]:
# Correlation heatmap
# Heatmap of the pairwise column correlations of `full`, saved to the plot
# folder.
correlations = full.corr()
heatmap_axes = sns.heatmap(correlations)
figure = heatmap_axes.get_figure()
figure.set_size_inches(16, 10)
heatmap_file = os.path.join(plot_path, "correlation_heatmap")
figure.savefig(heatmap_file, bbox_inches="tight")

# Modelling

## Setup

In [None]:
# Features and label
# Positional slicing: the last column is the label; the features are every
# column between the first and the last (the first column is left out).
y = full.iloc[:, -1]
X = full.iloc[:, 1:-1]

# Loops
r = range(100)  # number of repetitions
n = range(10)   # hyperparameters to tune
# Score table, pre-allocated: one row per repetition, one column per
# hyperparameter candidate.
s = np.zeros(shape=(len(r), len(n)))

## KNN

In [None]:
# Repeat the split / fit / score cycle and average the scores over all
# repetitions. s[i, j] holds the recall of repetition i for n_neighbors = j + 1.
for i in r:
    # Split the train set and the test set. Stratify on the label so the small
    # (10 %) test set keeps the class proportions of the full data; without it
    # a random split of imbalanced data can leave the test set with no positive
    # samples, which makes recall undefined for that repetition.
    # (train_test_split raises ValueError if a class has fewer than 2 samples.)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, stratify=y
    )

    # Fit the transformer to X_train, and then use it to transform both the
    # train set and the test set.
    # NOTE(review): Normalizer rescales each sample (row) to unit norm and
    # learns nothing in fit(). If per-feature scaling was intended, a scaler
    # such as StandardScaler would be needed instead - confirm.
    transformer = preprocessing.Normalizer().fit(X_train)
    X_train = transformer.transform(X_train)
    X_test = transformer.transform(X_test)

    # Oversample the train set only with ADASYN; the test set is left untouched
    # so the score reflects the original class balance.
    ada = ADASYN(sampling_strategy="minority")
    X_train, y_train = ada.fit_resample(X_train, y_train)

    # Tune hyperparameters: index j maps to n_neighbors = j + 1
    for j in n:
        knn = KNeighborsClassifier(n_neighbors=j + 1)
        knn.fit(X_train, y_train)
        s[i, j] = recall_score(y_test, knn.predict(X_test))

# Mean recall per candidate across repetitions; argmax is 0-based, hence the +1.
results = np.mean(s, axis=0)
print("The best KNN is with %s neighbour(s)" %(np.argmax(results)+1))