In [1]:
# Import modules
import os
import math
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from imblearn.over_sampling import SMOTENC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

#from autorank import autorank, plot_stats, create_report, latex_table

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the recategorised dataset from a CSV file (path is relative to the working directory)
recategorised_data = pd.read_csv("recategorised_data.csv")

In [3]:
# Discard the columns that are not carried forward into the modelling steps
unused_columns = [
    "Entero",
    "RainWA",
    "BeachName",
    "Wspeed_level",
    "Wdirection_level",
    "BeachDirection",
]
recategorised_data = recategorised_data.drop(columns=unused_columns)

In [4]:
# Recategorised data for modelling

# Convert DATE to datetime so the rows can be ordered chronologically.
# pd.to_datetime is used instead of astype("datetime64"): the unit-less dtype
# string raises TypeError on pandas >= 2.0.
recategorised_data["DATE"] = pd.to_datetime(recategorised_data["DATE"])

# Only BeachType and on_offshore are handled as categorical features here
# (Wspeed_level and Wdirection_level are dropped in the previous cell).
for column in ["BeachType", "on_offshore"]:
    # Category dtype is what the oversampling step later uses to detect these features
    recategorised_data[column] = recategorised_data[column].astype("category")

    # Scale the level values down by the number of distinct levels.
    # NOTE(review): assumes the levels are numeric codes -- confirm against the CSV.
    # The count is computed once instead of on every lambda call.
    n_levels = recategorised_data[column].nunique()
    recategorised_data[column] = recategorised_data[column].apply(lambda level: level / n_levels)

# Rearrange the columns so the label (Entero_level) comes last, after all features
feature_columns = [name for name in recategorised_data.columns if name != "Entero_level"]
recategorised_data = recategorised_data.reindex(columns=feature_columns + ["Entero_level"])

In [5]:
# Summarise the prepared frame: column names, non-null counts and dtypes
recategorised_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2017 entries, 0 to 2016
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   DATE          2017 non-null   datetime64[ns]
 1   Rain24        2017 non-null   float64       
 2   Rain48        2017 non-null   float64       
 3   Rain72        2017 non-null   float64       
 4   Wdirection    2017 non-null   int64         
 5   Wspeed        2017 non-null   float64       
 6   Solarhours    2017 non-null   float64       
 7   BeachType     2017 non-null   category      
 8   on_offshore   2017 non-null   category      
 9   Entero_level  2017 non-null   int64         
dtypes: category(2), datetime64[ns](1), float64(5), int64(2)
memory usage: 130.4 KB


In [6]:
# Seed NumPy's global random number generator
RANDOM_STATE = 1234
np.random.seed(RANDOM_STATE)

# Number of most recent rows held back as the test set
test_size = 201

# Number of splits for the time-series splitter
n_split = 5

# Number of observations at each step
n_obsn = 40

# Put the rows in chronological order, then discard the DATE column
recategorised_data = recategorised_data.sort_values(by="DATE").drop(columns="DATE")

# Extract features and label (X: predictor variables, y: response variable in the last column)
def extract(dataframe: pd.DataFrame) -> "tuple[pd.DataFrame, pd.Series]":
    """Split a frame into predictors and response.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame whose last column is the response and whose other columns are
        the predictors.

    Returns
    -------
    X : pd.DataFrame
        All columns except the last one.
    y : pd.Series
        The last column.

    Both results are copies, so callers may modify them in place (e.g. write
    scaled values back into ``X``) without touching ``dataframe`` and without
    triggering pandas' chained-assignment warning.
    """
    X = dataframe.iloc[:, :-1].copy()
    y = dataframe.iloc[:, -1].copy()
    return X, y

# Time-series splitter used for the rolling "cross validation"
ts = TimeSeriesSplit(n_splits=n_split)

# Train/validation set: everything except the last `test_size` rows,
# which are kept aside as the test set for evaluation
cv_dataset = recategorised_data.iloc[:-test_size]
print("Train/validation set info:")
cv_dataset.info()

# Empty frames, one column per model, that store model performances for autorank
model_names = ["KNN", "BDT", "ANN"]
df, dfAcc, dfSp = (pd.DataFrame(columns=model_names) for _ in range(3))

Train/validation set info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1816 entries, 0 to 1157
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Rain24        1816 non-null   float64 
 1   Rain48        1816 non-null   float64 
 2   Rain72        1816 non-null   float64 
 3   Wdirection    1816 non-null   int64   
 4   Wspeed        1816 non-null   float64 
 5   Solarhours    1816 non-null   float64 
 6   BeachType     1816 non-null   category
 7   on_offshore   1816 non-null   category
 8   Entero_level  1816 non-null   int64   
dtypes: category(2), float64(5), int64(2)
memory usage: 117.3 KB


In [7]:
# Feature correlation analysis (pairwise correlation between the numeric columns).
# The numeric columns are selected explicitly: whether DataFrame.corr() silently
# skips the category columns depends on the pandas version, so relying on the
# default makes the table version-dependent.
print("Feature correlation analysis:")
print(cv_dataset.select_dtypes(include="number").corr())

Feature correlation analysis:
                Rain24    Rain48    Rain72  Wdirection    Wspeed  Solarhours  \
Rain24        1.000000  0.708130  0.538913    0.017648  0.070324   -0.241831   
Rain48        0.708130  1.000000  0.828864    0.055917  0.085008   -0.173784   
Rain72        0.538913  0.828864  1.000000    0.055594  0.075347   -0.169193   
Wdirection    0.017648  0.055917  0.055594    1.000000 -0.062360    0.139224   
Wspeed        0.070324  0.085008  0.075347   -0.062360  1.000000   -0.071385   
Solarhours   -0.241831 -0.173784 -0.169193    0.139224 -0.071385    1.000000   
Entero_level  0.170314  0.182479  0.166256   -0.052237  0.089490   -0.051465   

              Entero_level  
Rain24            0.170314  
Rain48            0.182479  
Rain72            0.166256  
Wdirection       -0.052237  
Wspeed            0.089490  
Solarhours       -0.051465  
Entero_level      1.000000  


In [8]:
# Feature selection with time split validation:
#
# For every candidate number of features and every time-ordered split, RFE
# (with an AdaBoost estimator) is fitted on the oversampled training part and
# scored on the validation part of that split.

# Number of candidate features: every column of cv_dataset except the label (last column)
n_features = cv_dataset.shape[1] - 1

# Pre-allocate space for results.
# Column k holds the result for "k features selected"; column 0 is never
# written, so it stays 0 (scores) / [] (feature lists).
acc = np.zeros((ts.n_splits, n_features + 1))
sen = np.zeros((ts.n_splits, n_features + 1))
spe = np.zeros((ts.n_splits, n_features + 1))
# One independent list per slot (`[[]] * k` would alias a single list k times)
tsfs = [[[] for _ in range(n_features + 1)] for _ in range(ts.n_splits)]
c = cv_dataset.iloc[:, 0:n_features].columns

for feature_number in range(1, n_features + 1):

    # Multi-split the data into train sets and validation sets in a timely manner
    for ts_idx, (train_index, validation_index) in enumerate(ts.split(cv_dataset)):
        train, validation = cv_dataset.iloc[train_index, :], cv_dataset.iloc[validation_index, :]

        X_train, y_train = extract(train)
        X_valdn, y_valdn = extract(validation)

        # Work on private copies so the column assignments below cannot write
        # through to `train` / `validation`
        X_train, X_valdn = X_train.copy(), X_valdn.copy()

        # 1-D boolean mask of the categorical features; the remaining columns are numeric
        is_categorical = (X_train.dtypes == "category").to_numpy()
        numeric_columns = X_train.columns[~is_categorical]

        # Normalise the numeric features of both sets with the same transformer.
        # NOTE(review): Normalizer rescales each *row* to unit norm and learns
        # nothing in fit(); if per-feature scaling was intended, a different
        # transformer is needed -- confirm.
        # Whole columns are replaced (rather than written through .iloc) so the
        # integer column can take the float results without an in-place upcast.
        transfromer = preprocessing.Normalizer().fit(X_train[numeric_columns])
        X_train[numeric_columns] = transfromer.transform(X_train[numeric_columns])
        X_valdn[numeric_columns] = transfromer.transform(X_valdn[numeric_columns])

        # Oversample the train set with SMOTENC.
        # categorical_features must be a flat mask/index array, not a list wrapping a Series.
        # NOTE(review): no random_state is passed here or to AdaBoost, so both
        # draw from NumPy's global generator (seeded in the configuration cell);
        # results are therefore only reproducible on a full top-to-bottom run.
        smotenc = SMOTENC(categorical_features=is_categorical, sampling_strategy="minority", k_neighbors=1)
        X_train, y_train = smotenc.fit_resample(X_train, y_train)

        # Run feature selection
        # Feature extraction-Recursive Feature Elimination (or RFE).
        # DecisionTreeClassifier() or RandomForestClassifier() can be swapped in as the estimator.
        rfe = RFE(estimator=AdaBoostClassifier(), n_features_to_select=feature_number)

        fit = rfe.fit(X_train, y_train)
        print("Num Features: %s" % (fit.n_features_))
        print("Selected Features: %s" % (c[fit.support_].values))
        print("Feature Ranking: %s" % (fit.ranking_))
        tsfs[ts_idx][feature_number] = list(c[fit.support_].values)

        # Confusion matrix on the validation set (rows: true label, columns: predicted label).
        # NOTE(review): the indexing below assumes a binary label whose positive
        # class sorts second, and that both classes show up in this split -- confirm.
        cm1 = confusion_matrix(y_valdn, fit.predict(X_valdn))
        total1 = cm1.sum()
        acc[ts_idx, feature_number] = (cm1[0, 0] + cm1[1, 1]) / total1
        sen[ts_idx, feature_number] = cm1[1, 1] / (cm1[1, 0] + cm1[1, 1])
        spe[ts_idx, feature_number] = cm1[0, 0] / (cm1[0, 0] + cm1[0, 1])
print("Accuracy:",acc)
print("Sensitivity:",sen)
print("Specificity:",spe)

Num Features: 1
Selected Features: ['Solarhours']
Feature Ranking: [8 7 4 2 3 1 5 6]
Num Features: 1
Selected Features: ['Rain72']
Feature Ranking: [3 6 1 2 4 5 7 8]
Num Features: 1
Selected Features: ['Rain48']
Feature Ranking: [5 1 4 2 6 3 8 7]
Num Features: 1
Selected Features: ['Solarhours']
Feature Ranking: [5 4 6 3 2 1 8 7]
Num Features: 1
Selected Features: ['Rain72']
Feature Ranking: [6 5 1 4 3 2 8 7]
Num Features: 2
Selected Features: ['Rain72' 'Solarhours']
Feature Ranking: [4 6 1 3 2 1 7 5]
Num Features: 2
Selected Features: ['Rain72' 'Wdirection']
Feature Ranking: [4 3 1 1 5 2 6 7]
Num Features: 2
Selected Features: ['Rain24' 'Rain48']
Feature Ranking: [1 1 5 2 3 4 7 6]
Num Features: 2
Selected Features: ['Rain48' 'Solarhours']
Feature Ranking: [3 1 4 5 2 1 7 6]
Num Features: 2
Selected Features: ['Rain72' 'Wspeed']
Feature Ranking: [5 4 1 2 1 3 7 6]
Num Features: 3
Selected Features: ['Wdirection' 'Wspeed' 'Solarhours']
Feature Ranking: [5 3 2 1 1 1 6 4]
Num Features: 3
Se

In [9]:
#print("Best accuracy:", np.max(acc), "Features selected:", tsfs[math.floor(np.argmax(acc)/9)][np.argmax(acc)%9])
#print("Best sensitivity:", np.max(sen), "Features selected:", tsfs[math.floor(np.argmax(sen)/9)][np.argmax(sen)%9])
#print("Best specificity:", np.max(spe), "Features selected:", tsfs[math.floor(np.argmax(spe)/9)][np.argmax(spe)%9])

In [10]:
# Best feature combination by average accuracy:
# find the feature count with the highest accuracy averaged over the splits,
# then show which features each split selected for that count.
acc_mean = np.mean(acc, axis=0)
best_by_acc = np.argmax(acc_mean)
# Iterate over the stored splits themselves rather than a hard-coded count of 5
for split_features in tsfs:
    print(split_features[best_by_acc], end=" \n")

['Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours', 'on_offshore'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Solarhours', 'BeachType'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours'] 


In [11]:
# Best feature combination by average sensitivity:
# find the feature count with the highest sensitivity averaged over the splits,
# then show which features each split selected for that count.
sen_mean = np.mean(sen, axis=0)
best_by_sen = np.argmax(sen_mean)
# Iterate over the stored splits themselves rather than a hard-coded count of 5
for split_features in tsfs:
    print(split_features[best_by_sen], end=" \n")

['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours', 'BeachType', 'on_offshore'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours', 'BeachType', 'on_offshore'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours', 'BeachType', 'on_offshore'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours', 'BeachType', 'on_offshore'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours', 'BeachType', 'on_offshore'] 


In [12]:
# Best feature combination by average specificity:
# find the feature count with the highest specificity averaged over the splits,
# then show which features each split selected for that count.
spe_mean = np.mean(spe, axis=0)
best_by_spe = np.argmax(spe_mean)
# Iterate over the stored splits themselves rather than a hard-coded count of 5
for split_features in tsfs:
    print(split_features[best_by_spe], end=" \n")

['Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours', 'on_offshore'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Solarhours', 'BeachType'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours'] 
['Rain24', 'Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours'] 


In [13]:
# Choose feature combination for the final model:
# take the (split, feature count) entry with the single highest sensitivity.
# unravel_index turns the flat argmax into (row, column) from sen's real shape
# instead of hard-coding its width; nanargmax skips NaN entries (a split with no
# positive validation samples gives 0/0), which plain argmax would pick as "best".
best_split, best_count = np.unravel_index(np.nanargmax(sen), sen.shape)
col = list(tsfs[best_split][best_count])
print(col)
# on_offshore is always kept, and the label column goes last
if 'on_offshore' not in col:
    col = col + ['on_offshore']
col = col + ['Entero_level']
#col = ['Rain48', 'Rain72', 'Wdirection', 'Wspeed', 'Solarhours', 'BeachType', 'on_offshore']
col

['Wdirection', 'Wspeed', 'Solarhours']


['Wdirection', 'Wspeed', 'Solarhours', 'on_offshore', 'Entero_level']

In [14]:
# Restricting the train/validation set to the chosen columns is currently disabled:
#cv_dataset = cv_dataset[col]
# Summarise the train/validation frame: column names, non-null counts and dtypes
cv_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1816 entries, 0 to 1157
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Rain24        1816 non-null   float64 
 1   Rain48        1816 non-null   float64 
 2   Rain72        1816 non-null   float64 
 3   Wdirection    1816 non-null   int64   
 4   Wspeed        1816 non-null   float64 
 5   Solarhours    1816 non-null   float64 
 6   BeachType     1816 non-null   category
 7   on_offshore   1816 non-null   category
 8   Entero_level  1816 non-null   int64   
dtypes: category(2), float64(5), int64(2)
memory usage: 117.3 KB
