# Machine Learning


In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'white' # Since I use a dark IDE

# To allow multiple outputs per cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection  import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


# Data Preparation

## Reading in Data

In [None]:
# `os` is not imported by the notebook's import cell, so bring it in here.
import os

# Switch to the project folder so the relative './Data/...' paths used when
# reading the CSV files resolve correctly.
os.chdir('/Users/benjamintan/Library/CloudStorage/OneDrive-TheUniversityofWesternAustralia/Master of Data Science/Year 2/Semester 2/CITS5553/CITS5553-Capstone-Project/ML Model/Benjamin')

In [None]:
## Full data
wba_data = pd.read_csv("./Data/wba_data_CLEAN.csv")

## Normal
X_train_norm = pd.read_csv('./Data/Normal/X_train.csv')
y_train_norm = pd.read_csv('./Data/Normal/y_train.csv')
X_test_norm = pd.read_csv('./Data/Normal/X_test.csv')
y_test_norm = pd.read_csv('./Data/Normal/y_test.csv')

## SMOTE
X_train_smote = pd.read_csv('./Data/Smote Large/X_train_smote.csv')
y_train_smote = pd.read_csv('./Data/Smote Large/y_train_smote.csv')
X_test_smote = pd.read_csv('./Data/Smote Large/X_test_smote.csv')
y_test_smote = pd.read_csv('./Data/Smote Large/y_test_smote.csv')

## Oversampling
X_train_over = pd.read_csv('./Data/Oversampling Large/X_train_over.csv')
y_train_over = pd.read_csv('./Data/Oversampling Large/y_train_over.csv')
X_test_over = pd.read_csv('./Data/Oversampling Large/X_test_over.csv')
y_test_over = pd.read_csv('./Data/Oversampling Large/y_test_over.csv')

## ADASYN
X_train_adasyn = pd.read_csv('./Data/Adasyn Large/X_train_adasyn.csv')
y_train_adasyn = pd.read_csv('./Data/Adasyn Large/y_train_adasyn.csv')
X_test_adasyn = pd.read_csv('./Data/Adasyn Large/X_test_adasyn.csv')
y_test_adasyn = pd.read_csv('./Data/Adasyn Large/y_test_adasyn.csv')

# Every train/test frame loaded so far (the generated data is read further down).
dfs = [X_train_norm, y_train_norm, X_test_norm, y_test_norm,
       X_train_smote, y_train_smote, X_test_smote, y_test_smote,
       X_train_over, y_train_over, X_test_over, y_test_over,
       X_train_adasyn, y_train_adasyn, X_test_adasyn, y_test_adasyn]

# Delete 'Unnamed: 0' columns if they are there
# (presumably a row index written out by an earlier to_csv -- confirm).
for df in dfs:
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

## Generated
# Read only after the clean-up above: `usecols` is taken from
# X_train_norm.columns, so loading earlier would request 'Unnamed: 0' as well
# (either pulling an index column in as a feature, or failing when the
# generated file does not contain it).
X_train_gen = pd.read_csv('./Data/VAE (Experimental)/X_gen_train.csv', usecols=X_train_norm.columns)
y_train_gen = pd.read_csv('./Data/VAE (Experimental)/X_gen_train.csv', usecols=['OverallPoF'])
X_test_gen = pd.read_csv('./Data/VAE (Experimental)/X_gen_test.csv', usecols=X_train_norm.columns)
y_test_gen = pd.read_csv('./Data/VAE (Experimental)/X_gen_test.csv', usecols=['OverallPoF'])

In [None]:
# Columns of the 'reduced' feature set: the non-"f(" columns first, followed
# by the selected "f(...)" columns, in the original order.
_non_freq_feats = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4']
_freq_feats = [
    'f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)',
    'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)',
    'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)',
    'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)',
    'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)',
]
feat_select = _non_freq_feats + _freq_feats

# Machine Learning

In [None]:
def logistic_regression(X_train, y_train, X_test, y_test, vars='reduced', seed=42, cv_folds=5):
    """Grid-search a logistic regression and print its evaluation.

    The function restricts the feature tables to the column set named by
    ``vars``, runs a ``GridSearchCV`` over saga/liblinear settings, and then
    prints, for the best estimator: its repr, the mean/std of a
    ``cross_val_score`` on the training data, and a classification report for
    the training and the test data.

    Parameters
    ----------
    X_train, X_test : DataFrame-like
        Feature tables; columns are picked by name. The 'freqs' and 'conts'
        column lists are derived from ``X_train.columns`` and applied to both.
    y_train, y_test :
        Targets passed to ``fit`` / ``classification_report`` as given.
    vars : str
        'full'          -- keep every column
        'reduced'       -- fixed list of 27 columns
        'freqs'         -- columns whose name starts with "f("
        'conts'         -- columns whose name does not start with "f("
        'freqs_reduced' -- fixed list of 19 "f(" columns
    seed : int
        ``random_state`` of the ``LogisticRegression`` being tuned.
    cv_folds : int
        Number of folds for the ``cross_val_score`` of the best estimator.
        The grid search itself always uses ``cv=2``.

    Returns
    -------
    None. All results are printed.

    Raises
    ------
    ValueError
        If ``vars`` is not one of the names listed above.
    """
    # Resolve the feature set first so a misspelt `vars` fails immediately
    # instead of silently training on every column.
    if vars == 'full':
        columns = None
    elif vars == 'reduced':
        columns = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4','f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
    elif vars == 'freqs':
        columns = [c for c in X_train.columns if c.startswith("f(")]
    elif vars == 'conts':
        columns = [c for c in X_train.columns if not c.startswith("f(")]
    elif vars == 'freqs_reduced':
        # NOTE(review): unlike 'reduced', this list has no 'f(408.4789)' --
        # confirm that the omission is intended.
        columns = ['f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
    else:
        raise ValueError(
            "Unknown vars={!r}; expected 'full', 'reduced', 'freqs', 'conts' "
            "or 'freqs_reduced'".format(vars)
        )

    if columns is not None:
        X_train = X_train[columns]
        X_test = X_test[columns]

    import warnings

    from sklearn.linear_model import LogisticRegression

    # Creating hyperparameters dictionary
    # NOTE(review): recent scikit-learn releases expect penalty=None rather
    # than the string 'none' -- confirm against the installed version.
    # l1_ratio is crossed with every penalty, although scikit-learn documents
    # it as only being used by 'elasticnet'.
    saga_grid = {'solver': ['saga'],
                 'penalty': ['none', 'l1', 'l2', 'elasticnet'],
                 'l1_ratio': [r / 10 for r in range(10)]}
    liblinear_grid = {'solver': ['liblinear'],
                      'penalty': ['l1', 'l2']}

    parameters = [saga_grid, liblinear_grid]

    # Silence warnings only while this function runs; the filters that were
    # active before the call are restored on exit.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Fit GridSearch (2-fold, independent of cv_folds)
        grid_log_reg = GridSearchCV(
            LogisticRegression(random_state=seed),
            parameters,
            cv=2
        )
        grid_log_reg.fit(X_train, y_train)

        # Extract best estimator
        print("Best model: {}".format(grid_log_reg.best_estimator_))
        log_reg = grid_log_reg.best_estimator_

        # Cross validation
        cv_scores = cross_val_score(log_reg, X_train, y_train, cv=cv_folds)
        print("{0}-fold cross validation:\n  accuracy: {1}\n  std dev: {2}".format(cv_folds, round(cv_scores.mean(), 2), round(cv_scores.std(), 2)))

        # classification_report takes (y_true, y_pred); passing them the other
        # way round swaps the precision and recall columns.

        # Train set
        print("Training Data")
        y_train_pred = log_reg.predict(X_train)
        print(classification_report(y_train, y_train_pred))

        # Test set
        print("Test Data")
        y_test_pred = log_reg.predict(X_test)
        print(classification_report(y_test, y_test_pred))

In [None]:
def support_vector_machine(X_train, y_train, X_test, y_test, vars='reduced', seed=42, cv_folds=5):
    """Grid-search a support vector classifier and print its evaluation.

    The function restricts the feature tables to the column set named by
    ``vars``, runs a ``GridSearchCV`` over linear and non-linear ``SVC``
    settings, and then prints, for the best estimator: its repr, the mean/std
    of a ``cross_val_score`` on the training data, and a classification
    report for the training and the test data.

    Parameters
    ----------
    X_train, X_test : DataFrame-like
        Feature tables; columns are picked by name. The 'freqs' and 'conts'
        column lists are derived from ``X_train.columns`` and applied to both.
    y_train, y_test :
        Targets passed to ``fit`` / ``classification_report`` as given.
    vars : str
        'full'          -- keep every column
        'reduced'       -- fixed list of 27 columns
        'freqs'         -- columns whose name starts with "f("
        'conts'         -- columns whose name does not start with "f("
        'freqs_reduced' -- fixed list of 19 "f(" columns
    seed : int
        ``random_state`` of the ``SVC`` being tuned.
    cv_folds : int
        Number of folds for the ``cross_val_score`` of the best estimator.
        The grid search itself always uses ``cv=2``.

    Returns
    -------
    None. All results are printed.

    Raises
    ------
    ValueError
        If ``vars`` is not one of the names listed above.
    """
    # Resolve the feature set first so a misspelt `vars` fails immediately
    # instead of silently training on every column.
    if vars == 'full':
        columns = None
    elif vars == 'reduced':
        columns = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4','f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
    elif vars == 'freqs':
        columns = [c for c in X_train.columns if c.startswith("f(")]
    elif vars == 'conts':
        columns = [c for c in X_train.columns if not c.startswith("f(")]
    elif vars == 'freqs_reduced':
        # NOTE(review): unlike 'reduced', this list has no 'f(408.4789)' --
        # confirm that the omission is intended.
        columns = ['f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
    else:
        raise ValueError(
            "Unknown vars={!r}; expected 'full', 'reduced', 'freqs', 'conts' "
            "or 'freqs_reduced'".format(vars)
        )

    if columns is not None:
        X_train = X_train[columns]
        X_test = X_test[columns]

    import warnings

    from sklearn.svm import SVC

    # Creating hyperparameters dictionary
    # The linear kernel gets its own grid without 'gamma'.
    params_linear = {'C': [0.1, 1, 10, 100, 1000],
                     'kernel': ['linear']}
    params_nonlinear = {'C': [0.1, 1, 10, 100, 1000],
                        'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'auto'],
                        'kernel': ['poly', 'rbf', 'sigmoid']}

    parameters = [params_linear, params_nonlinear]

    # Silence warnings only while this function runs; the filters that were
    # active before the call are restored on exit.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Fit GridSearch (2-fold, independent of cv_folds)
        grid_svm = GridSearchCV(
            SVC(random_state=seed),
            parameters,
            cv=2
        )
        grid_svm.fit(X_train, y_train)

        # Extract best estimator
        print("Best model: {}".format(grid_svm.best_estimator_))
        svm = grid_svm.best_estimator_

        # Cross validation
        cv_scores = cross_val_score(svm, X_train, y_train, cv=cv_folds)
        print("{0}-fold cross validation:\n  accuracy: {1}\n  std dev: {2}".format(cv_folds, round(cv_scores.mean(), 2), round(cv_scores.std(), 2)))

        # classification_report takes (y_true, y_pred); passing them the other
        # way round swaps the precision and recall columns.

        # Train set
        print("Training Data")
        y_train_pred = svm.predict(X_train)
        print(classification_report(y_train, y_train_pred))

        # Test set
        print("Test Data")
        y_test_pred = svm.predict(X_test)
        print(classification_report(y_test, y_test_pred))
    


## Logistic Regression

### Original Data

In [None]:
# Logistic regression on the './Data/Normal' split, 'reduced' feature set.
logistic_regression(
    X_train=X_train_norm,
    y_train=y_train_norm,
    X_test=X_test_norm,
    y_test=y_test_norm,
    vars='reduced',
)

In [None]:
# Logistic regression on the './Data/Normal' split, 'freqs_reduced' feature set.
logistic_regression(
    X_train=X_train_norm,
    y_train=y_train_norm,
    X_test=X_test_norm,
    y_test=y_test_norm,
    vars='freqs_reduced',
)

### Oversampling

In [None]:
# Logistic regression on the oversampled split, 'reduced' feature set.
logistic_regression(
    X_train=X_train_over,
    y_train=y_train_over,
    X_test=X_test_over,
    y_test=y_test_over,
    vars='reduced',
)

In [None]:
# Logistic regression on the oversampled split, 'freqs_reduced' feature set.
logistic_regression(
    X_train=X_train_over,
    y_train=y_train_over,
    X_test=X_test_over,
    y_test=y_test_over,
    vars='freqs_reduced',
)

### SMOTE

In [None]:
# Logistic regression on the SMOTE split, 'reduced' feature set.
logistic_regression(
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_test=X_test_smote,
    y_test=y_test_smote,
    vars='reduced',
)

In [None]:
# Logistic regression on the SMOTE split, 'freqs_reduced' feature set.
logistic_regression(
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_test=X_test_smote,
    y_test=y_test_smote,
    vars='freqs_reduced',
)

### ADASYN

In [None]:
# Logistic regression on the ADASYN split, 'reduced' feature set.
logistic_regression(
    X_train=X_train_adasyn,
    y_train=y_train_adasyn,
    X_test=X_test_adasyn,
    y_test=y_test_adasyn,
    vars='reduced',
)

In [None]:
# Logistic regression on the ADASYN split, 'freqs_reduced' feature set.
logistic_regression(
    X_train=X_train_adasyn,
    y_train=y_train_adasyn,
    X_test=X_test_adasyn,
    y_test=y_test_adasyn,
    vars='freqs_reduced',
)

## Support Vector Machine

### Original Data

In [None]:
# SVM on the './Data/Normal' split, 'reduced' feature set.
support_vector_machine(
    X_train=X_train_norm,
    y_train=y_train_norm,
    X_test=X_test_norm,
    y_test=y_test_norm,
    vars='reduced',
)

In [None]:
# SVM on the './Data/Normal' split, every "f(" column ('freqs').
support_vector_machine(
    X_train=X_train_norm,
    y_train=y_train_norm,
    X_test=X_test_norm,
    y_test=y_test_norm,
    vars='freqs',
)

In [None]:
# SVM on the './Data/Normal' split, 'freqs_reduced' feature set.
support_vector_machine(
    X_train=X_train_norm,
    y_train=y_train_norm,
    X_test=X_test_norm,
    y_test=y_test_norm,
    vars='freqs_reduced',
)

### Oversampling

In [None]:
# SVM on the oversampled split, every "f(" column ('freqs').
support_vector_machine(
    X_train=X_train_over,
    y_train=y_train_over,
    X_test=X_test_over,
    y_test=y_test_over,
    vars='freqs',
)

In [None]:
# SVM on the oversampled split, 'reduced' feature set.
# This cell sits under the "Oversampling" heading, so it is given the
# oversampled frames; with the normal frames and vars='freqs' it would only
# repeat the 'freqs' run from the "Original Data" section.
# NOTE(review): 'reduced' is chosen because it is the one feature set the
# other SVM sections run that is otherwise absent here -- confirm intent.
support_vector_machine(X_train_over, y_train_over, X_test_over, y_test_over, vars='reduced')

In [None]:
# SVM on the oversampled split, 'freqs_reduced' feature set.
support_vector_machine(
    X_train=X_train_over,
    y_train=y_train_over,
    X_test=X_test_over,
    y_test=y_test_over,
    vars='freqs_reduced',
)

### SMOTE

In [None]:
# SVM on the SMOTE split, 'reduced' feature set.
support_vector_machine(
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_test=X_test_smote,
    y_test=y_test_smote,
    vars='reduced',
)

In [None]:
# SVM on the SMOTE split, every "f(" column ('freqs').
support_vector_machine(
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_test=X_test_smote,
    y_test=y_test_smote,
    vars='freqs',
)

In [None]:
# SVM on the SMOTE split, 'freqs_reduced' feature set.
support_vector_machine(
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_test=X_test_smote,
    y_test=y_test_smote,
    vars='freqs_reduced',
)

### ADASYN

In [None]:
# SVM on the ADASYN split, 'reduced' feature set.
support_vector_machine(
    X_train=X_train_adasyn,
    y_train=y_train_adasyn,
    X_test=X_test_adasyn,
    y_test=y_test_adasyn,
    vars='reduced',
)

In [None]:
# SVM on the ADASYN split, every "f(" column ('freqs').
support_vector_machine(
    X_train=X_train_adasyn,
    y_train=y_train_adasyn,
    X_test=X_test_adasyn,
    y_test=y_test_adasyn,
    vars='freqs',
)

In [None]:
# SVM on the ADASYN split, 'freqs_reduced' feature set.
support_vector_machine(
    X_train=X_train_adasyn,
    y_train=y_train_adasyn,
    X_test=X_test_adasyn,
    y_test=y_test_adasyn,
    vars='freqs_reduced',
)