# Machine Learning


In [46]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'white' # Since I use a dark IDE

# To allow multiple outputs per cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection  import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


# Data Preparation

## Reading in Data

In [4]:
os.chdir('/Users/benjamintan/Library/CloudStorage/OneDrive-TheUniversityofWesternAustralia/Master of Data Science/Year 2/Semester 2/CITS5553/CITS5553-Capstone-Project/ML Model/Benjamin')

In [10]:
## Full data
wba_data = pd.read_csv("./Data/wba_data_CLEAN.csv")

## Normal
X_train_norm = pd.read_csv('./Data/Normal/X_train.csv')
y_train_norm = pd.read_csv('./Data/Normal/y_train.csv')
X_test_norm = pd.read_csv('./Data/Normal/X_test.csv')
y_test_norm = pd.read_csv('./Data/Normal/y_test.csv')

## SMOTE
X_train_smote = pd.read_csv('./Data/Smote Large/X_train_smote.csv')
y_train_smote = pd.read_csv('./Data/Smote Large/y_train_smote.csv')
X_test_smote = pd.read_csv('./Data/Smote Large/X_test_smote.csv')
y_test_smote = pd.read_csv('./Data/Smote Large/y_test_smote.csv')

## Oversampling
X_train_over = pd.read_csv('./Data/Oversampling Large/X_train_over.csv')
y_train_over = pd.read_csv('./Data/Oversampling Large/y_train_over.csv')
X_test_over = pd.read_csv('./Data/Oversampling Large/X_test_over.csv')
y_test_over = pd.read_csv('./Data/Oversampling Large/y_test_over.csv')

## ADASYN
X_train_adasyn = pd.read_csv('./Data/Adasyn Large/X_train_adasyn.csv')
y_train_adasyn = pd.read_csv('./Data/Adasyn Large/y_train_adasyn.csv')
X_test_adasyn = pd.read_csv('./Data/Adasyn Large/X_test_adasyn.csv')
y_test_adasyn = pd.read_csv('./Data/Adasyn Large/y_test_adasyn.csv')


In [9]:
feat_select = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4','f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']

# Machine Learning

In [65]:
def logistic_regression(X_train, y_train, X_test, y_test, vars = 'reduced', seed=42, cv_folds=5):
    import warnings
    from sklearn.exceptions import DataConversionWarning
    from sklearn.exceptions import FitFailedWarning
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)
    warnings.filterwarnings(action='ignore', category=FitFailedWarning)
    warnings.filterwarnings("ignore")
    
    from sklearn.linear_model import LogisticRegression

    if vars == 'full':
        pass
    elif vars == 'reduced':
        feat_select = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4','f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
        X_train = X_train[feat_select]
        X_test = X_test[feat_select]
    elif vars == 'freqs':
        freq_cols = [c for c in X_train.columns if c[:2] == "f("]
        X_train = X_train[freq_cols]
        X_test = X_test[freq_cols]
    elif vars == 'conts':
        cont_cols = [c for c in X_train.columns if c[:2] != "f("]
        X_train = X_train[cont_cols]
        X_test = X_test[cont_cols]

    # Creating hyperparameters ditionary
    param_grid = {'penalty': ['l1', 'l2', 'elasticnet']
                    } 

    # Fit GridSearch
    grid_log_reg = GridSearchCV(
        LogisticRegression(random_state=seed),
        param_grid, 
        cv = 2
    )
    grid_log_reg.fit(X_train, y_train)

    # Extract best estimator
    print("Best model: {}".format(grid_log_reg.best_estimator_))
    log_reg = grid_log_reg.best_estimator_

    # Cross validation
    cv_scores = cross_val_score(log_reg, X_train, y_train, cv=cv_folds)
    # print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_scores.mean(), cv_scores.std()))
    print("{0}-fold cross validation:\n  accuracy: {1}\n  std dev: {2}".format(cv_folds, round(cv_scores.mean(), 2), round(cv_scores.std(), 2)))

    # Test set
    y_pred = log_reg.predict(X_test)
    print(classification_report(y_pred, y_test))
    


In [None]:
def support_vector_machine(X_train, y_train, X_test, y_test, vars = 'reduced', seed=42, cv_folds=5):
    import warnings
    from sklearn.exceptions import DataConversionWarning
    from sklearn.exceptions import FitFailedWarning
    warnings.filterwarnings(action='ignore', category=DataConversionWarning)
    warnings.filterwarnings(action='ignore', category=FitFailedWarning)
    warnings.filterwarnings("ignore")
    
    from sklearn.linear_model import LogisticRegression

    if vars == 'full':
        pass
    elif vars == 'reduced':
        feat_select = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4','f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
        X_train = X_train[feat_select]
        X_test = X_test[feat_select]
    elif vars == 'freqs':
        freq_cols = [c for c in X_train.columns if c[:2] == "f("]
        X_train = X_train[freq_cols]
        X_test = X_test[freq_cols]
    elif vars == 'conts':
        cont_cols = [c for c in X_train.columns if c[:2] != "f("]
        X_train = X_train[cont_cols]
        X_test = X_test[cont_cols]

    # Creating hyperparameters ditionary
    param_grid = {'penalty': ['l1', 'l2', 'elasticnet']
                    } 

    # Fit GridSearch
    grid_log_reg = GridSearchCV(
        LogisticRegression(random_state=seed),
        param_grid, 
        cv = 2
    )
    grid_log_reg.fit(X_train, y_train)

    # Extract best estimator
    print("Best model: {}".format(grid_log_reg.best_estimator_))
    log_reg = grid_log_reg.best_estimator_

    # Cross validation
    cv_scores = cross_val_score(log_reg, X_train, y_train, cv=cv_folds)
    # print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_scores.mean(), cv_scores.std()))
    print("{0}-fold cross validation:\n  accuracy: {1}\n  std dev: {2}".format(cv_folds, round(cv_scores.mean(), 2), round(cv_scores.std(), 2)))

    # Test set
    y_pred = log_reg.predict(X_test)
    print(classification_report(y_pred, y_test))
    


In [67]:
# %%capture
logistic_regression(X_train_norm, y_train_norm, X_test_norm, y_test_norm, vars='reduced')
logistic_regression(X_train_over, y_train_over, X_test_over, y_test_over, vars='reduced')

Best model: LogisticRegression(random_state=42)
5-fold cross validation:
  accuracy: 0.95
  std dev: 0.03
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        41
           1       0.88      1.00      0.93         7

    accuracy                           0.98        48
   macro avg       0.94      0.99      0.96        48
weighted avg       0.98      0.98      0.98        48



ValueError: 
All the 6 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1138, in fit
    X, y = self._validate_data(
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1090, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1111, in _check_y
    y = column_or_1d(y, warn=True)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1156, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (4250, 2) instead.

--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Applications/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.
