# Machine Learning


In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'white' # Since I use a dark IDE

# To allow multiple outputs per cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection  import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


# Data Preparation

## Reading in Data

In [2]:
os.chdir('/Users/benjamintan/Library/CloudStorage/OneDrive-TheUniversityofWesternAustralia/Master of Data Science/Year 2/Semester 2/CITS5553/CITS5553-Capstone-Project/ML Model/Benjamin')

In [34]:
## Full data
wba_data = pd.read_csv("./Data/wba_data_CLEAN.csv")

## Normal
X_train_norm = pd.read_csv('./Data/Normal/X_train.csv')
y_train_norm = pd.read_csv('./Data/Normal/y_train.csv')
X_test_norm = pd.read_csv('./Data/Normal/X_test.csv')
y_test_norm = pd.read_csv('./Data/Normal/y_test.csv')

## SMOTE
X_train_smote = pd.read_csv('./Data/Smote Large/X_train_smote.csv')
y_train_smote = pd.read_csv('./Data/Smote Large/y_train_smote.csv')
X_test_smote = pd.read_csv('./Data/Smote Large/X_test_smote.csv')
y_test_smote = pd.read_csv('./Data/Smote Large/y_test_smote.csv')

## Oversampling
X_train_over = pd.read_csv('./Data/Oversampling/X_train_over.csv')
y_train_over = pd.read_csv('./Data/Oversampling/y_train_over.csv')
X_test_over = pd.read_csv('./Data/Oversampling/X_test_over.csv')
y_test_over = pd.read_csv('./Data/Oversampling/y_test_over.csv')

## ADASYN
X_train_adasyn = pd.read_csv('./Data/Adasyn Large/X_train_adasyn.csv')
y_train_adasyn = pd.read_csv('./Data/Adasyn Large/y_train_adasyn.csv')
X_test_adasyn = pd.read_csv('./Data/Adasyn Large/X_test_adasyn.csv')
y_test_adasyn = pd.read_csv('./Data/Adasyn Large/y_test_adasyn.csv')

## Generated
X_train_gen = pd.read_csv('./Data/VAE (Experimental)/X_gen_train.csv', usecols = X_train_norm.columns)
y_train_gen = pd.read_csv('./Data/VAE (Experimental)/X_gen_train.csv', usecols = ['OverallPoF'])
X_test_gen = pd.read_csv('./Data/VAE (Experimental)/X_gen_test.csv', usecols = X_train_norm.columns)
y_test_gen = pd.read_csv('./Data/VAE (Experimental)/X_gen_test.csv', usecols=['OverallPoF'])



dfs = [X_train_norm,y_train_norm,X_test_norm,y_test_norm,
X_train_smote, y_train_smote,X_test_smote,y_test_smote,
X_train_over,y_train_over,X_test_over,y_test_over,
X_train_adasyn,y_train_adasyn,X_test_adasyn,y_test_adasyn]

# Delete Unnamed: 0 columns if they are there
for df in dfs:
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

In [35]:
feat_select = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4','f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']

# Machine Learning

In [27]:
def logistic_regression(X_train, y_train, X_test, y_test, vars = 'reduced', seed=42, cv_folds=5):
    import warnings
    warnings.filterwarnings("ignore")
    
    from sklearn.linear_model import LogisticRegression

    if vars == 'full':
        pass
    elif vars == 'reduced':
        feat_select = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4','f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
        X_train = X_train[feat_select]
        X_test = X_test[feat_select]
    elif vars == 'freqs':
        freq_cols = [c for c in X_train.columns if c[:2] == "f("]
        X_train = X_train[freq_cols]
        X_test = X_test[freq_cols]
    elif vars == 'conts':
        cont_cols = [c for c in X_train.columns if c[:2] != "f("]
        X_train = X_train[cont_cols]
        X_test = X_test[cont_cols]
    elif vars == 'freqs_reduced':
        feat_select = ['f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
        X_train = X_train[feat_select]
        X_test = X_test[feat_select]


    # Creating hyperparameters ditionary
    saga_grid = {'solver': ['saga'],
                    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
                    'l1_ratio': [r/10 for r in list(range(0,10,1))]
                    }
    liblinear_grid = {'solver': ['liblinear'],
                        'penalty': ['l1', 'l2']}
    
    parameters = [saga_grid, liblinear_grid]

    # Fit GridSearch
    grid_log_reg = GridSearchCV(
        LogisticRegression(random_state=seed),
        parameters, 
        cv = 2
    )
    grid_log_reg.fit(X_train, y_train)

    # Extract best estimator
    print("Best model: {}".format(grid_log_reg.best_estimator_))
    log_reg = grid_log_reg.best_estimator_

    # Cross validation
    cv_scores = cross_val_score(log_reg, X_train, y_train, cv=cv_folds)
    print("{0}-fold cross validation:\n  accuracy: {1}\n  std dev: {2}".format(cv_folds, round(cv_scores.mean(), 2), round(cv_scores.std(), 2)))

    # Train set
    print("Training Data")
    y_train_pred = log_reg.predict(X_train)
    print(classification_report(y_train_pred, y_train))

    # Test set
    print("Test Data")
    y_test_pred = log_reg.predict(X_test)
    print(classification_report(y_test_pred, y_test))

    # Coefficients
    print("Coefficients:")
    coefs = zip(list(X_train.columns), log_reg.coef_.tolist()[0])

        # Print top 5 coefficients
    coefs = sorted(coefs, key = lambda x: abs(x[1]), reverse = True)
    for coef in coefs[:5]:
        print(coef)

In [6]:
def support_vector_machine(X_train, y_train, X_test, y_test, vars = 'reduced', seed=42, cv_folds=5):
    import warnings
    warnings.filterwarnings("ignore")
    
    from sklearn.svm import SVC

    if vars == 'full':
        pass
    elif vars == 'reduced':
        feat_select = ['TPP', 'TympType', 'OAE1', 'OAE1.4', 'OAE2', 'OAE2.8', 'OAE4','f(408.4789)', 'f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
        X_train = X_train[feat_select]
        X_test = X_test[feat_select]
    elif vars == 'freqs':
        freq_cols = [c for c in X_train.columns if c[:2] == "f("]
        X_train = X_train[freq_cols]
        X_test = X_test[freq_cols]
    elif vars == 'conts':
        cont_cols = [c for c in X_train.columns if c[:2] != "f("]
        X_train = X_train[cont_cols]
        X_test = X_test[cont_cols]
    elif vars == 'freqs_reduced':
        feat_select = ['f(2593.6791)', 'f(2378.4142)', 'f(2310.7054)', 'f(7127.1897)', 'f(865.5366)', 'f(6727.1713)', 'f(226.0000)', 'f(458.5020)', 'f(500.0000)', 'f(1029.3022)', 'f(5993.2283)', 'f(1887.7486)', 'f(1373.9536)', 'f(667.4199)', 'f(2747.9073)', 'f(1296.8396)', 'f(577.6763)', 'f(1155.3527)', 'f(1090.5077)']
        X_train = X_train[feat_select]
        X_test = X_test[feat_select]

    # Creating hyperparameters ditionary
    params_linear = {'C': [0.1, 1, 10, 100, 1000], 
                'kernel': ['linear']} 
    params_nonlinear = {'C': [0.1, 1, 10, 100, 1000],
                'gamma': [1, 0.1, 0.01, 0.001, 0.0001,'auto'], 
                'kernel': ['poly', 'rbf', 'sigmoid']}

    parameters = [params_linear, params_nonlinear]


    # Fit GridSearch
    grid_svm = GridSearchCV(
        SVC(random_state=seed),
        parameters, 
        cv = 2
    )
    grid_svm.fit(X_train, y_train)

    # Extract best estimator
    print("Best model: {}".format(grid_svm.best_estimator_))
    svm = grid_svm.best_estimator_

    # Cross validation
    cv_scores = cross_val_score(svm, X_train, y_train, cv=cv_folds)
    # print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_scores.mean(), cv_scores.std()))
    print("{0}-fold cross validation:\n  accuracy: {1}\n  std dev: {2}".format(cv_folds, round(cv_scores.mean(), 2), round(cv_scores.std(), 2)))

    # Train set
    print("Training Data")
    y_train_pred = svm.predict(X_train)
    print(classification_report(y_train_pred, y_train))

    # Test set
    print("Test Data")
    y_test_pred = svm.predict(X_test)
    print(classification_report(y_test_pred, y_test))
    


## Logistic Regression

### Original Data

In [32]:
logistic_regression(X_train_norm, y_train_norm, X_test_norm, y_test_norm, vars='reduced')

Best model: LogisticRegression(l1_ratio=0.0, penalty='none', random_state=42, solver='saga')
5-fold cross validation:
  accuracy: 0.95
  std dev: 0.03
Training Data
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       165
           1       0.83      0.96      0.89        26

    accuracy                           0.97       191
   macro avg       0.91      0.97      0.94       191
weighted avg       0.97      0.97      0.97       191

Test Data
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        41
           1       0.88      1.00      0.93         7

    accuracy                           0.98        48
   macro avg       0.94      0.99      0.96        48
weighted avg       0.98      0.98      0.98        48

Coefficients:
('OAE1.4', -0.1368159934662426)
('OAE2', -0.1111201604628667)
('OAE1', -0.10548826300117627)
('OAE2.8', -0.06802635110824189)
('OAE4', -0.055145739530662534)

In [29]:
logistic_regression(X_train_norm, y_train_norm, X_test_norm, y_test_norm, vars='freqs_reduced')

Best model: LogisticRegression(l1_ratio=0.0, penalty='l1', random_state=42, solver='saga')
5-fold cross validation:
  accuracy: 0.93
  std dev: 0.03
Training Data
              precision    recall  f1-score   support

           0       0.98      0.93      0.96       169
           1       0.63      0.86      0.73        22

    accuracy                           0.93       191
   macro avg       0.81      0.90      0.84       191
weighted avg       0.94      0.93      0.93       191

Test Data
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        43
           1       0.62      1.00      0.77         5

    accuracy                           0.94        48
   macro avg       0.81      0.97      0.87        48
weighted avg       0.96      0.94      0.94        48

Coefficients:
('f(1090.5077)', -2.717236683194348)
('f(1155.3527)', -2.226855022335862)
('f(1296.8396)', -2.1391507081074383)
('f(1373.9536)', -1.4904850824880262)
('f(1029.

In [38]:
logistic_regression(X_train_norm, y_train_norm, X_test_norm, y_test_norm, vars='conts')

Best model: LogisticRegression(l1_ratio=0.0, penalty='none', random_state=42, solver='saga')
5-fold cross validation:
  accuracy: 0.95
  std dev: 0.03
Training Data
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       165
           1       0.83      0.96      0.89        26

    accuracy                           0.97       191
   macro avg       0.91      0.97      0.94       191
weighted avg       0.97      0.97      0.97       191

Test Data
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        41
           1       0.88      1.00      0.93         7

    accuracy                           0.98        48
   macro avg       0.94      0.99      0.96        48
weighted avg       0.98      0.98      0.98        48

Coefficients:
('OAE1.4', -0.13122325142380506)
('OAE2', -0.10617872189254472)
('OAE1', -0.1018423600386256)
('OAE2.8', -0.05954917264912355)
('OAE4', -0.04830993718598464)

### Oversampling

In [36]:
logistic_regression(X_train_over, y_train_over, X_test_over, y_test_over, vars='reduced')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.97
  std dev: 0.04
Training Data
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       169
           1       1.00      0.99      1.00       172

    accuracy                           1.00       341
   macro avg       1.00      1.00      1.00       341
weighted avg       1.00      1.00      1.00       341

Test Data
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        29
           1       1.00      0.94      0.97        32

    accuracy                           0.97        61
   macro avg       0.97      0.97      0.97        61
weighted avg       0.97      0.97      0.97        61

Coefficients:
('TympType', 9.2869966023568)
('f(2747.9073)', 0.7052166099510111)
('OAE1.4', -0.361744080704117)
('OAE4', -0.357868142991216)
('OAE2', -0.31898601867111515)


In [37]:
logistic_regression(X_train_over, y_train_over, X_test_over, y_test_over, vars='freqs_reduced')

Best model: LogisticRegression(random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.87
  std dev: 0.03
Training Data
              precision    recall  f1-score   support

           0       0.91      0.84      0.87       183
           1       0.83      0.90      0.86       158

    accuracy                           0.87       341
   macro avg       0.87      0.87      0.87       341
weighted avg       0.87      0.87      0.87       341

Test Data
              precision    recall  f1-score   support

           0       0.87      0.90      0.89        30
           1       0.90      0.87      0.89        31

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61

Coefficients:
('f(1373.9536)', -1.9858702246942666)
('f(1296.8396)', -1.8315633361540868)
('f(1887.7486)', -1.2613736011754701)
('f(1090.5077)', -1.1096838702190832)
('f(1155.3527)', -1.1007409601

In [39]:
logistic_regression(X_train_over, y_train_over, X_test_over, y_test_over, vars='conts')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.99
  std dev: 0.02
Training Data
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       168
           1       1.00      0.99      0.99       173

    accuracy                           0.99       341
   macro avg       0.99      0.99      0.99       341
weighted avg       0.99      0.99      0.99       341

Test Data
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        30
           1       1.00      0.97      0.98        31

    accuracy                           0.98        61
   macro avg       0.98      0.98      0.98        61
weighted avg       0.98      0.98      0.98        61

Coefficients:
('TympType', 9.635430218956389)
('OAE2', -0.44663605231535775)
('OAE1.4', -0.369355808465783)
('AgeY', 0.34800387696357304)
('OAE4', -0.32919128417846055)


### SMOTE

In [12]:
logistic_regression(X_train_smote, y_train_smote, X_test_smote, y_test_smote, vars='reduced')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4254
           1       1.00      1.00      1.00      4246

    accuracy                           1.00      8500
   macro avg       1.00      1.00      1.00      8500
weighted avg       1.00      1.00      1.00      8500

Test Data
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       748
           1       1.00      0.99      1.00       752

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500



In [13]:
logistic_regression(X_train_smote, y_train_smote, X_test_smote, y_test_smote, vars='freqs_reduced')

Best model: LogisticRegression(random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.89
  std dev: 0.01
Training Data
              precision    recall  f1-score   support

           0       0.92      0.87      0.89      4497
           1       0.86      0.91      0.89      4003

    accuracy                           0.89      8500
   macro avg       0.89      0.89      0.89      8500
weighted avg       0.89      0.89      0.89      8500

Test Data
              precision    recall  f1-score   support

           0       0.91      0.84      0.88       809
           1       0.83      0.90      0.87       691

    accuracy                           0.87      1500
   macro avg       0.87      0.87      0.87      1500
weighted avg       0.87      0.87      0.87      1500



In [40]:
logistic_regression(X_train_smote, y_train_smote, X_test_smote, y_test_smote, vars='conts')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4257
           1       1.00      1.00      1.00      4243

    accuracy                           1.00      8500
   macro avg       1.00      1.00      1.00      8500
weighted avg       1.00      1.00      1.00      8500

Test Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       751
           1       1.00      1.00      1.00       749

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500

Coefficients:
('TympType', 13.930901195413139)
('SC', -1.1853309361542106)
('ECV', -0.6685476761320475)
('Ear coded', 0.6277536069583277)
('OAE1.4', -0.5475101666330268)


### ADASYN

In [14]:
logistic_regression(X_train_adasyn, y_train_adasyn, X_test_adasyn, y_test_adasyn, vars='reduced')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4254
           1       1.00      1.00      1.00      4248

    accuracy                           1.00      8502
   macro avg       1.00      1.00      1.00      8502
weighted avg       1.00      1.00      1.00      8502

Test Data
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       745
           1       1.00      0.99      1.00       756

    accuracy                           1.00      1501
   macro avg       1.00      1.00      1.00      1501
weighted avg       1.00      1.00      1.00      1501



In [15]:
logistic_regression(X_train_adasyn, y_train_adasyn, X_test_adasyn, y_test_adasyn, vars='freqs_reduced')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.97
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4183
           1       0.98      0.96      0.97      4319

    accuracy                           0.97      8502
   macro avg       0.97      0.97      0.97      8502
weighted avg       0.97      0.97      0.97      8502

Test Data
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       722
           1       0.98      0.94      0.96       779

    accuracy                           0.96      1501
   macro avg       0.96      0.96      0.96      1501
weighted avg       0.96      0.96      0.96      1501



In [41]:
logistic_regression(X_train_adasyn, y_train_adasyn, X_test_adasyn, y_test_adasyn, vars='conts')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4256
           1       1.00      1.00      1.00      4246

    accuracy                           1.00      8502
   macro avg       1.00      1.00      1.00      8502
weighted avg       1.00      1.00      1.00      8502

Test Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       752
           1       1.00      1.00      1.00       749

    accuracy                           1.00      1501
   macro avg       1.00      1.00      1.00      1501
weighted avg       1.00      1.00      1.00      1501

Coefficients:
('TympType', 18.380907440895577)
('ECV', -7.474618353318853)
('OAE2', -0.8858288256618637)
('Ear coded', -0.6759661386356035)
('OAE1', -0.40201226376577437)


## Support Vector Machine

### Original Data

In [16]:
support_vector_machine(X_train_norm, y_train_norm, X_test_norm, y_test_norm, vars='reduced')

Best model: SVC(C=10, gamma=0.001, random_state=42)
5-fold cross validation:
  accuracy: 0.97
  std dev: 0.03
Training Data
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       160
           1       1.00      0.97      0.98        31

    accuracy                           0.99       191
   macro avg       1.00      0.98      0.99       191
weighted avg       0.99      0.99      0.99       191

Test Data
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        42
           1       0.75      1.00      0.86         6

    accuracy                           0.96        48
   macro avg       0.88      0.98      0.92        48
weighted avg       0.97      0.96      0.96        48



In [17]:
support_vector_machine(X_train_norm, y_train_norm, X_test_norm, y_test_norm, vars='freqs')

Best model: SVC(C=0.1, gamma=0.1, kernel='poly', random_state=42)
5-fold cross validation:
  accuracy: 0.94
  std dev: 0.04
Training Data
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       165
           1       0.73      0.85      0.79        26

    accuracy                           0.94       191
   macro avg       0.85      0.90      0.87       191
weighted avg       0.94      0.94      0.94       191

Test Data
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        42
           1       0.75      1.00      0.86         6

    accuracy                           0.96        48
   macro avg       0.88      0.98      0.92        48
weighted avg       0.97      0.96      0.96        48



In [18]:
support_vector_machine(X_train_norm, y_train_norm, X_test_norm, y_test_norm, vars='freqs_reduced')

Best model: SVC(C=1, kernel='linear', random_state=42)
5-fold cross validation:
  accuracy: 0.93
  std dev: 0.04
Training Data
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       167
           1       0.67      0.83      0.74        24

    accuracy                           0.93       191
   macro avg       0.82      0.89      0.85       191
weighted avg       0.94      0.93      0.93       191

Test Data
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        42
           1       0.75      1.00      0.86         6

    accuracy                           0.96        48
   macro avg       0.88      0.98      0.92        48
weighted avg       0.97      0.96      0.96        48



### Oversampling

In [19]:
support_vector_machine(X_train_over, y_train_over, X_test_over, y_test_over, vars='reduced')

Best model: SVC(C=10, kernel='linear', random_state=42)
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4250
           1       1.00      1.00      1.00      4250

    accuracy                           1.00      8500
   macro avg       1.00      1.00      1.00      8500
weighted avg       1.00      1.00      1.00      8500

Test Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       750
           1       1.00      1.00      1.00       750

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500



In [104]:
support_vector_machine(X_train_over, y_train_over, X_test_over, y_test_over, vars='freqs')

Best model: SVC(C=0.1, gamma=1, kernel='poly', random_state=42)
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4250
           1       1.00      1.00      1.00      4250

    accuracy                           1.00      8500
   macro avg       1.00      1.00      1.00      8500
weighted avg       1.00      1.00      1.00      8500

Test Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       750
           1       1.00      1.00      1.00       750

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500



In [105]:
support_vector_machine(X_train_norm, y_train_norm, X_test_norm, y_test_norm, vars='freqs')

Best model: SVC(C=0.1, gamma=0.1, kernel='poly', random_state=42)
5-fold cross validation:
  accuracy: 0.94
  std dev: 0.04
Training Data
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       165
           1       0.73      0.85      0.79        26

    accuracy                           0.94       191
   macro avg       0.85      0.90      0.87       191
weighted avg       0.94      0.94      0.94       191

Test Data
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        42
           1       0.75      1.00      0.86         6

    accuracy                           0.96        48
   macro avg       0.88      0.98      0.92        48
weighted avg       0.97      0.96      0.96        48



### SMOTE

In [107]:
support_vector_machine(X_train_smote, y_train_smote, X_test_smote, y_test_smote, vars='reduced')

Best model: SVC(C=1000, kernel='linear', random_state=42)
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4250
           1       1.00      1.00      1.00      4250

    accuracy                           1.00      8500
   macro avg       1.00      1.00      1.00      8500
weighted avg       1.00      1.00      1.00      8500

Test Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       750
           1       1.00      1.00      1.00       750

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500



In [108]:
support_vector_machine(X_train_smote, y_train_smote, X_test_smote, y_test_smote, vars='freqs')

Best model: SVC(C=1, gamma=1, kernel='poly', random_state=42)
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4250
           1       1.00      1.00      1.00      4250

    accuracy                           1.00      8500
   macro avg       1.00      1.00      1.00      8500
weighted avg       1.00      1.00      1.00      8500

Test Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       750
           1       1.00      1.00      1.00       750

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500



In [109]:
support_vector_machine(X_train_smote, y_train_smote, X_test_smote, y_test_smote, vars='freqs_reduced')

Best model: SVC(C=1000, gamma=1, random_state=42)
5-fold cross validation:
  accuracy: 1.0
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4250
           1       1.00      1.00      1.00      4250

    accuracy                           1.00      8500
   macro avg       1.00      1.00      1.00      8500
weighted avg       1.00      1.00      1.00      8500

Test Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       750
           1       1.00      1.00      1.00       750

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500



### ADASYN

In [110]:
support_vector_machine(X_train_adasyn, y_train_adasyn, X_test_adasyn, y_test_adasyn, vars='reduced')

In [None]:
support_vector_machine(X_train_adasyn, y_train_adasyn, X_test_adasyn, y_test_adasyn, vars='freqs')

In [None]:
support_vector_machine(X_train_adasyn, y_train_adasyn, X_test_adasyn, y_test_adasyn, vars='freqs_reduced')

# Testing Generative Data

## Logistic Regression


In [42]:
logistic_regression(X_train_gen, y_train_gen, X_test_norm, y_test_norm, vars='freqs_reduced')

Best model: LogisticRegression(l1_ratio=0.0, penalty='none', random_state=42, solver='saga')
5-fold cross validation:
  accuracy: 0.97
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.97      9724
         1.0       0.98      0.97      0.97     10076

    accuracy                           0.97     19800
   macro avg       0.97      0.97      0.97     19800
weighted avg       0.97      0.97      0.97     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.93      0.90      0.91        41
         1.0       0.50      0.57      0.53         7

    accuracy                           0.85        48
   macro avg       0.71      0.74      0.72        48
weighted avg       0.86      0.85      0.86        48

Coefficients:
('f(7127.1897)', 45.53271161255919)
('f(458.5020)', -42.016059123072175)
('f(1090.5077)', -28.263832468068955)
('f(1296.8396)', 27.92044175988089)
('f(577.676

In [7]:
logistic_regression(X_train_gen, y_train_gen, X_test_norm, y_test_norm, vars='reduced')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.98
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      9687
         1.0       0.99      0.97      0.98     10113

    accuracy                           0.98     19800
   macro avg       0.98      0.98      0.98     19800
weighted avg       0.98      0.98      0.98     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.90      0.90      0.90        40
         1.0       0.50      0.50      0.50         8

    accuracy                           0.83        48
   macro avg       0.70      0.70      0.70        48
weighted avg       0.83      0.83      0.83        48



In [43]:
logistic_regression(X_train_gen, y_train_gen, X_test_over, y_test_over, vars='freqs_reduced')

Best model: LogisticRegression(l1_ratio=0.0, penalty='none', random_state=42, solver='saga')
5-fold cross validation:
  accuracy: 0.97
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.97      9724
         1.0       0.98      0.97      0.97     10076

    accuracy                           0.97     19800
   macro avg       0.97      0.97      0.97     19800
weighted avg       0.97      0.97      0.97     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.90      0.68      0.78        41
         1.0       0.57      0.85      0.68        20

    accuracy                           0.74        61
   macro avg       0.73      0.77      0.73        61
weighted avg       0.79      0.74      0.75        61

Coefficients:
('f(7127.1897)', 45.53271161255919)
('f(458.5020)', -42.016059123072175)
('f(1090.5077)', -28.263832468068955)
('f(1296.8396)', 27.92044175988089)
('f(577.676

In [44]:
logistic_regression(X_train_gen, y_train_gen, X_test_over, y_test_over, vars='reduced')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.98
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      9687
         1.0       0.99      0.97      0.98     10113

    accuracy                           0.98     19800
   macro avg       0.98      0.98      0.98     19800
weighted avg       0.98      0.98      0.98     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.90      0.74      0.81        38
         1.0       0.67      0.87      0.75        23

    accuracy                           0.79        61
   macro avg       0.78      0.80      0.78        61
weighted avg       0.81      0.79      0.79        61

Coefficients:
('f(458.5020)', -42.13786812963798)
('f(7127.1897)', 38.81504487542164)
('f(1296.8396)', 26.638339358573948)
('f(1090.5077)', -24.570656736008107)
('f(408.4789)', -22.82

In [47]:
logistic_regression(X_train_gen, y_train_gen, X_test_smote, y_test_smote, vars='reduced')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.98
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      9687
         1.0       0.99      0.97      0.98     10113

    accuracy                           0.98     19800
   macro avg       0.98      0.98      0.98     19800
weighted avg       0.98      0.98      0.98     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.92      0.70      0.79       985
         1.0       0.61      0.88      0.72       515

    accuracy                           0.76      1500
   macro avg       0.76      0.79      0.76      1500
weighted avg       0.81      0.76      0.77      1500

Coefficients:
('f(458.5020)', -42.13786812963798)
('f(7127.1897)', 38.81504487542164)
('f(1296.8396)', 26.638339358573948)
('f(1090.5077)', -24.570656736008107)
('f(408.4789)', -22.82

In [48]:
logistic_regression(X_train_gen, y_train_gen, X_test_smote, y_test_smote, vars='freqs_reduced')

Best model: LogisticRegression(l1_ratio=0.0, penalty='none', random_state=42, solver='saga')
5-fold cross validation:
  accuracy: 0.97
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.97      9724
         1.0       0.98      0.97      0.97     10076

    accuracy                           0.97     19800
   macro avg       0.97      0.97      0.97     19800
weighted avg       0.97      0.97      0.97     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.92      0.68      0.78      1003
         1.0       0.58      0.87      0.70       497

    accuracy                           0.75      1500
   macro avg       0.75      0.78      0.74      1500
weighted avg       0.80      0.75      0.75      1500

Coefficients:
('f(7127.1897)', 45.53271161255919)
('f(458.5020)', -42.016059123072175)
('f(1090.5077)', -28.263832468068955)
('f(1296.8396)', 27.92044175988089)
('f(577.676

In [49]:
logistic_regression(X_train_gen, y_train_gen, X_test_adasyn, y_test_adasyn, vars='reduced')

Best model: LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
5-fold cross validation:
  accuracy: 0.98
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      9687
         1.0       0.99      0.97      0.98     10113

    accuracy                           0.98     19800
   macro avg       0.98      0.98      0.98     19800
weighted avg       0.98      0.98      0.98     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.94      0.65      0.77      1079
         1.0       0.50      0.89      0.64       422

    accuracy                           0.72      1501
   macro avg       0.72      0.77      0.71      1501
weighted avg       0.82      0.72      0.73      1501

Coefficients:
('f(458.5020)', -42.13786812963798)
('f(7127.1897)', 38.81504487542164)
('f(1296.8396)', 26.638339358573948)
('f(1090.5077)', -24.570656736008107)
('f(408.4789)', -22.82

In [50]:
logistic_regression(X_train_gen, y_train_gen, X_test_adasyn, y_test_adasyn, vars='freqs_reduced')

Best model: LogisticRegression(l1_ratio=0.0, penalty='none', random_state=42, solver='saga')
5-fold cross validation:
  accuracy: 0.97
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.97      9724
         1.0       0.98      0.97      0.97     10076

    accuracy                           0.97     19800
   macro avg       0.97      0.97      0.97     19800
weighted avg       0.97      0.97      0.97     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.93      0.64      0.76      1089
         1.0       0.48      0.87      0.62       412

    accuracy                           0.70      1501
   macro avg       0.70      0.76      0.69      1501
weighted avg       0.81      0.70      0.72      1501

Coefficients:
('f(7127.1897)', 45.53271161255919)
('f(458.5020)', -42.016059123072175)
('f(1090.5077)', -28.263832468068955)
('f(1296.8396)', 27.92044175988089)
('f(577.676

## Support Vector Machine

In [11]:
support_vector_machine(X_train_gen, y_train_gen, X_test_norm, y_test_norm, vars='freqs_reduced')

Best model: SVC(C=100, gamma=1, kernel='poly', random_state=42)
5-fold cross validation:
  accuracy: 0.98
  std dev: 0.0
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      9698
         1.0       0.99      0.97      0.98     10102

    accuracy                           0.98     19800
   macro avg       0.98      0.98      0.98     19800
weighted avg       0.98      0.98      0.98     19800

Test Data
              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89        39
         1.0       0.50      0.44      0.47         9

    accuracy                           0.81        48
   macro avg       0.69      0.67      0.68        48
weighted avg       0.80      0.81      0.81        48



In [7]:
support_vector_machine(X_train_gen[:1000], y_train_gen[:1000], X_test_norm, y_test_norm, vars='reduced')

Best model: SVC(C=100, kernel='linear', random_state=42)
5-fold cross validation:
  accuracy: 0.97
  std dev: 0.01
Training Data
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98       510
         1.0       0.99      0.97      0.98       490

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000

Test Data
              precision    recall  f1-score   support

         0.0       0.82      0.92      0.87        36
         1.0       0.62      0.42      0.50        12

    accuracy                           0.79        48
   macro avg       0.72      0.67      0.68        48
weighted avg       0.78      0.79      0.78        48

