### Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import IPython.display
import matplotlib.pyplot as plt
import importlib
import src.utils
from src.utils.preprocessing import standardize, dummy_code, oversample, remove_correlated_features
from src.utils.get_data import import_data
from src.utils.model_helpers import roc_w_cross_val, AUC_all_models, homemade_all_models
from src.utils.model_helpers import cross_validation
from src.utils.feature_engineering import RFE_, train_optimal_features_model, get_optimal_features_model

# Preprocessing
from imblearn.over_sampling import SMOTE
# ML models
from sklearn.metrics import plot_roc_curve, auc, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as Lda
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [9]:
%load_ext autoreload
%autoreload 1

### Load Data

In [2]:
# Relative path of the data folder handed to import_data.
DATA_PATH = '../../data'

# One import_data call per segmentation type, in the order coarse -> fine -> no,
# each with is_user_features=True; every call yields a (data, labels) pair.
(coarse_data, coarse_labels), (fine_data, fine_labels), (no_data, no_labels) = [
    import_data(DATA_PATH, segmentation_type=segmentation, is_user_features=True)
    for segmentation in ('coarse', 'fine', 'no')
]

# For later processing, name the index of the unsegmented data 'subject'.
no_data.index = no_data.index.rename('subject')

### Preprocessing

In [3]:
# Bind the loaded feature tables to the X_* names used by the preprocessing
# cells (plain name bindings: no copy is made here).
X_coarse, X_fine, X_no = coarse_data, fine_data, no_data

#### Normalisation

In [4]:
# Settings for the case where is_user_features=True.
# NOTE(review): the positional arguments (0, -3) presumably delimit the column
# range to standardize, leaving the last three columns untouched - confirm
# against src.utils.preprocessing.standardize.
X_coarse, X_fine, X_no = [
    standardize(features, 0, -3) for features in (X_coarse, X_fine, X_no)
]

#### Dummy code categorical features

In [5]:
# Columns passed to dummy_code for every segmentation variant.
categorical_columns = ('Gender', 'Resp_Condition', 'Symptoms')

# A fresh list is built for each call so the three calls never share one object.
X_coarse = dummy_code(X_coarse, columns=list(categorical_columns))
X_fine = dummy_code(X_fine, columns=list(categorical_columns))
X_no = dummy_code(X_no, columns=list(categorical_columns))

#### Drop correlated features

In [6]:
# Drop highly correlated features to prevent multicollinearity.
# The threshold passed to remove_correlated_features below is 0.95.
# NOTE(review): presumably this is a Pearson correlation cut-off - confirm in
# src.utils.preprocessing.remove_correlated_features.
X_no = remove_correlated_features(X_no, 0.95)
X_coarse = remove_correlated_features(X_coarse, 0.95)
X_fine = remove_correlated_features(X_fine, 0.95)

#### Get optimal features model

In [7]:
# Disabled experiment: feature selection with a GradientBoostingClassifier.
# NOTE(review): the target is named X_coarse_opt but the call receives X_fine and
# fine_labels - resolve this naming mismatch before re-enabling the line.
#X_coarse_opt = get_optimal_features_model(X_fine, fine_labels, model=GradientBoostingClassifier(random_state=0), start_idx = 1)

### Model training

In [13]:
# Keyword settings shared by all three evaluation runs.
# NOTE(review): k is presumably the number of cross-validation folds - confirm
# in src.utils.model_helpers.AUC_all_models.
auc_settings = {'k': 6, 'oversampling': True}

# Evaluate every model on each segmentation variant (coarse, fine, none).
# The recorded output of this cell shows repeated "TOTAL NO. of ITERATIONS
# REACHED LIMIT" warnings pointing at logistic regression; max_iter may need
# raising inside the helper.
coarse_results = AUC_all_models(X_coarse, coarse_labels.Label, **auc_settings)
fine_results = AUC_all_models(X_fine, fine_labels.Label, **auc_settings)
no_results = AUC_all_models(X_no, no_labels.Label, **auc_settings)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [14]:
# Name of the score column in each results table, as renamed below.
auc_column = 'AUC (mean)'

# Give each table's score column a segmentation-specific name.
coarse_results = coarse_results.rename(columns={auc_column: "Coarse_AUC"})
fine_results = fine_results.rename(columns={auc_column: "Fine_AUC"})
no_results = no_results.rename(columns={auc_column: "No_Seg_AUC"})

# Side-by-side comparison: the full coarse table plus the fine and
# no-segmentation score columns.
auc_tables = [coarse_results, fine_results["Fine_AUC"], no_results["No_Seg_AUC"]]
pd.concat(auc_tables, axis=1, sort=False)
# Most promising seem to be RandomForest and GradientBoosting --> Tune their hyperparameters!
# NOTE(review): in the table recorded with this cell, RandomForest leads the
# coarse and no-segmentation columns, but GradientBoosting scores below
# LogisticRegression and LDA in all three - revisit this conclusion.

Unnamed: 0,Models,Coarse_AUC,Fine_AUC,No_Seg_AUC
0,LogisticRegression,0.582126,0.575572,0.588832
1,SVM,0.573342,0.577179,0.581497
2,LDA,0.580749,0.57873,0.5964
3,KNN,0.555671,0.602944,0.554159
4,GaussianNB,0.498425,0.562296,0.586074
5,DecisionTree,0.556305,0.540323,0.553375
6,RandomForest,0.592845,0.577431,0.602959
7,GradientBoosting,0.573105,0.567573,0.562576
