### Libraries

In [117]:
import pandas as pd
import numpy as np
import seaborn as sns
import IPython.display
import matplotlib.pyplot as plt
import importlib
import src.utils
from src.utils.preprocessing import standardize, dummy_code, oversample, remove_correlated_features
from src.utils.get_data import import_data, expert_models
from src.utils.model_helpers import roc_w_cross_val, AUC_all_models, homemade_all_models
from src.utils.model_helpers import cross_validation
from src.utils.feature_engineering import RFE_, train_optimal_features_model, get_optimal_features_model

# Preprocessing
from imblearn.over_sampling import SMOTE
# ML models
from sklearn.metrics import plot_roc_curve, auc, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as Lda
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


ImportError: cannot import name 'split_experts' from 'src.utils.get_data' (/home/mogan/Desktop/cs-433-project-2-cough_classifier/src/utils/get_data.py)

In [118]:
%load_ext autoreload
%autoreload 1


import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [93]:
DATA_PATH = '../../data'
coarse_data, coarse_labels = import_data(DATA_PATH, segmentation_type = 'coarse', is_user_features=True)
fine_data, fine_labels = import_data(DATA_PATH, segmentation_type = 'fine', is_user_features=True)
no_data, no_labels = import_data(DATA_PATH, segmentation_type = 'no', is_user_features=True)

# For later processing rename the index
no_data.index = no_data.index.rename('subject')

### Preprocessing

In [94]:
# rename
X_coarse = coarse_data
X_fine = fine_data
X_no = no_data

#### Normalisation

In [98]:
# For case where is_user_features=True
X_coarse = standardize(X_coarse, 0, -5)
X_fine = standardize(X_fine, 0, -5)
X_no = standardize(X_no, 0, -5)

# Standardise the age
X_coarse = standardize(X_coarse, -4, -3)
X_fine = standardize(X_fine, -4, -3)
X_no = standardize(X_no, -4, -3)

#### Dummy code categorical features

In [100]:
X_coarse = dummy_code(X_coarse, columns = ['Gender', 'Resp_Condition', 'Symptoms'])
X_fine = dummy_code(X_fine, columns = ['Gender', 'Resp_Condition', 'Symptoms'])
X_no = dummy_code(X_no, columns = ['Gender', 'Resp_Condition', 'Symptoms'])

#### Drop correlated features

In [101]:
# Drop features with a Pearson correlation > 0.9 to prevent multicollinearity
X_no = remove_correlated_features(X_no, 0.95)
X_coarse = remove_correlated_features(X_coarse, 0.95)
X_fine = remove_correlated_features(X_fine, 0.95)

#### Get optimal features model

In [7]:
#X_coarse_opt = get_optimal_features_model(X_fine, fine_labels, model=GradientBoostingClassifier(random_state=0), start_idx = 1)

### Model training for each expert with additional features

In [119]:
def expert_models(X, y, oversampling = True):
    
    # Split the data according to which expert labeled it
    merged = X.merge(y, left_index=True, right_index = True)
    
    X_exp_1 = merged[merged['Expert'] == 1].iloc[:,:-1].drop(columns = ['Expert'], axis = 1)
    y_exp_1 = merged[merged['Expert'] == 1].iloc[:,-1]
    
    X_exp_2 = merged[merged['Expert'] == 2].iloc[:,:-1].drop(columns = ['Expert'], axis = 1)
    y_exp_2 = merged[merged['Expert'] == 2].iloc[:,-1]
    
    X_exp_3 = merged[merged['Expert'] == 3].iloc[:,:-1] .drop(columns = ['Expert'], axis = 1)
    y_exp_3 = merged[merged['Expert'] == 3].iloc[:,-1]
    
    # All expert groups are about 1000 samples big
    #print(len(X_exp_1), len(X_exp_2), len(X_exp_3))
    
    # Train all models for all experts
    exp_1 = AUC_all_models(X_exp_1, y_exp_1, k=6, oversampling=oversampling)
    exp_2 = AUC_all_models(X_exp_2, y_exp_2, k=6, oversampling=oversampling)
    exp_3 = AUC_all_models(X_exp_3, y_exp_3, k=6, oversampling=oversampling)
    
    # Gather the results in a df
    exp_1 = exp_1.rename(columns={'AUC (mean)': "Exp_1_AUC"})
    exp_2 = exp_2.rename(columns={'AUC (mean)': "Exp_2_AUC"})
    exp_3 = exp_3.rename(columns={'AUC (mean)': "Exp_3_AUC"})

    results = pd.concat([exp_1, exp_2["Exp_2_AUC"], exp_3["Exp_3_AUC"]], axis=1, sort=False)
    
    return results

In [145]:
coarse_results_experts = expert_models(X_coarse, coarse_labels)
fine_results_experts = expert_models(X_fine, fine_labels)
no_results_experts = expert_models(X_no, no_labels)

In [146]:
coarse_results_experts['Data'] = 'coarse'
fine_results_experts['Data'] = 'fine'
no_results_experts['Data'] = 'no_segmentation'
display(coarse_results_experts, fine_results_experts, no_results_experts)

Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.637157,0.539389,0.656209,coarse
1,SVM,0.644801,0.526959,0.641057,coarse
2,LDA,0.657257,0.531174,0.642537,coarse
3,KNN,0.635085,0.529898,0.640847,coarse
4,GaussianNB,0.615633,0.509398,0.544291,coarse
5,DecisionTree,0.58495,0.525679,0.593299,coarse
6,RandomForest,0.656332,0.49657,0.660337,coarse
7,GradientBoosting,0.619253,0.532204,0.666541,coarse


Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.640347,0.555213,0.661229,fine
1,SVM,0.631595,0.551421,0.657826,fine
2,LDA,0.6333,0.55582,0.660011,fine
3,KNN,0.634405,0.609653,0.667124,fine
4,GaussianNB,0.63464,0.52576,0.592695,fine
5,DecisionTree,0.607212,0.53332,0.631005,fine
6,RandomForest,0.658725,0.55781,0.697429,fine
7,GradientBoosting,0.610539,0.534997,0.699068,fine


Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.624789,0.478895,0.601293,no_segmentation
1,SVM,0.651701,0.480037,0.602699,no_segmentation
2,LDA,0.644702,0.487214,0.602357,no_segmentation
3,KNN,0.559225,0.520973,0.578345,no_segmentation
4,GaussianNB,0.552034,0.566775,0.639811,no_segmentation
5,DecisionTree,0.515731,0.530172,0.56267,no_segmentation
6,RandomForest,0.601457,0.509614,0.634973,no_segmentation
7,GradientBoosting,0.585384,0.489799,0.614388,no_segmentation


### Model training all data

In [15]:
oversampling = True
coarse_results = AUC_all_models(X_coarse, coarse_labels.Label, k=6, oversampling=oversampling)
fine_results = AUC_all_models(X_fine, fine_labels.Label, k=6, oversampling=oversampling)
no_results = AUC_all_models(X_no, no_labels.Label, k=6, oversampling=oversampling)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [34]:
coarse_results = coarse_results.rename(columns={'AUC (mean)': "Coarse_AUC"})
fine_results = fine_results.rename(columns={'AUC (mean)': "Fine_AUC"})
no_results = no_results.rename(columns={'AUC (mean)': "No_Seg_AUC"})

pd.concat([coarse_results, fine_results["Fine_AUC"], no_results["No_Seg_AUC"]], axis=1, sort=False)
# Most promising seem to be RandomForest and GradientBoosting --> Tune their hyperparameters!

Unnamed: 0,Models,Coarse_AUC,Fine_AUC,No_Seg_AUC
0,LogisticRegression,0.534213,0.533288,0.53938
1,SVM,0.5,0.500761,0.5
2,LDA,0.534146,0.535025,0.537539
3,KNN,0.52612,0.523591,0.510807
4,GaussianNB,0.49812,0.559975,0.563672
5,DecisionTree,0.555856,0.587821,0.517209
6,RandomForest,0.505505,0.510379,0.505578
7,GradientBoosting,0.549927,0.539885,0.515346


In [14]:
coarse_results = coarse_results.rename(columns={'AUC (mean)': "Coarse_AUC"})
fine_results = fine_results.rename(columns={'AUC (mean)': "Fine_AUC"})
no_results = no_results.rename(columns={'AUC (mean)': "No_Seg_AUC"})

pd.concat([coarse_results, fine_results["Fine_AUC"], no_results["No_Seg_AUC"]], axis=1, sort=False)
# Most promising seem to be RandomForest and GradientBoosting --> Tune their hyperparameters!

Unnamed: 0,Models,Coarse_AUC,Fine_AUC,No_Seg_AUC
0,LogisticRegression,0.582126,0.575572,0.588832
1,SVM,0.573342,0.577179,0.581497
2,LDA,0.580749,0.57873,0.5964
3,KNN,0.555671,0.602944,0.554159
4,GaussianNB,0.498425,0.562296,0.586074
5,DecisionTree,0.556305,0.540323,0.553375
6,RandomForest,0.592845,0.577431,0.602959
7,GradientBoosting,0.573105,0.567573,0.562576


### Models without expert features

In [136]:
DATA_PATH = '../../data'
coarse_data_n, coarse_labels_n = import_data(DATA_PATH, segmentation_type = 'coarse', is_user_features=False)
fine_data_n, fine_labels_n = import_data(DATA_PATH, segmentation_type = 'fine', is_user_features=False)
no_data_n, no_labels_n = import_data(DATA_PATH, segmentation_type = 'no', is_user_features=False)

# For later processing rename the index
no_data_n.index = no_data_n.index.rename('subject')

In [137]:
# rename
X_coarse_n = coarse_data_n
X_fine_n = fine_data_n
X_no_n = no_data_n

In [141]:
X_coarse_n

Unnamed: 0_level_0,Unnamed: 1_level_0,EEPD50_100,EEPD100_150,EEPD150_200,EEPD200_250,EEPD250_300,EEPD300_350,EEPD350_400,EEPD400_450,EEPD450_500,EEPD500_550,...,MFCC_std12,Crest_Factor,Cough_Length,PSD_225-425,PSD_450-550,PSD_1325-1600,PSD_1600-2000,PSD_2500-2900,PSD_3100-3700,Expert
subject,file_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
008ba489-31ad-44d8-856b-fcf72369dc46,0,0.454240,0.448162,0.102272,-0.236572,0.081297,0.717220,0.060670,0.051099,0.036793,0.351724,...,0.369328,1.008936,0.136902,0.605304,-0.616080,-0.656271,-0.417656,0.859915,-0.492460,3.0
008ba489-31ad-44d8-856b-fcf72369dc46,1,-1.521214,-1.190178,-1.523487,-1.213530,-1.546543,-1.541268,-1.221698,-1.227298,-1.235239,-1.225759,...,-0.179656,1.460108,-1.407821,-0.835642,0.431621,-0.679282,-0.728698,0.803331,-0.548814,3.0
008c1c9e-aeef-40c5-846c-24f1b964f884,0,-0.862730,-0.862510,-0.873183,-0.887877,-0.244271,-0.250703,-0.580514,-0.588100,-0.599223,-0.594766,...,-0.274683,-0.433811,-0.499212,-0.401558,-0.518448,-0.623283,-0.423602,0.220028,0.460474,3.0
008c1c9e-aeef-40c5-846c-24f1b964f884,1,-1.191972,-0.534842,-0.548032,-0.236572,-0.895407,-0.573345,-0.259922,-0.588100,-0.599223,-0.910263,...,-0.795560,-0.810475,-0.432299,-1.116004,-0.312412,-0.334785,-0.321448,0.377785,0.554527,3.0
00bf9f83-2e8f-47cf-a4f2-97f2beceebc1,0,-0.204245,-0.207174,-0.222880,0.089081,-0.244271,0.071938,-0.259922,-0.268500,0.354801,-0.279269,...,-0.047083,-0.887753,-0.277049,0.279953,-0.308101,-0.575058,-0.732071,-0.610926,-0.630125,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff5f97db-9b64-4e35-afe8-af463d5c2c60,1,-0.204245,-0.207174,-0.548032,-0.236572,-0.569839,0.071938,-0.259922,-0.268500,0.354801,-0.279269,...,1.493505,-0.520958,-0.008370,-1.232757,-0.716252,0.380512,-0.329251,0.090799,0.319342,1.0
ff8435f6-76b5-42c1-8f4c-7479710e71bf,0,-0.533487,-0.534842,-0.548032,-0.562225,-0.244271,-0.573345,-0.580514,-0.268500,-0.281215,-0.279269,...,-0.610057,2.835017,-0.245060,0.388082,-0.015478,0.026173,-0.811514,-0.478413,-0.627483,3.0
ff8435f6-76b5-42c1-8f4c-7479710e71bf,1,-0.533487,-0.207174,-0.222880,0.089081,0.081297,-0.250703,-0.259922,0.051099,-0.281215,-0.594766,...,-0.400734,2.797139,-0.285120,0.638830,0.800611,-0.287016,-0.789991,-0.580027,-0.554440,3.0
ff8bfcc9-3df2-4752-8280-63f023fba31c,0,0.124997,0.120494,0.102272,0.089081,-0.244271,-0.250703,-0.259922,-0.268500,0.036793,0.036227,...,-0.807935,-0.932226,-0.072789,-0.718353,-0.397488,-0.083521,0.146171,-0.579935,-0.673398,1.0


#### Normalisation

In [140]:
# For case where is_user_features=False
X_coarse_n = standardize(X_coarse_n, 0, -1)
X_fine_n = standardize(X_fine_n, 0, -1)
X_no_n = standardize(X_no_n, 0, -1)

#### Drop correlated features

In [142]:
# Drop features with a Pearson correlation > 0.9 to prevent multicollinearity
X_no_n = remove_correlated_features(X_no_n, 0.95)
X_coarse_n = remove_correlated_features(X_coarse_n, 0.95)
X_fine_n = remove_correlated_features(X_fine_n, 0.95)

### Train model for each expert without additional features

In [147]:
coarse_results_experts_n = expert_models(X_coarse_n, coarse_labels_n)
fine_results_experts_n = expert_models(X_fine_n, fine_labels_n)
no_results_experts_n = expert_models(X_no_n, no_labels_n)

In [149]:
coarse_results_experts_n['Data'] = 'coarse'
fine_results_experts_n['Data'] = 'fine'
no_results_experts_n['Data'] = 'no_segmentation'
display(coarse_results_experts_n, fine_results_experts_n, no_results_experts_n)

Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.654778,0.56886,0.638187,coarse
1,SVM,0.66709,0.555536,0.63545,coarse
2,LDA,0.674634,0.566695,0.6277,coarse
3,KNN,0.629359,0.537272,0.64545,coarse
4,GaussianNB,0.620304,0.504691,0.547275,coarse
5,DecisionTree,0.589397,0.53086,0.590049,coarse
6,RandomForest,0.656887,0.518623,0.656826,coarse
7,GradientBoosting,0.608157,0.523568,0.65622,coarse


Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.651857,0.576043,0.650634,fine
1,SVM,0.656575,0.575327,0.655019,fine
2,LDA,0.665062,0.576502,0.651679,fine
3,KNN,0.633579,0.598581,0.666519,fine
4,GaussianNB,0.623828,0.541054,0.590206,fine
5,DecisionTree,0.573791,0.507718,0.566471,fine
6,RandomForest,0.648308,0.54503,0.658535,fine
7,GradientBoosting,0.569641,0.52884,0.663622,fine


Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.661438,0.509811,0.597696,no_segmentation
1,SVM,0.631391,0.514491,0.598442,no_segmentation
2,LDA,0.647578,0.507826,0.586907,no_segmentation
3,KNN,0.570736,0.516984,0.564617,no_segmentation
4,GaussianNB,0.560218,0.576621,0.65257,no_segmentation
5,DecisionTree,0.529014,0.531616,0.588228,no_segmentation
6,RandomForest,0.614096,0.538977,0.607519,no_segmentation
7,GradientBoosting,0.592311,0.497984,0.584409,no_segmentation


In [150]:
# Compare: with expert features:
display(coarse_results_experts, fine_results_experts, no_results_experts)

Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.637157,0.539389,0.656209,coarse
1,SVM,0.644801,0.526959,0.641057,coarse
2,LDA,0.657257,0.531174,0.642537,coarse
3,KNN,0.635085,0.529898,0.640847,coarse
4,GaussianNB,0.615633,0.509398,0.544291,coarse
5,DecisionTree,0.58495,0.525679,0.593299,coarse
6,RandomForest,0.656332,0.49657,0.660337,coarse
7,GradientBoosting,0.619253,0.532204,0.666541,coarse


Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.640347,0.555213,0.661229,fine
1,SVM,0.631595,0.551421,0.657826,fine
2,LDA,0.6333,0.55582,0.660011,fine
3,KNN,0.634405,0.609653,0.667124,fine
4,GaussianNB,0.63464,0.52576,0.592695,fine
5,DecisionTree,0.607212,0.53332,0.631005,fine
6,RandomForest,0.658725,0.55781,0.697429,fine
7,GradientBoosting,0.610539,0.534997,0.699068,fine


Unnamed: 0,Models,Exp_1_AUC,Exp_2_AUC,Exp_3_AUC,Data
0,LogisticRegression,0.624789,0.478895,0.601293,no_segmentation
1,SVM,0.651701,0.480037,0.602699,no_segmentation
2,LDA,0.644702,0.487214,0.602357,no_segmentation
3,KNN,0.559225,0.520973,0.578345,no_segmentation
4,GaussianNB,0.552034,0.566775,0.639811,no_segmentation
5,DecisionTree,0.515731,0.530172,0.56267,no_segmentation
6,RandomForest,0.601457,0.509614,0.634973,no_segmentation
7,GradientBoosting,0.585384,0.489799,0.614388,no_segmentation


### Train model for all data without additional features

In [151]:
oversampling = True
coarse_results_n = AUC_all_models(X_coarse_n, coarse_labels_n.Label, k=6, oversampling=oversampling)
fine_results_n = AUC_all_models(X_fine_n, fine_labels_n.Label, k=6, oversampling=oversampling)
no_results_n = AUC_all_models(X_no_n, no_labels_n.Label, k=6, oversampling=oversampling)

In [152]:
coarse_results_n = coarse_results_n.rename(columns={'AUC (mean)': "Coarse_AUC"})
fine_results_n = fine_results_n.rename(columns={'AUC (mean)': "Fine_AUC"})
no_results_n = no_results_n.rename(columns={'AUC (mean)': "No_Seg_AUC"})

pd.concat([coarse_results_n, fine_results_n["Fine_AUC"], no_results_n["No_Seg_AUC"]], axis=1, sort=False)

Unnamed: 0,Models,Coarse_AUC,Fine_AUC,No_Seg_AUC
0,LogisticRegression,0.665278,0.668373,0.652621
1,SVM,0.660725,0.669927,0.640732
2,LDA,0.66549,0.668754,0.656511
3,KNN,0.566628,0.60252,0.574999
4,GaussianNB,0.499666,0.574214,0.614173
5,DecisionTree,0.614619,0.62145,0.568382
6,RandomForest,0.674398,0.66109,0.663244
7,GradientBoosting,0.671097,0.671115,0.648956
