# Feature selection by category or reference paper
This tests all classifiers by feature set

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import MinMaxScaler 

In [2]:
data = pd.read_csv("OSA_complete_patients.csv", index_col = 0)
# this data file was created using the RF_no_nan_data.ipynb file which should be in the missing value imputation folder

In [3]:
data.head()

Unnamed: 0,PatientID,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,...,Nocturnal_perspiration,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Severity
39,23,2.0,57.883641,0.0,0.0,0.0,172.0,90.0,45.0,125.0,...,0.0,0.0,1.0,0.0,0.0,0.0,3.0,4.0,5.0,3
41,24,2.0,60.796715,0.0,0.0,0.0,156.0,85.0,35.0,113.0,...,0.0,1.0,1.0,0.0,0.0,1.0,19.0,17.0,4.0,3
46,28,1.0,63.438741,0.0,0.0,0.0,178.0,68.0,35.0,73.0,...,0.0,0.0,1.0,0.0,0.0,1.0,5.0,3.0,0.0,1
55,32,1.0,28.736482,0.0,0.0,0.0,180.0,69.0,36.0,83.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0
56,33,1.0,56.80219,0.0,0.0,0.0,185.0,118.0,43.0,106.0,...,0.0,0.0,0.0,0.0,0.0,1.0,12.0,13.0,2.0,3


In [4]:
features = ['Sex', 'Age', 'Current_smoker', 'Former_smoker',
       'Sedentary', 'Height', 'Weight', 'Cervical_perimeter',
       'Abdominal_perimeter', 'Systolic_BP', 'Diastolic_BP',
       'Maxillofacial_profile', 'BMI', 'High_BP', 'Asthma', 'Rhinitis', 'COPD',
       'Respiratory_fail', 'Myocardial_infarct', 'Coronary_fail',
       'Arrhythmias', 'Stroke', 'Heart_fail', 'Arteriopathy', 'Gastric_reflux',
       'Glaucoma', 'Diabetes', 'Hypercholesterolemia', 'Hypertriglyceridemia',
       'Hypo(er)thyroidism', 'Depression', 'Obesity', 'Dysmorphology',
       'Restless_Leg_Syndrome', 'Snoring', 'Diurnal_somnolence',
       'Driving_drowsiness', 'Morning_fatigue', 'Morning_headache',
       'Memory_problem', 'Nocturnal_perspiration',
       'Shortness_of_breath_on_exertion', 'Nocturia', 'Drowsiness_accident',
       'Near_miss_accident', 'Respiratory_arrest', 'Epworth_scale',
       'Pichots_scale', 'Depression_scale']

### Wu et al. [25] 
age; body mass index; Epworth Sleepiness
Scale; waistline; neck circumference; and
difference of blood pressure before going
to sleep and early in the morning

In [5]:
wu = ['Age','BMI','Epworth_scale',  'Abdominal_perimeter', 'Cervical_perimeter','Systolic_BP', 'Diastolic_BP']
# The Wu paper included a feature described as "The difference of blood presure before going to sleep and early in the morning." 
# This dataset did not include that feature, so it is substituted here with disastolic and systolic BP

### Mencar et al. [15]
body mass index; gender; and Epworth
Sleepiness Scale

In [6]:
mencar = ['BMI', 'Sex', 'Epworth_scale']

### Huang et al. [12] 
age; waistline; neck circumference; snoring;
sleep onset latency; and witnessed apnea

In [7]:
huang = ['Age','Cervical_perimeter', 'Snoring']
# The Huang paper included the features: sleep onset latency and witnessed apnea
# This dataset did not included sleep onset latency data, and including results for witnessed apnea would skew the results so it was not included

### Ustun et al. [21] 
age; body mass index; gender; diabetes; hypertension; and tabagism (smoker status)

In [8]:
ustun = ['Age','BMI', 'Sex','Diabetes', 'High_BP', 'Current_smoker', 'Former_smoker']

### Rodruiges Jr et al.
age; nocturia frequency; body mass index;
depression score; neck circumference; hip
measurement; diastolic blood pressure; and
tabagism

In [9]:
rodruiges = ['Age', 'Nocturia', 'BMI', 'Depression_scale', 'Cervical_perimeter', 'Abdominal_perimeter', 'Diastolic_BP', 'Current_smoker', 'Former_smoker' ]

### Categories

In [10]:
demographic = ['Sex','Age','Current_smoker','Former_smoker','Sedentary']

In [11]:
measurements = ['Height','Weight','Cervical_perimeter',
               'Abdominal_perimeter','Systolic_BP','Diastolic_BP','Maxillofacial_profile','BMI','High_BP']

In [12]:
comorbidities = ['Asthma','Rhinitis','COPD','Respiratory_fail','Myocardial_infarct','Coronary_fail','Arrhythmias','Stroke',
                 'Heart_fail','Arteriopathy','Gastric_reflux','Glaucoma','Diabetes','Hypercholesterolemia','Hypertriglyceridemia',
                 'Hypo(er)thyroidism','Depression','Obesity','Dysmorphology','Restless_Leg_Syndrome']

In [13]:
symptoms=['Snoring','Diurnal_somnolence','Driving_drowsiness','Morning_fatigue','Morning_headache','Memory_problem',
          'Nocturnal_perspiration','Shortness_of_breath_on_exertion','Nocturia','Drowsiness_accident','Near_miss_accident',
          'Respiratory_arrest']

In [14]:
questionnaires = ['Epworth_scale','Pichots_scale','Depression_scale']

## Models

### All features

In [22]:
X=data[features]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [23]:
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression(class_weight='balanced', random_state = 0, solver = 'lbfgs', max_iter = 2000)

# Logistic Regression Cross-Validated
from sklearn.linear_model import LogisticRegressionCV
models['Logistic Regression CV'] = LogisticRegressionCV(class_weight='balanced', random_state = 0, cv = 5, max_iter = 2000)


# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier(class_weight='balanced', random_state = 0)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier(class_weight='balanced', random_state = 0, n_estimators = 100)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

# Ridge Classifier
from sklearn.linear_model import RidgeClassifier
models['Ridge Classifier']= RidgeClassifier(class_weight='balanced', random_state = 0)

# Ridge cross-validated classifier
#from sklearn.linear_model import RidgeClassifierCV
#models['Ridge Classifier CV']=RidgeClassifierCV( class_weight='balanced')   # this takes about 15 minutes to run

#XGBoost
from xgboost import XGBClassifier
models['XGBoost']=XGBClassifier()

#LightGBM
from lightgbm import LGBMClassifier
models['LGBM']=LGBMClassifier()

#CATboost
from catboost import CatBoostClassifier
models['CatBoost']=CatBoostClassifier(silent=True)

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
models['AdaBoost']=AdaBoostClassifier()

# Multilayer Perceptron
from sklearn.neural_network import MLPClassifier
models['MLP']=MLPClassifier(random_state=0, early_stopping=True)

# Support Vector Machines
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC(class_weight='balanced', random_state = 0, max_iter = 2000)





In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy, precision, recall = {}, {}, {}

for key in models.keys():
    if key == "Support Vector Machines" or key == "MLP":
        scaler = MinMaxScaler()  
        scaler.fit(X_train)  
        X_train = scaler.transform(X_train)  
        X_test = scaler.transform(X_test) 
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.383733,0.487618,0.430568
Logistic Regression CV,0.384119,0.491334,0.431256
Decision Trees,0.32671,0.353806,0.359991
Random Forest,0.400191,0.589538,0.459899
Naive Bayes,0.336308,0.555941,0.341659
K-Nearest Neighbor,0.337932,0.373553,0.374198
Ridge Classifier,0.371488,0.594463,0.430797
XGBoost,0.398125,0.534855,0.453483
LGBM,0.411017,0.573697,0.464253
CatBoost,0.404042,0.54954,0.460357


### Wu

In [19]:
X=data[wu]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [20]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)


In [21]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.354452,0.507652,0.412236
Logistic Regression CV,0.348527,0.535295,0.406279
Support Vector Machines,0.355237,0.472685,0.359303
Decision Trees,0.295904,0.332837,0.335243
Random Forest,0.369182,0.52099,0.431714
Naive Bayes,0.363428,0.519763,0.430797
K-Nearest Neighbor,0.328958,0.370671,0.367324
Ridge Classifier,0.341477,0.617684,0.407195
MLP,0.33841,0.618018,0.413382


### Mencar

In [22]:
X=data[mencar]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [23]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [24]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.32717,0.470838,0.381072
Logistic Regression CV,0.306323,0.475824,0.36549
Support Vector Machines,0.211552,0.957352,0.230293
Decision Trees,0.286488,0.311643,0.318973
Random Forest,0.289141,0.325302,0.328139
Naive Bayes,0.322799,0.748628,0.417049
K-Nearest Neighbor,0.303898,0.349032,0.346471
Ridge Classifier,0.321396,0.532002,0.380385
MLP,0.349358,0.652753,0.414986


### Huang

In [25]:
X=data[huang]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [26]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train) 
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [27]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.34083,0.511773,0.403987
Logistic Regression CV,0.342891,0.5461,0.401008
Support Vector Machines,0.423209,0.657636,0.417049
Decision Trees,0.292635,0.322247,0.327452
Random Forest,0.296316,0.336766,0.338451
Naive Bayes,0.3633,0.744792,0.433089
K-Nearest Neighbor,0.322832,0.375753,0.371907
Ridge Classifier,0.341657,0.632695,0.399633
MLP,0.329668,0.719572,0.432172


### Ustun

In [28]:
X=data[ustun]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [29]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)


In [30]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.347905,0.533034,0.410174
Logistic Regression CV,0.345622,0.541032,0.406966
Support Vector Machines,0.491574,0.93355,0.263291
Decision Trees,0.297054,0.330455,0.333639
Random Forest,0.329145,0.412317,0.381302
Naive Bayes,0.38218,0.575815,0.423694
K-Nearest Neighbor,0.320618,0.358086,0.357699
Ridge Classifier,0.350226,0.67355,0.40857
MLP,0.369962,0.752395,0.439734


### Rodruiges

In [31]:
X=data[rodruiges]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [32]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)


In [33]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.358537,0.508651,0.413841
Logistic Regression CV,0.351643,0.544532,0.405133
Support Vector Machines,0.235793,1.0,0.235793
Decision Trees,0.315634,0.353524,0.355637
Random Forest,0.359687,0.525303,0.426673
Naive Bayes,0.37382,0.513899,0.430797
K-Nearest Neighbor,0.333937,0.378249,0.376031
Ridge Classifier,0.34878,0.625472,0.406049
MLP,0.314264,0.838321,0.409716


### Demographic

In [34]:
X=data[demographic]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [35]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [36]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.317555,0.52545,0.380156
Logistic Regression CV,0.329903,0.545855,0.380156
Support Vector Machines,0.409735,0.661222,0.360678
Decision Trees,0.285669,0.311106,0.317369
Random Forest,0.287597,0.319611,0.322869
Naive Bayes,0.349518,0.629289,0.412924
K-Nearest Neighbor,0.307719,0.346075,0.344638
Ridge Classifier,0.303715,0.640059,0.375802
MLP,0.372866,0.777488,0.429652


### Measurements

In [37]:
X=data[measurements]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [38]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')



In [39]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.332871,0.486106,0.381302
Logistic Regression CV,0.323828,0.508997,0.377864
Support Vector Machines,0.318348,0.561348,0.386114
Decision Trees,0.298096,0.33258,0.335243
Random Forest,0.340989,0.501589,0.412924
Naive Bayes,0.340774,0.47704,0.402841
K-Nearest Neighbor,0.308274,0.349211,0.347388
Ridge Classifier,0.332933,0.549148,0.382676
MLP,0.343658,0.640697,0.42736


### Comorbidities

In [40]:
X=data[comorbidities]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [41]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)


In [42]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.309366,0.480083,0.296746
Logistic Regression CV,0.316813,0.516763,0.290101
Support Vector Machines,0.288614,0.581302,0.370761
Decision Trees,0.298928,0.447817,0.265811
Random Forest,0.300427,0.448147,0.275435
Naive Bayes,0.291162,0.662971,0.244042
K-Nearest Neighbor,0.278853,0.355243,0.304766
Ridge Classifier,0.307874,0.536911,0.288497
MLP,0.333911,0.939781,0.403071


### Symptoms

In [43]:
X=data[symptoms]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [44]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)


In [45]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.306705,0.497121,0.354262
Logistic Regression CV,0.298773,0.504151,0.347617
Support Vector Machines,0.288707,0.552216,0.388634
Decision Trees,0.298088,0.385342,0.328598
Random Forest,0.297235,0.381188,0.329743
Naive Bayes,0.284824,0.764886,0.410174
K-Nearest Neighbor,0.266828,0.351097,0.326764
Ridge Classifier,0.306791,0.528191,0.351742
MLP,0.326923,0.734839,0.417736


### Questionnaires

In [46]:
X=data[questionnaires]  # Features  
y=data['Severity']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

In [47]:
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)  
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = balanced_accuracy_score(predictions, y_test) 
    precision[key] = precision_score(predictions, y_test, average='weighted')
    recall[key] = recall_score(predictions, y_test, average='weighted')

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [48]:
df_model = pd.DataFrame(index=models.keys())
df_model['Balanced Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Balanced Accuracy,Precision,Recall
Logistic Regression,0.279451,0.375371,0.31439
Logistic Regression CV,0.272226,0.385391,0.305683
Support Vector Machines,0.33282,0.676412,0.37901
Decision Trees,0.25632,0.255149,0.259395
Random Forest,0.2587,0.270305,0.281393
Naive Bayes,0.326631,0.996691,0.402841
K-Nearest Neighbor,0.257686,0.304312,0.299038
Ridge Classifier,0.277921,0.385994,0.308891
MLP,0.23405,0.970207,0.400779
