# Feature selection by category or reference paper
This tests all feature sets by classifier

In [108]:
import scipy
scipy.__version__

'1.7.3'

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import chi2


In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("OSA_complete_patients.csv", index_col = 0)
# this data file was created using the RF_no_nan_data.ipynb file which should be in the missing value imputation folder

In [3]:
data.head()

Unnamed: 0,PatientID,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,...,Nocturnal_perspiration,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Severity
39,23,2.0,57.883641,0.0,0.0,0.0,172.0,90.0,45.0,125.0,...,0.0,0.0,1.0,0.0,0.0,0.0,3.0,4.0,5.0,3
41,24,2.0,60.796715,0.0,0.0,0.0,156.0,85.0,35.0,113.0,...,0.0,1.0,1.0,0.0,0.0,1.0,19.0,17.0,4.0,3
46,28,1.0,63.438741,0.0,0.0,0.0,178.0,68.0,35.0,73.0,...,0.0,0.0,1.0,0.0,0.0,1.0,5.0,3.0,0.0,1
55,32,1.0,28.736482,0.0,0.0,0.0,180.0,69.0,36.0,83.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0
56,33,1.0,56.80219,0.0,0.0,0.0,185.0,118.0,43.0,106.0,...,0.0,0.0,0.0,0.0,0.0,1.0,12.0,13.0,2.0,3


In [5]:
age_gender = data.groupby(['Severity', 'Sex']).agg({'Age': ['mean','min', 'max']})
age_gender

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max
Severity,Sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1.0,47.422456,18.297057,86.464066
0,2.0,45.231892,18.370979,85.730322
1,1.0,51.150186,18.220397,89.475702
1,2.0,49.89179,18.031485,90.800821
2,1.0,53.541034,18.376454,91.211499
2,2.0,53.678438,18.231348,88.224504
3,1.0,56.539941,18.880219,111.868583
3,2.0,57.994343,19.370294,91.964408


In [4]:
features = ['Sex', 'Age', 'Current_smoker', 'Former_smoker',
       'Sedentary', 'Height', 'Weight', 'Cervical_perimeter',
       'Abdominal_perimeter', 'Systolic_BP', 'Diastolic_BP',
       'Maxillofacial_profile', 'BMI', 'High_BP', 'Asthma', 'Rhinitis', 'COPD',
       'Respiratory_fail', 'Myocardial_infarct', 'Coronary_fail',
       'Arrhythmias', 'Stroke', 'Heart_fail', 'Arteriopathy', 'Gastric_reflux',
       'Glaucoma', 'Diabetes', 'Hypercholesterolemia', 'Hypertriglyceridemia',
       'Hypo(er)thyroidism', 'Depression', 'Obesity', 'Dysmorphology',
       'Restless_Leg_Syndrome', 'Snoring', 'Diurnal_somnolence',
       'Driving_drowsiness', 'Morning_fatigue', 'Morning_headache',
       'Memory_problem', 'Nocturnal_perspiration',
       'Shortness_of_breath_on_exertion', 'Nocturia', 'Drowsiness_accident',
       'Near_miss_accident', 'Respiratory_arrest', 'Epworth_scale',
       'Pichots_scale', 'Depression_scale']

X=data[features]  # Features  
y=data['Severity']  # Labels

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.2,
                                                    random_state=0)

In [10]:
continuous = ['Age','Height','Weight','Cervical_perimeter','Abdominal_perimeter','Systolic_BP','Diastolic_BP', 
              'BMI','Epworth_scale','Pichots_scale','Depression_scale']
categorical = [i for i in features if i not in continuous]

### Correlation

In [5]:
correlations = data[data.columns[1:]].corr()['Severity'][:-1]  # Correlation of every column compared to Severity

In [42]:
corr_sort = pd.DataFrame(correlations.sort_values(ascending=False, key=abs)) 
corr_sort

Unnamed: 0,Severity
Cervical_perimeter,0.32853
Abdominal_perimeter,0.291886
Age,0.273949
Weight,0.234937
Sex,-0.220857
Respiratory_arrest,0.193334
BMI,0.18837
High_BP,0.173079
Systolic_BP,0.156997
Nocturia,0.124376


### Kruskal/ Chi

In [7]:
def kruskall_pval(df):
    p_vals = []
    for feature in continuous:
        p_val = stats.kruskal(*[group[feature].values for name, group in df.groupby("Severity")])[1]
        p_vals.append(p_val)
    return p_vals

In [11]:
k_pvals = kruskall_pval(data)

In [12]:
def chisq(df):
    p_vals = []
    for feature in categorical:
        table = pd.crosstab(df[feature],df['Severity'],margins = False)
        stat, p, dof, expected = chi2_contingency(table)
        p_vals.append(p)
       
    return p_vals

In [13]:
chisq_pvals = chisq(data)

In [14]:
all_pvals = k_pvals+ chisq_pvals
all_pvals = pd.DataFrame(all_pvals)
rows = continuous+categorical
all_pvals.index = rows
all_pvals

Unnamed: 0,0
Age,0.0
Height,4.4409939999999995e-65
Weight,7.317465000000001e-290
Cervical_perimeter,0.0
Abdominal_perimeter,0.0
Systolic_BP,1.986951e-195
Diastolic_BP,7.259439e-91
BMI,9.362688999999999e-224
Epworth_scale,3.773289e-13
Pichots_scale,2.117126e-21


In [41]:
chi_kru_sort = all_pvals.sort_values(by=[0], ascending = True)
chi_kru_sort

Unnamed: 0,0
Age,0.0
Cervical_perimeter,0.0
Abdominal_perimeter,0.0
Weight,7.317465000000001e-290
Sex,7.31781e-232
BMI,9.362688999999999e-224
Systolic_BP,1.986951e-195
Respiratory_arrest,2.4878409999999996e-184
High_BP,2.449168e-142
Diastolic_BP,7.259439e-91


### Random forest

In [15]:
clf=RandomForestClassifier(n_estimators=100, class_weight='balanced')  #defining classifier method

clf.fit(X_train,y_train)                   # training classifier

rf_importances = clf.feature_importances_   #importances

In [47]:
rf_importances_df = pd.DataFrame({'features': features, 'importances': rf_importances}).sort_values(by=['importances'],ascending=False)


In [48]:
rf_importances_df = rf_importances_df.set_index(['features'])
rf_importances_df

Unnamed: 0_level_0,importances
features,Unnamed: 1_level_1
Age,0.095393
BMI,0.076245
Abdominal_perimeter,0.071071
Weight,0.066501
Height,0.061503
Cervical_perimeter,0.060524
Pichots_scale,0.060179
Epworth_scale,0.058707
Systolic_BP,0.053471
Depression_scale,0.048104


### Catboost 

In [17]:
clf = CatBoostClassifier(n_estimators = 100, silent=True)
clf.fit(X_train,y_train)                   # training classifier


<catboost.core.CatBoostClassifier at 0x217f10d8bb0>

In [28]:
cat_importances = clf.get_feature_importance()
cat_importances_df = pd.DataFrame({'features': features, 'importances': cat_importances}).sort_values(by=['importances'],ascending=False)

In [49]:
cat_importances_df = cat_importances_df.set_index(['features'])
cat_importances_df

Unnamed: 0_level_0,importances
features,Unnamed: 1_level_1
Age,13.216816
Cervical_perimeter,8.231543
BMI,7.374445
Diastolic_BP,7.241304
Epworth_scale,6.467688
Systolic_BP,6.354604
Abdominal_perimeter,6.055924
Height,5.359006
Weight,4.523434
Respiratory_arrest,4.509884


In [50]:
cat_importances_df.shape

(49, 1)

### Top 15s

In [52]:
corr_sort_15 = corr_sort.head(15)
corr_sort_15

Unnamed: 0,Severity
Cervical_perimeter,0.32853
Abdominal_perimeter,0.291886
Age,0.273949
Weight,0.234937
Sex,-0.220857
Respiratory_arrest,0.193334
BMI,0.18837
High_BP,0.173079
Systolic_BP,0.156997
Nocturia,0.124376


In [53]:
chi_kru_sort_15 = chi_kru_sort.head(15)
chi_kru_sort_15

Unnamed: 0,0
Age,0.0
Cervical_perimeter,0.0
Abdominal_perimeter,0.0
Weight,7.317465000000001e-290
Sex,7.31781e-232
BMI,9.362688999999999e-224
Systolic_BP,1.986951e-195
Respiratory_arrest,2.4878409999999996e-184
High_BP,2.449168e-142
Diastolic_BP,7.259439e-91


In [54]:
rf_importances_15 = rf_importances_df.head(15)
rf_importances_15

Unnamed: 0_level_0,importances
features,Unnamed: 1_level_1
Age,0.095393
BMI,0.076245
Abdominal_perimeter,0.071071
Weight,0.066501
Height,0.061503
Cervical_perimeter,0.060524
Pichots_scale,0.060179
Epworth_scale,0.058707
Systolic_BP,0.053471
Depression_scale,0.048104


In [55]:
cat_importances_15 = cat_importances_df.head(15)
cat_importances_15

Unnamed: 0_level_0,importances
features,Unnamed: 1_level_1
Age,13.216816
Cervical_perimeter,8.231543
BMI,7.374445
Diastolic_BP,7.241304
Epworth_scale,6.467688
Systolic_BP,6.354604
Abdominal_perimeter,6.055924
Height,5.359006
Weight,4.523434
Respiratory_arrest,4.509884


In [94]:
multi_imp = pd.concat([chi_kru_sort_15, corr_sort_15, rf_importances_15, cat_importances_15], axis = 1)
multi_imp.columns = ['chi2/kruskal', 'correlation', 'RandomForest', 'CatBoost']

In [95]:
multi_imp

Unnamed: 0,chi2/kruskal,correlation,RandomForest,CatBoost
Age,0.0,0.273949,0.095393,13.216816
Cervical_perimeter,0.0,0.32853,0.060524,8.231543
Abdominal_perimeter,0.0,0.291886,0.071071,6.055924
Weight,7.317465000000001e-290,0.234937,0.066501,4.523434
Sex,7.31781e-232,-0.220857,,1.957175
BMI,9.362688999999999e-224,0.18837,0.076245,7.374445
Systolic_BP,1.986951e-195,0.156997,0.053471,6.354604
Respiratory_arrest,2.4878409999999996e-184,0.193334,0.015524,4.509884
High_BP,2.449168e-142,0.173079,,
Diastolic_BP,7.259439e-91,0.11873,0.047907,7.241304


In [96]:
normalized_df=(multi_imp-multi_imp.min())/(multi_imp.max()-multi_imp.min())

In [97]:
normalized_df

Unnamed: 0,chi2/kruskal,correlation,RandomForest,CatBoost
Age,0.0,0.90065,1.0,1.0
Cervical_perimeter,0.0,1.0,0.573076,0.572449
Abdominal_perimeter,0.0,0.9333,0.7022,0.385862
Weight,4.813555999999999e-240,0.829642,0.646248,0.254432
Sex,4.813783e-182,0.0,,0.034342
BMI,6.15894e-174,0.744878,0.765561,0.498942
Systolic_BP,1.307051e-145,0.687774,0.486717,0.411478
Respiratory_arrest,1.636545e-134,0.753914,0.022093,0.253269
High_BP,1.611105e-92,0.717046,,
Diastolic_BP,4.775385e-41,0.618119,0.418594,0.487524


In [101]:
from sklearn.preprocessing import MinMaxScaler
scaled_features = MinMaxScaler(feature_range=(0.1, 1)).fit_transform(multi_imp.values)
scaled_features_df = pd.DataFrame(scaled_features, index=multi_imp.index, columns=multi_imp.columns)
scaled_features_df

Unnamed: 0,chi2/kruskal,correlation,RandomForest,CatBoost
Age,0.1,0.910585,1.0,1.0
Cervical_perimeter,0.1,1.0,0.615768,0.615204
Abdominal_perimeter,0.1,0.93997,0.73198,0.447276
Weight,0.1,0.846677,0.681623,0.328988
Sex,0.1,0.1,,0.130908
BMI,0.1,0.770391,0.789005,0.549048
Systolic_BP,0.1,0.718996,0.538045,0.47033
Respiratory_arrest,0.1,0.778523,0.119884,0.327943
High_BP,0.1,0.745341,,
Diastolic_BP,0.1,0.656307,0.476735,0.538771
