## Viewing associations between severity and features using incorrectly classified samples (Random Forest Classification)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from scipy.stats import chi2_contingency
from scipy.stats import chi2

In [2]:
# Load the patient table; the relative path means the CSV is expected in the notebook's working directory.
data = pd.read_csv("OSA_complete_patients.csv")
# Missing values in the original df were replaced with 0 for categorical and the average for continuous variables.

### Train Test Split


In [3]:
# Predictor columns selected from `data`; the label column 'Severity' is kept separate.
features = [
    'Sex', 'Age', 'Current_smoker', 'Former_smoker', 'Sedentary',
    'Height', 'Weight', 'Cervical_perimeter', 'Abdominal_perimeter',
    'Systolic_BP', 'Diastolic_BP', 'Maxillofacial_profile', 'BMI',
    'High_BP', 'Asthma', 'Rhinitis', 'COPD', 'Respiratory_fail',
    'Myocardial_infarct', 'Coronary_fail', 'Arrhythmias', 'Stroke',
    'Heart_fail', 'Arteriopathy', 'Gastric_reflux', 'Glaucoma', 'Diabetes',
    'Hypercholesterolemia', 'Hypertriglyceridemia', 'Hypo(er)thyroidism',
    'Depression', 'Obesity', 'Dysmorphology', 'Restless_Leg_Syndrome',
    'Snoring', 'Diurnal_somnolence', 'Driving_drowsiness', 'Morning_fatigue',
    'Morning_headache', 'Memory_problem', 'Nocturnal_perspiration',
    'Shortness_of_breath_on_exertion', 'Nocturia', 'Drowsiness_accident',
    'Near_miss_accident', 'Respiratory_arrest', 'Epworth_scale',
    'Pichots_scale', 'Depression_scale',
]
X = data[features]    # feature matrix
y = data['Severity']  # labels

In [4]:
# Hold out 20% of the rows for testing; the split is shuffled with a fixed seed and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=True, stratify=y
)

### Random Forest Classifier

In [5]:
# Define a 100-tree random forest with balanced class weights and a fixed seed, then train it.
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=0,
)
clf.fit(X_train, y_train)

# Predict the held-out rows and report overall accuracy.
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.4598991750687443


### Creating df of actual vs predicted results

In [6]:
# Pair each test label with its prediction; y_test supplies the row index.
accuracy_df = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred,
    }
)
#accuracy_df.to_csv("accuracy_df.csv")

In [7]:
accuracy_df.head()

Unnamed: 0,Actual,Predicted
2239,0,0
15290,1,3
12811,3,3
14762,0,1
19532,1,2


In [8]:
accuracy_df.shape

(4364, 2)

In [9]:
X_test.head()

Unnamed: 0,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,Systolic_BP,...,Memory_problem,Nocturnal_perspiration,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale
2239,2.0,34.863792,0.0,0.0,0.0,168.0,86.0,37.0,97.0,120.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,15.0,2.0
15290,1.0,56.731006,0.0,0.0,1.0,157.0,78.0,40.0,98.0,125.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,19.0,24.0,13.0
12811,2.0,62.433949,0.0,0.0,0.0,150.0,55.0,34.0,92.0,135.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,21.0,17.0,0.0
14762,2.0,28.91718,0.0,0.0,0.0,163.0,89.0,41.0,115.0,120.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0
19532,2.0,55.748118,0.0,0.0,1.0,163.0,73.0,36.0,89.0,180.0,...,0.0,1.0,1.0,1.0,0.0,0.0,1.0,3.0,11.0,1.0


In [10]:
X_test.shape

(4364, 49)

### Merging X_test and accuracy dataframes 

In [11]:
# Index-aligned join that appends the Actual/Predicted columns to the test features.
feat_sev_all = X_test.join(accuracy_df, how="left")

In [12]:
feat_sev_all.shape

(4364, 51)

In [13]:
feat_sev_all.head()

Unnamed: 0,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,Systolic_BP,...,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Actual,Predicted
2239,2.0,34.863792,0.0,0.0,0.0,168.0,86.0,37.0,97.0,120.0,...,1.0,0.0,0.0,0.0,0.0,5.0,15.0,2.0,0,0
15290,1.0,56.731006,0.0,0.0,1.0,157.0,78.0,40.0,98.0,125.0,...,1.0,1.0,0.0,1.0,0.0,19.0,24.0,13.0,1,3
12811,2.0,62.433949,0.0,0.0,0.0,150.0,55.0,34.0,92.0,135.0,...,1.0,1.0,0.0,0.0,0.0,21.0,17.0,0.0,3,3
14762,2.0,28.91718,0.0,0.0,0.0,163.0,89.0,41.0,115.0,120.0,...,1.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0,1
19532,2.0,55.748118,0.0,0.0,1.0,163.0,73.0,36.0,89.0,180.0,...,1.0,1.0,0.0,0.0,1.0,3.0,11.0,1.0,1,2


### Extracting incorrect predictions

In [14]:
# Keep only the misclassified rows (Actual differs from Predicted).
# A boolean mask is used rather than dropping by index label, so the result
# does not depend on the index labels being unique.
feat_sev_inc = feat_sev_all[feat_sev_all.Actual != feat_sev_all.Predicted]

In [15]:
feat_sev_inc.head()

Unnamed: 0,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,Systolic_BP,...,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Actual,Predicted
15290,1.0,56.731006,0.0,0.0,1.0,157.0,78.0,40.0,98.0,125.0,...,1.0,1.0,0.0,1.0,0.0,19.0,24.0,13.0,1,3
14762,2.0,28.91718,0.0,0.0,0.0,163.0,89.0,41.0,115.0,120.0,...,1.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0,1
19532,2.0,55.748118,0.0,0.0,1.0,163.0,73.0,36.0,89.0,180.0,...,1.0,1.0,0.0,0.0,1.0,3.0,11.0,1.0,1,2
19991,2.0,55.145791,1.0,0.0,0.0,165.0,80.0,36.0,100.0,155.0,...,1.0,1.0,0.0,0.0,0.0,12.0,11.0,13.0,2,1
20495,1.0,35.605749,0.0,1.0,0.0,182.0,73.0,36.0,87.0,120.0,...,1.0,0.0,0.0,1.0,0.0,16.0,28.0,12.0,1,0


In [16]:
feat_sev_inc.shape

(2357, 51)

In [17]:
#feat_sev_inc.to_csv("incorrect_severity_all.csv")

In [18]:
#feat_sev_inc = pd.read_csv("incorrect_severity_all.csv") # use this when you close and reopen the notebook so you're working with the same data

### Subsetting into smaller dataframes by actual severity values

#### Severity = 0

In [19]:
# Misclassified samples whose actual severity is 0.
inc_sev0 = feat_sev_inc.loc[feat_sev_inc['Actual'].eq(0)]

In [20]:
inc_sev0.head()

Unnamed: 0,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,Systolic_BP,...,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Actual,Predicted
14762,2.0,28.91718,0.0,0.0,0.0,163.0,89.0,41.0,115.0,120.0,...,1.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0,1
4348,1.0,47.15948,0.0,1.0,0.0,181.0,86.0,42.0,98.0,120.0,...,0.0,0.0,0.0,0.0,1.0,12.0,5.0,1.0,0,3
11439,2.0,24.366872,1.0,0.0,0.0,153.0,95.0,36.0,124.0,120.0,...,0.0,1.0,0.0,0.0,0.0,1.0,8.0,2.0,0,1
7295,1.0,34.151951,0.0,1.0,0.0,182.0,75.0,38.0,88.0,120.0,...,0.0,1.0,0.0,0.0,0.0,11.0,3.0,0.0,0,1
10661,1.0,43.953457,0.0,0.0,0.0,173.0,82.0,42.0,104.0,124.0,...,1.0,0.0,0.0,0.0,0.0,8.0,16.0,1.0,0,3


In [21]:
inc_sev0.shape

(434, 51)

#### Severity = 1

In [22]:
# Misclassified samples whose actual severity is 1.
inc_sev1 = feat_sev_inc.loc[feat_sev_inc['Actual'].eq(1)]

In [23]:
inc_sev1.head()

Unnamed: 0,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,Systolic_BP,...,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Actual,Predicted
15290,1.0,56.731006,0.0,0.0,1.0,157.0,78.0,40.0,98.0,125.0,...,1.0,1.0,0.0,1.0,0.0,19.0,24.0,13.0,1,3
19532,2.0,55.748118,0.0,0.0,1.0,163.0,73.0,36.0,89.0,180.0,...,1.0,1.0,0.0,0.0,1.0,3.0,11.0,1.0,1,2
20495,1.0,35.605749,0.0,1.0,0.0,182.0,73.0,36.0,87.0,120.0,...,1.0,0.0,0.0,1.0,0.0,16.0,28.0,12.0,1,0
2463,2.0,56.596851,0.0,0.0,1.0,165.0,90.0,38.0,109.0,140.0,...,1.0,0.0,0.0,0.0,0.0,1.0,10.0,4.0,1,3
17921,2.0,69.683778,0.0,0.0,0.0,149.0,88.0,42.0,125.0,130.0,...,0.0,1.0,0.0,0.0,0.0,6.0,17.0,12.0,1,3


In [24]:
inc_sev1.shape

(689, 51)

#### Severity = 2

In [25]:
# Misclassified samples whose actual severity is 2.
inc_sev2 = feat_sev_inc.loc[feat_sev_inc['Actual'].eq(2)]

In [26]:
inc_sev2.head()

Unnamed: 0,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,Systolic_BP,...,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Actual,Predicted
19991,2.0,55.145791,1.0,0.0,0.0,165.0,80.0,36.0,100.0,155.0,...,1.0,1.0,0.0,0.0,0.0,12.0,11.0,13.0,2,1
14269,1.0,66.669405,0.0,1.0,0.0,174.0,96.0,42.0,120.0,130.0,...,1.0,1.0,0.0,0.0,1.0,12.0,17.0,5.0,2,3
19377,2.0,58.414784,0.0,1.0,0.0,166.0,78.0,37.0,105.0,129.0,...,1.0,0.0,0.0,0.0,1.0,12.0,19.0,6.0,2,3
19625,2.0,67.063655,0.0,0.0,0.0,160.0,78.0,38.0,104.0,160.0,...,1.0,1.0,0.0,0.0,1.0,8.0,11.0,3.0,2,3
11742,1.0,59.975359,1.0,0.0,0.0,164.0,75.0,42.0,98.0,130.0,...,0.0,0.0,0.0,0.0,0.0,2.0,9.0,7.0,2,1


In [27]:
inc_sev2.shape

(896, 51)

#### Severity = 3

In [28]:
# Misclassified samples whose actual severity is 3.
inc_sev3 = feat_sev_inc.loc[feat_sev_inc['Actual'].eq(3)]

In [29]:
inc_sev3.head()

Unnamed: 0,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,Systolic_BP,...,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Actual,Predicted
16716,1.0,39.164956,0.0,0.0,0.0,175.0,75.0,30.0,92.0,120.0,...,0.0,0.0,0.0,0.0,0.0,1.0,5.0,1.0,3,1
2226,1.0,47.041752,0.0,1.0,0.0,176.0,88.0,36.0,94.0,160.0,...,1.0,1.0,0.0,0.0,0.0,13.0,16.0,6.0,3,2
7605,2.0,40.479124,0.0,1.0,0.0,160.0,80.0,34.0,93.0,120.0,...,1.0,1.0,0.0,0.0,1.0,9.0,3.0,9.0,3,1
1852,2.0,61.338809,0.0,0.0,0.0,165.0,57.0,33.0,81.0,140.0,...,1.0,1.0,0.0,0.0,0.0,7.0,14.0,3.0,3,1
13448,2.0,46.499658,0.0,1.0,0.0,165.0,98.0,41.0,118.0,140.0,...,1.0,0.0,0.0,0.0,0.0,6.0,3.0,2.0,3,2


In [30]:
inc_sev3.shape

(338, 51)

## Association testing

#### Will be using the Kruskal-Wallis test to test association between continuous variables and predicted severity
null hypothesis: the median value for the continuous feature is the same for all severities (p-value <0.05 indicates you can reject null hypothesis)
test assumptions: 
    1. Samples are random samples, or allocation to treatment group is random.
    2. The samples (one per severity group) are mutually independent.
    3. The measurement scale is at least ordinal, and the variable is continuous.

#continuous = ['PatientID','Age','Height','Weight','Cervical_perimeter','Abdominal_perimeter','Systolic_BP','Diastolic_BP',
              'BMI','Epworth_scale','Pichots_scale','Depression_scale']

#### The chi-square test of independence will be used to test association between categorical variables and predicted severity
(Note: this section previously named Spearman's correlation, but the code in this notebook applies `chi2_contingency`, i.e. the chi-square test.)
null hypothesis: there is no association between the feature and the severity (p-value <0.05 indicates you can reject null hypothesis)
test assumptions: 
      1. Both variables are categorical.
      2. The observations are independent of each other.
      3. The expected count in each cell of the contingency table is large enough (commonly at least 5). *** Check to make sure this assumption is met

### Kruskal Wallis

In [31]:
from scipy import stats

#### Creating a function to apply the Kruskal-Wallis test to continuous variables and return p-values in a list

In [32]:
continuous = ['Age','Height','Weight','Cervical_perimeter','Abdominal_perimeter','Systolic_BP','Diastolic_BP', 'BMI','Epworth_scale','Pichots_scale','Depression_scale']
def kruskall_pval(df, features=None, group_col="Predicted"):
    """Run a Kruskal-Wallis H-test per feature, comparing its values across groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_col`` and every column named in ``features``.
    features : list of str, optional
        Columns to test. Defaults to the module-level ``continuous`` list.
    group_col : str, default "Predicted"
        Column whose distinct values define the groups being compared.

    Returns
    -------
    list of float
        One p-value per feature, in the order of ``features``. An entry is
        NaN when the test cannot be computed for that feature: fewer than
        two groups are present in ``df``, or ``scipy.stats.kruskal`` rejects
        the input with a ``ValueError`` (for example when every observation
        is identical).
    """
    if features is None:
        features = continuous

    # The grouping does not depend on the feature, so split the frame once.
    groups = [group for _, group in df.groupby(group_col)]

    p_vals = []
    for feature in features:
        if len(groups) < 2:
            # kruskal needs at least two samples; report "undefined" instead of raising.
            p_vals.append(float("nan"))
            continue
        samples = [group[feature].values for group in groups]
        try:
            p_vals.append(stats.kruskal(*samples)[1])
        except ValueError:
            # One degenerate feature should not abort the whole loop.
            p_vals.append(float("nan"))
    return p_vals

In [33]:
# Kruskal-Wallis p-values for the misclassified samples, one list per actual severity (0-3).
sev0_inc_k_pvals, sev1_inc_k_pvals, sev2_inc_k_pvals, sev3_inc_k_pvals = (
    kruskall_pval(subset) for subset in (inc_sev0, inc_sev1, inc_sev2, inc_sev3)
)

In [34]:
sev0_inc_k_pvals

[1.748165280930878e-08,
 3.0162797038223118e-05,
 1.84483584185186e-15,
 6.628862706667371e-30,
 3.402837253403987e-22,
 2.231960646514679e-08,
 2.0109073996438527e-05,
 3.89185167693502e-12,
 0.6489928445801499,
 0.8970219138928452,
 0.9228404947089156]

### Chi square for categorical variables

In [35]:
# NOTE(review): this list repeats the `features` list defined before the train/test split;
# the two copies must be kept in sync if either one is edited.
features = ['Sex', 'Age', 'Current_smoker', 'Former_smoker',
       'Sedentary', 'Height', 'Weight', 'Cervical_perimeter',
       'Abdominal_perimeter', 'Systolic_BP', 'Diastolic_BP',
       'Maxillofacial_profile', 'BMI', 'High_BP', 'Asthma', 'Rhinitis', 'COPD',
       'Respiratory_fail', 'Myocardial_infarct', 'Coronary_fail',
       'Arrhythmias', 'Stroke', 'Heart_fail', 'Arteriopathy', 'Gastric_reflux',
       'Glaucoma', 'Diabetes', 'Hypercholesterolemia', 'Hypertriglyceridemia',
       'Hypo(er)thyroidism', 'Depression', 'Obesity', 'Dysmorphology',
       'Restless_Leg_Syndrome', 'Snoring', 'Diurnal_somnolence',
       'Driving_drowsiness', 'Morning_fatigue', 'Morning_headache',
       'Memory_problem', 'Nocturnal_perspiration',
       'Shortness_of_breath_on_exertion', 'Nocturia', 'Drowsiness_accident',
       'Near_miss_accident', 'Respiratory_arrest', 'Epworth_scale',
       'Pichots_scale', 'Depression_scale']

In [36]:
# Every feature not listed in `continuous` is treated as categorical; the order of `features` is kept.
categorical = list(filter(lambda col: col not in continuous, features))
categorical

['Sex',
 'Current_smoker',
 'Former_smoker',
 'Sedentary',
 'Maxillofacial_profile',
 'High_BP',
 'Asthma',
 'Rhinitis',
 'COPD',
 'Respiratory_fail',
 'Myocardial_infarct',
 'Coronary_fail',
 'Arrhythmias',
 'Stroke',
 'Heart_fail',
 'Arteriopathy',
 'Gastric_reflux',
 'Glaucoma',
 'Diabetes',
 'Hypercholesterolemia',
 'Hypertriglyceridemia',
 'Hypo(er)thyroidism',
 'Depression',
 'Obesity',
 'Dysmorphology',
 'Restless_Leg_Syndrome',
 'Snoring',
 'Diurnal_somnolence',
 'Driving_drowsiness',
 'Morning_fatigue',
 'Morning_headache',
 'Memory_problem',
 'Nocturnal_perspiration',
 'Shortness_of_breath_on_exertion',
 'Nocturia',
 'Drowsiness_accident',
 'Near_miss_accident',
 'Respiratory_arrest']

#### Function to extract p-values from chisq test

In [39]:
def chisq_pval(df, features=None, group_col="Predicted"):
    """Run a chi-square test of independence between each feature and a grouping column.

    For every feature a contingency table of feature value vs. ``group_col``
    is built with ``pd.crosstab`` and passed to ``chi2_contingency``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_col`` and every column named in ``features``.
    features : list of str, optional
        Columns to test. Defaults to the module-level ``categorical`` list.
    group_col : str, default "Predicted"
        Column cross-tabulated against each feature.

    Returns
    -------
    list of float
        One p-value per feature, in the order of ``features``. All entries
        are NaN when ``df`` has no rows, since no contingency table can be
        built.

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, so SciPy's continuity
    correction applies to tables with one degree of freedom.
    NOTE(review): the chi-square approximation is unreliable when expected
    cell counts are small; rare conditions in small subsets may need an
    exact test instead - confirm against the data.
    """
    if features is None:
        features = categorical
    if df.empty:
        # An empty table cannot be tested; report "undefined" for every feature.
        return [float("nan")] * len(features)

    p_vals = []
    for feature in features:
        table = pd.crosstab(df[feature], df[group_col], margins=False)
        # Index 1 of the result is the p-value; the other fields are not needed.
        p_vals.append(chi2_contingency(table)[1])
    return p_vals

In [40]:
# Chi-square p-values for the misclassified samples, one list per actual severity (0-3).
sev0_inc_x2_pvals, sev1_inc_x2_pvals, sev2_inc_x2_pvals, sev3_inc_x2_pvals = (
    chisq_pval(subset) for subset in (inc_sev0, inc_sev1, inc_sev2, inc_sev3)
)

In [41]:
sev0_inc_x2_pvals

[3.034612278301051e-14,
 0.20571924160456442,
 0.0015766416363303332,
 0.8062120725487937,
 0.614602481580189,
 2.176307694985892e-10,
 0.30373208194652346,
 0.04094691852760459,
 0.31261084542094203,
 0.5160933714037523,
 0.032123520383921515,
 0.0012236175138951278,
 0.8533184336472855,
 0.4643910382161768,
 0.2648812071192575,
 0.010397565690135089,
 0.7052128609289572,
 0.704754855049156,
 0.009011066186067566,
 1.8288355957146607e-08,
 0.19429593913005416,
 0.6963102125263008,
 0.20974425799738008,
 0.2061299670066291,
 0.7664654163224374,
 0.40423912968074893,
 0.01712424648704245,
 0.8115227339289305,
 0.0034661670929311797,
 0.13811315459082285,
 0.07779249987070008,
 0.4668273857933902,
 0.22567488134742036,
 0.9001434042269736,
 0.0463893788268225,
 0.6671215424083714,
 0.07233269358275518,
 2.6062573180469985e-05]

### Combining continuous and categorical for each severity

In [42]:
# Per severity: Kruskal-Wallis p-values first, followed by the chi-square p-values.
sev0_inc_pvals = [*sev0_inc_k_pvals, *sev0_inc_x2_pvals]
sev1_inc_pvals = [*sev1_inc_k_pvals, *sev1_inc_x2_pvals]
sev2_inc_pvals = [*sev2_inc_k_pvals, *sev2_inc_x2_pvals]
sev3_inc_pvals = [*sev3_inc_k_pvals, *sev3_inc_x2_pvals]

### Combining lists into df

In [43]:
# One column per actual severity. For example, sev0_inc_pvals covers samples whose
# actual severity is 0 and which were predicted incorrectly.
inc_sev_pvals = pd.DataFrame(
    {
        'sev0_inc_pvals': sev0_inc_pvals,
        'sev1_inc_pvals': sev1_inc_pvals,
        'sev2_inc_pvals': sev2_inc_pvals,
        'sev3_inc_pvals': sev3_inc_pvals,
    }
)

In [44]:
inc_sev_pvals.head()

Unnamed: 0,sev0_inc_pvals,sev1_inc_pvals,sev2_inc_pvals,sev3_inc_pvals
0,1.748165e-08,2.192539e-25,8.548509e-32,5.46729e-07
1,3.01628e-05,2.196219e-08,1.237035e-07,0.1059858
2,1.844836e-15,6.2339870000000005e-27,1.323538e-25,0.8954792
3,6.628862999999999e-30,1.164019e-46,1.557529e-46,0.02302166
4,3.4028370000000004e-22,3.786483e-40,2.020194e-41,0.9894847


In [45]:
# Label each row with its feature name. The order matches how the p-value lists
# were concatenated: continuous features first, then categorical.
rows = [*continuous, *categorical]
inc_sev_pvals.index = rows
inc_sev_pvals.head()

Unnamed: 0,sev0_inc_pvals,sev1_inc_pvals,sev2_inc_pvals,sev3_inc_pvals
Age,1.748165e-08,2.192539e-25,8.548509e-32,5.46729e-07
Height,3.01628e-05,2.196219e-08,1.237035e-07,0.1059858
Weight,1.844836e-15,6.2339870000000005e-27,1.323538e-25,0.8954792
Cervical_perimeter,6.628862999999999e-30,1.164019e-46,1.557529e-46,0.02302166
Abdominal_perimeter,3.4028370000000004e-22,3.786483e-40,2.020194e-41,0.9894847


## Applying association functions to correct predictions

In [46]:
# Keep only the correctly classified rows (Actual equals Predicted).
# A boolean mask is used rather than dropping by index label, so the result
# does not depend on the index labels being unique.
feat_sev_cor = feat_sev_all[feat_sev_all.Actual == feat_sev_all.Predicted]

In [47]:
feat_sev_cor.head()

Unnamed: 0,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,Systolic_BP,...,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Actual,Predicted
2239,2.0,34.863792,0.0,0.0,0.0,168.0,86.0,37.0,97.0,120.0,...,1.0,0.0,0.0,0.0,0.0,5.0,15.0,2.0,0,0
12811,2.0,62.433949,0.0,0.0,0.0,150.0,55.0,34.0,92.0,135.0,...,1.0,1.0,0.0,0.0,0.0,21.0,17.0,0.0,3,3
12503,1.0,43.84668,0.0,1.0,0.0,185.0,82.0,39.0,94.0,120.0,...,0.0,0.0,0.0,0.0,0.0,7.0,12.0,1.0,1,1
11816,1.0,58.154689,0.0,0.0,0.0,164.0,79.0,38.0,92.0,150.0,...,0.0,0.0,0.0,0.0,0.0,11.0,10.0,0.0,1,1
6096,1.0,50.080767,1.0,0.0,0.0,173.0,90.0,43.0,110.0,120.0,...,1.0,0.0,0.0,0.0,0.0,10.0,9.0,2.0,3,3


In [48]:
feat_sev_cor.shape

(2007, 51)

#### applying kruskal to correct severity df


In [49]:
# feat_sev_cor holds only rows where Predicted == Actual, so the "Predicted"
# grouping inside kruskall_pval compares the actual severity classes here.
sev_cor_k_pvals = kruskall_pval(feat_sev_cor)

In [50]:
sev_cor_k_pvals
# NOTE(review): an earlier note here asked why the Age result is 0.0. The stored output
# shows about 3.98e-80 for the first entry (Age), a very small p-value rather than
# exactly zero. Confirm whether that question is still open.

[3.9815836149045e-80,
 9.580151609118055e-28,
 9.285852305158433e-94,
 2.9575963855617865e-169,
 4.5033414035686724e-133,
 3.8043737607255617e-66,
 2.6163297682684056e-31,
 3.082616133281036e-74,
 0.09000224309128312,
 0.0001765798819432906,
 0.012625747104111849]

In [51]:
# Same Age test, grouped by the actual severity instead of the predicted one.
stats.kruskal(*(grp["Age"].values for _, grp in feat_sev_cor.groupby("Actual")))

KruskalResult(statistic=371.1205673663685, pvalue=3.9815836149045e-80)

#### applying chisq to correct severity df

In [52]:
# feat_sev_cor holds only rows where Predicted == Actual, so the "Predicted"
# column cross-tabulated inside chisq_pval equals the actual severity here.
sev_cor_x2_pvals = chisq_pval(feat_sev_cor)

In [53]:
sev_cor_x2_pvals

[1.51967221166611e-77,
 0.12193351031880435,
 1.169910421864382e-13,
 0.0018169104803985455,
 0.0028377562811755354,
 4.0590673574185076e-41,
 0.0003359544446534844,
 8.993464888194076e-05,
 9.361663271945843e-05,
 0.12707989916274923,
 0.0007368073134812016,
 1.4810666909615257e-06,
 0.000898502486779486,
 0.0054220859098905875,
 0.0034774873407779116,
 0.009893533307348351,
 6.130866560171082e-06,
 0.8447789454368115,
 3.4509738384859436e-14,
 6.114418382943307e-21,
 6.737495310837107e-06,
 0.37432576538548784,
 0.3480537908110146,
 0.05178599675455684,
 0.789505363625075,
 0.9795757369547972,
 9.926218097975187e-05,
 0.35462968581318244,
 3.7015583618124345e-06,
 1.6508079058574976e-05,
 3.4080829727631755e-19,
 0.00032025691639306304,
 2.0284911361479815e-06,
 0.1030733693696132,
 4.646817177697524e-12,
 0.07539633349244074,
 0.17569165800664355,
 2.21550394971003e-42]

#### combining correct severity pval lists

In [55]:
# Kruskal-Wallis p-values first, then chi-square, matching the row order continuous + categorical.
sev_cor_pvals = [*sev_cor_k_pvals, *sev_cor_x2_pvals]

#### adding sev_cor_pval to dataframe containing incorrect severity pvals

In [56]:
# Build a new frame with the extra column instead of aliasing `inc_sev_pvals`.
# Plain assignment would bind both names to the same object, so adding the
# column would also modify `inc_sev_pvals`.
all_sev_p_vals = inc_sev_pvals.assign(sev_cor_pval=sev_cor_pvals)

In [59]:
all_sev_p_vals.head()

Unnamed: 0,sev0_inc_pvals,sev1_inc_pvals,sev2_inc_pvals,sev3_inc_pvals,sev_cor_pval
Age,1.748165e-08,2.192539e-25,8.548509e-32,5.46729e-07,3.9815839999999996e-80
Height,3.01628e-05,2.196219e-08,1.237035e-07,0.1059858,9.580152e-28
Weight,1.844836e-15,6.2339870000000005e-27,1.323538e-25,0.8954792,9.285852e-94
Cervical_perimeter,6.628862999999999e-30,1.164019e-46,1.557529e-46,0.02302166,2.957596e-169
Abdominal_perimeter,3.4028370000000004e-22,3.786483e-40,2.020194e-41,0.9894847,4.5033409999999995e-133


In [60]:
#all_sev_p_vals.to_csv("all_sev_p_vals.csv")
#all_sev_p_vals = pd.read_csv("all_sev_p_vals.csv", index_col=0)

### Top 5 most important features for each category

In [61]:
all_sev_p_vals.nsmallest(5,'sev0_inc_pvals')

Unnamed: 0,sev0_inc_pvals,sev1_inc_pvals,sev2_inc_pvals,sev3_inc_pvals,sev_cor_pval
Cervical_perimeter,6.628862999999999e-30,1.164019e-46,1.557529e-46,0.023022,2.957596e-169
Abdominal_perimeter,3.4028370000000004e-22,3.786483e-40,2.020194e-41,0.989485,4.5033409999999995e-133
Weight,1.844836e-15,6.2339870000000005e-27,1.323538e-25,0.895479,9.285852e-94
Sex,3.034612e-14,7.158049e-22,1.404525e-17,0.053396,1.5196719999999999e-77
BMI,3.891852e-12,1.464682e-23,1.21427e-19,0.655409,3.082616e-74


In [63]:
all_sev_p_vals.nsmallest(5,'sev1_inc_pvals')

Unnamed: 0,sev0_inc_pvals,sev1_inc_pvals,sev2_inc_pvals,sev3_inc_pvals,sev_cor_pval
Cervical_perimeter,6.628862999999999e-30,1.164019e-46,1.557529e-46,0.02302166,2.957596e-169
Abdominal_perimeter,3.4028370000000004e-22,3.786483e-40,2.020194e-41,0.9894847,4.5033409999999995e-133
Weight,1.844836e-15,6.2339870000000005e-27,1.323538e-25,0.8954792,9.285852e-94
Age,1.748165e-08,2.192539e-25,8.548509e-32,5.46729e-07,3.9815839999999996e-80
BMI,3.891852e-12,1.464682e-23,1.21427e-19,0.6554091,3.082616e-74


In [64]:
all_sev_p_vals.nsmallest(5,'sev2_inc_pvals')

Unnamed: 0,sev0_inc_pvals,sev1_inc_pvals,sev2_inc_pvals,sev3_inc_pvals,sev_cor_pval
Cervical_perimeter,6.628862999999999e-30,1.164019e-46,1.557529e-46,0.02302166,2.957596e-169
Abdominal_perimeter,3.4028370000000004e-22,3.786483e-40,2.020194e-41,0.9894847,4.5033409999999995e-133
Age,1.748165e-08,2.192539e-25,8.548509e-32,5.46729e-07,3.9815839999999996e-80
Weight,1.844836e-15,6.2339870000000005e-27,1.323538e-25,0.8954792,9.285852e-94
Systolic_BP,2.231961e-08,5.679014e-19,1.118491e-23,4.961007e-08,3.8043740000000004e-66


In [62]:
all_sev_p_vals.nsmallest(5,'sev3_inc_pvals')

Unnamed: 0,sev0_inc_pvals,sev1_inc_pvals,sev2_inc_pvals,sev3_inc_pvals,sev_cor_pval
Systolic_BP,2.231961e-08,5.679014e-19,1.118491e-23,4.961007e-08,3.8043740000000004e-66
Age,1.748165e-08,2.192539e-25,8.548509e-32,5.46729e-07,3.9815839999999996e-80
Diastolic_BP,2.010907e-05,9.943009e-11,1.459947e-08,9.590243e-05,2.6163300000000003e-31
Hypercholesterolemia,1.828836e-08,5.179547e-07,9.364602e-07,0.00300466,6.114418e-21
Former_smoker,0.001576642,3.308032e-06,6.292396e-06,0.003703465,1.16991e-13


In [None]:
len(sev3_inc_pvals)

In [66]:
all_sev_p_vals.nsmallest(5,'sev_cor_pval')

Unnamed: 0,sev0_inc_pvals,sev1_inc_pvals,sev2_inc_pvals,sev3_inc_pvals,sev_cor_pval
Cervical_perimeter,6.628862999999999e-30,1.164019e-46,1.557529e-46,0.02302166,2.957596e-169
Abdominal_perimeter,3.4028370000000004e-22,3.786483e-40,2.020194e-41,0.9894847,4.5033409999999995e-133
Weight,1.844836e-15,6.2339870000000005e-27,1.323538e-25,0.8954792,9.285852e-94
Age,1.748165e-08,2.192539e-25,8.548509e-32,5.46729e-07,3.9815839999999996e-80
Sex,3.034612e-14,7.158049e-22,1.404525e-17,0.05339568,1.5196719999999999e-77
