## Imputing missing values and seeing how it affects RF classifier accuracy

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the OSA table; the first CSV column is used as the row index.
# (Per the original note: this file was created from OFSP_English_Amonae_09May.pynb)
source_csv = "OSA_simple_nadrop.csv"
data = pd.read_csv(source_csv, index_col=0)
data.shape   # (n_rows, n_columns)

(80390, 51)

In [3]:
# Per-column share of missing entries, expressed as a percentage of all rows
data.isna().sum().div(len(data)).mul(100)

PatientID                           0.000000
Sex                                 0.297301
Age                                 0.000000
Current_smoker                      0.206493
Former_smoker                       0.201518
Sedentary                           0.339594
Height                              2.191815
Weight                              1.872123
Cervical_perimeter                 46.999627
Abdominal_perimeter                46.326658
Systolic_BP                        53.480532
Diastolic_BP                       53.767882
Maxillofacial_profile              15.643737
BMI                                 2.619729
High_BP                             0.482647
Asthma                              0.388108
Rhinitis                            0.327155
COPD                                0.664262
Respiratory_fail                    0.383132
Myocardial_infarct                  0.237592
Coronary_fail                       0.354522
Arrhythmias                         0.430402
Stroke    

In [3]:
# Every predictor column fed to the classifier (everything except PatientID and Severity).
features = [
    'Sex', 'Age', 'Current_smoker', 'Former_smoker', 'Sedentary',
    'Height', 'Weight', 'Cervical_perimeter', 'Abdominal_perimeter',
    'Systolic_BP', 'Diastolic_BP', 'Maxillofacial_profile', 'BMI',
    'High_BP', 'Asthma', 'Rhinitis', 'COPD', 'Respiratory_fail',
    'Myocardial_infarct', 'Coronary_fail', 'Arrhythmias', 'Stroke',
    'Heart_fail', 'Arteriopathy', 'Gastric_reflux', 'Glaucoma', 'Diabetes',
    'Hypercholesterolemia', 'Hypertriglyceridemia', 'Hypo(er)thyroidism',
    'Depression', 'Obesity', 'Dysmorphology', 'Restless_Leg_Syndrome',
    'Snoring', 'Diurnal_somnolence', 'Driving_drowsiness', 'Morning_fatigue',
    'Morning_headache', 'Memory_problem', 'Nocturnal_perspiration',
    'Shortness_of_breath_on_exertion', 'Nocturia', 'Drowsiness_accident',
    'Near_miss_accident', 'Respiratory_arrest',
    'Epworth_scale', 'Pichots_scale', 'Depression_scale',
]

# Columns imputed with a numeric statistic (mean / median) in the cells below.
continuous = [
    'Age', 'Height', 'Weight',
    'Cervical_perimeter', 'Abdominal_perimeter',
    'Systolic_BP', 'Diastolic_BP', 'BMI',
    'Epworth_scale', 'Pichots_scale', 'Depression_scale',
]

# Everything that is not continuous is handled as categorical (order of `features` is kept).
categorical = [name for name in features if name not in continuous]

## Simple imputer- sklearn

### Mean continuous/ 0 categorical

**Note:** `Sex` is coded on a 1–2 scale, so 0 is not a valid value for it; replacing missing values with 0 is therefore not appropriate for `Sex`.

In [5]:
# Work on a copy so `data` keeps its missing values for the other strategies
mean0 = data.copy()

# Continuous columns: NaN -> column mean
imp_cont = SimpleImputer(missing_values=np.nan, strategy='mean')
mean0[continuous] = imp_cont.fit_transform(mean0[continuous])

# Categorical columns: NaN -> constant 0
imp_cat = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
mean0[categorical] = imp_cat.fit_transform(mean0[categorical])

In [6]:
# Sanity check: total number of NaNs left anywhere in the imputed frame
mean0.isna().sum().sum()

0

In [7]:
mean0.head(5)   # preview the first five imputed rows

Unnamed: 0,PatientID,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,...,Nocturnal_perspiration,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Severity
0,1,2.0,35.167693,0.0,1.0,0.0,166.0,100.0,35.0,114.0,...,1.0,1.0,1.0,0.0,0.0,1.0,6.0,24.0,10.0,1
6,3,1.0,35.014374,0.0,0.0,0.0,178.0,96.0,44.0,109.0,...,1.0,0.0,1.0,0.0,0.0,0.0,9.0,2.0,1.0,3
7,4,1.0,48.80219,0.0,1.0,1.0,182.0,81.0,38.0,96.0,...,0.0,0.0,1.0,0.0,0.0,1.0,10.0,10.0,5.0,2
20,14,2.0,56.410678,0.0,0.0,0.0,165.0,88.0,38.0,110.0,...,1.0,0.0,1.0,0.0,0.0,0.0,4.0,15.0,6.0,3
21,15,1.0,71.600274,0.0,0.0,0.0,175.0,77.0,40.0,101.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,3.0,2.0,2


### Running random forest on mean0 dataframe

In [8]:
# Predictors and target taken from the mean / 0 imputed frame
X = mean0[features]
y = mean0['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

In [9]:
# 100-tree forest with balanced class weights and a fixed seed
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Train on the training split, then predict the held-out rows
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.4707675083965667


In [10]:
# Per-class precision / recall / F1 for the mean / 0 run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.43      0.29      0.35      2284
           1       0.34      0.25      0.28      3524
           2       0.27      0.09      0.14      3560
           3       0.53      0.85      0.65      6710

    accuracy                           0.47     16078
   macro avg       0.39      0.37      0.36     16078
weighted avg       0.42      0.47      0.41     16078



### Mean continuous/ most frequent categorical

In [11]:
# Work on a copy so `data` keeps its missing values for the other strategies
meanf = data.copy()

# Continuous columns: NaN -> column mean
imp_cont = SimpleImputer(missing_values=np.nan, strategy='mean')
meanf[continuous] = imp_cont.fit_transform(meanf[continuous])

# Categorical columns: NaN -> most frequent value of the column
imp_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
meanf[categorical] = imp_cat.fit_transform(meanf[categorical])

In [12]:
meanf['Systolic_BP'].isna().sum()   # spot-check one heavily missing column

0

### RF on meanf df

In [13]:
# Predictors and target taken from the mean / most-frequent imputed frame
X = meanf[features]
y = meanf['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.46964796616494586


In [14]:
# Per-class precision / recall / F1 for the mean / most-frequent run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.42      0.28      0.33      2284
           1       0.33      0.24      0.28      3524
           2       0.29      0.10      0.15      3560
           3       0.53      0.85      0.65      6710

    accuracy                           0.47     16078
   macro avg       0.39      0.37      0.35     16078
weighted avg       0.42      0.47      0.41     16078



### Median continuous/ 0 categorical

In [4]:
# Work on a copy so `data` keeps its missing values for the other strategies
median0 = data.copy()

# Continuous columns: NaN -> column median
imp_cont = SimpleImputer(missing_values=np.nan, strategy='median')
median0[continuous] = imp_cont.fit_transform(median0[continuous])

# Categorical columns: NaN -> constant 0
imp_cat = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
median0[categorical] = imp_cat.fit_transform(median0[categorical])

In [5]:
# Predictors and target taken from the median / 0 imputed frame
X = median0[features]
y = median0['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.47375295434755565


In [6]:
# Per-class precision / recall / F1 for the median / 0 run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.42      0.29      0.34      2284
           1       0.35      0.25      0.29      3524
           2       0.28      0.09      0.14      3560
           3       0.53      0.85      0.66      6710

    accuracy                           0.47     16078
   macro avg       0.40      0.37      0.36     16078
weighted avg       0.42      0.47      0.42     16078



### Median continuous/ most frequent categorical

In [15]:
# Work on a copy so `data` keeps its missing values for the other strategies
medianf = data.copy()

# Continuous columns: NaN -> column median
imp_cont = SimpleImputer(missing_values=np.nan, strategy='median')
medianf[continuous] = imp_cont.fit_transform(medianf[continuous])

# Categorical columns: NaN -> most frequent value of the column
imp_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
medianf[categorical] = imp_cat.fit_transform(medianf[categorical])

In [16]:
# Predictors and target taken from the median / most-frequent imputed frame
X = medianf[features]
y = medianf['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.4705809180246299


In [9]:
# Per-class precision / recall / F1 for the median / most-frequent run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.42      0.28      0.34      2284
           1       0.33      0.24      0.28      3524
           2       0.29      0.09      0.14      3560
           3       0.53      0.85      0.65      6710

    accuracy                           0.47     16078
   macro avg       0.39      0.37      0.35     16078
weighted avg       0.42      0.47      0.41     16078



In [18]:
# Feature importances of the forest currently bound to `clf` (the medianf run), largest first
importances = pd.Series(clf.feature_importances_, index=features)
importances.sort_values(ascending=False)

Age                                0.114936
BMI                                0.093298
Weight                             0.078053
Height                             0.070469
Epworth_scale                      0.064271
Pichots_scale                      0.059850
Depression_scale                   0.048016
Abdominal_perimeter                0.047620
Cervical_perimeter                 0.039822
Systolic_BP                        0.032283
Diastolic_BP                       0.029290
Shortness_of_breath_on_exertion    0.015642
Morning_headache                   0.014780
Morning_fatigue                    0.014617
Respiratory_arrest                 0.014344
Nocturnal_perspiration             0.013764
Former_smoker                      0.013321
Nocturia                           0.013053
Driving_drowsiness                 0.012907
High_BP                            0.012701
Current_smoker                     0.012199
Sex                                0.012162
Diurnal_somnolence              

## Multivariate feature imputation

#### A note: 
    from- https://scikit-learn.org/stable/modules/impute.html
    A more sophisticated approach is to use the IterativeImputer class, which models each feature with missing values as 
    a function of other features, and uses that estimate for imputation. It does so in an iterated round-robin fashion: 
        at each step, a feature column is designated as output y and the other feature columns are treated as inputs X. 
        A regressor is fit on (X, y) for known y. Then, the regressor is used to predict the missing values of y. 
        This is done for each feature in an iterative fashion, and then is repeated for max_iter imputation rounds. 
        The results of the final imputation round are returned.

### Bayesian regression (note: this is inappropriate here because it treats all variables as numeric)

In [19]:
# Round-robin imputation using IterativeImputer's default estimator (Bayesian regression).
# NOTE(review): the imputer is fit on every column of `data` -- PatientID and the
# Severity label included -- and on all rows before the train/test split; confirm this
# is acceptable for the accuracy comparison that follows.
mult = data.copy()
imp_mult = IterativeImputer(max_iter=10, random_state=0)
imp_mult.fit(mult)                  # learn the per-feature regressors
mult = imp_mult.transform(mult)     # result is a NumPy array, no longer a DataFrame



In [20]:
# Re-wrap the imputed array in a DataFrame (column labels borrowed from `data`) and save it
mult_df = pd.DataFrame(data=mult, columns=data.columns)
mult_df.to_csv("mult_df.csv")

In [21]:
mult_df.head(5)   # preview the first five imputed rows

Unnamed: 0,PatientID,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,...,Nocturnal_perspiration,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Severity
0,1.0,2.0,35.167693,0.0,1.0,0.0,166.0,100.0,35.0,114.0,...,1.0,1.0,1.0,0.0,0.0,1.0,6.0,24.0,10.0,1.0
1,3.0,1.0,35.014374,0.0,0.0,0.0,178.0,96.0,44.0,109.0,...,1.0,0.0,1.0,0.0,0.0,0.0,9.0,2.0,1.0,3.0
2,4.0,1.0,48.80219,0.0,1.0,1.0,182.0,81.0,38.0,96.0,...,0.0,0.0,1.0,0.0,0.0,1.0,10.0,10.0,5.0,2.0
3,14.0,2.0,56.410678,0.0,0.0,0.0,165.0,88.0,38.0,110.0,...,1.0,0.0,1.0,0.0,0.0,0.0,4.0,15.0,6.0,3.0
4,15.0,1.0,71.600274,0.0,0.0,0.0,175.0,77.0,40.0,101.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,3.0,2.0,2.0


In [22]:
mult_df.isna().sum().sum()   # total NaNs remaining in the imputed frame

0

In [23]:
# Predictors and target taken from the Bayesian iterative-imputed frame
X = mult_df[features]
y = mult_df['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5279885557905212


In [24]:
# Per-class precision / recall / F1 for the Bayesian iterative-imputation run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.53      0.39      0.45      2284
         1.0       0.40      0.35      0.38      3524
         2.0       0.36      0.13      0.19      3560
         3.0       0.59      0.88      0.70      6710

    accuracy                           0.53     16078
   macro avg       0.47      0.44      0.43     16078
weighted avg       0.49      0.53      0.48     16078



### RandomForest Regressor 

In [None]:
# Iterative imputation with a random-forest regressor as the per-feature estimator
# (this overrides IterativeImputer's default Bayesian regression).
# NOTE(review): the imputer is fit on every column of `data` (PatientID and the Severity
# label included) and on all rows before the train/test split -- confirm this is intended.
mult_rf = data.copy(deep=True)                          # copy original df
imp_mult_rf = IterativeImputer(max_iter=10, random_state=0, estimator=RandomForestRegressor())
imp_mult_rf = imp_mult_rf.fit(mult_rf)                  # fit the RF-based imputer
# Fix: transform with the imputer fitted on the line above. The original called
# `imp_mult.transform(...)`, i.e. the Bayesian imputer from the previous section, so the
# random-forest imputer was never actually applied.
mult_rf = imp_mult_rf.transform(mult_rf)                # apply to data (returns a NumPy array)
mult_rf = pd.DataFrame(mult_rf, columns=data.columns)   # back to a labelled DataFrame

In [None]:
# Save first, then end the cell with the NaN count: a notebook only displays the value of
# a cell's last expression, so in the original order the count was computed but never shown.
mult_rf.to_csv("mult_rf.csv")
mult_rf.isnull().sum().sum() # how many missing values in df

In [None]:
#mult_rf.to_csv("mult_rf.csv")

In [None]:
# Predictors and target taken from the random-forest iterative-imputed frame
X = mult_rf[features]
y = mult_rf['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Per-class precision / recall / F1 for the random-forest iterative-imputation run
print(classification_report(y_true=y_test, y_pred=y_pred))

### KNN Regressor

In [27]:
# Iterative imputation with a k-nearest-neighbours regressor as the per-feature estimator
# (this overrides IterativeImputer's default Bayesian regression).
# NOTE(review): the imputer is fit on every column of `data` (PatientID and the Severity
# label included) and on all rows before the train/test split -- confirm this is intended.
mult_kn = data.copy()
imp_mult_kn = IterativeImputer(estimator=KNeighborsRegressor(), max_iter=10, random_state=0)
imp_mult_kn.fit(mult_kn)                    # learn the per-feature regressors
mult_kn = pd.DataFrame(
    imp_mult_kn.transform(mult_kn),         # imputed values come back as a NumPy array
    columns=data.columns,
)

mult_kn.to_csv("mult_kn.csv")

In [None]:
#mult_kn.to_csv("mult_kn.csv")

In [28]:
mult_kn.isna().sum().sum()   # total NaNs remaining in the imputed frame

0

In [29]:
# Predictors and target taken from the KNN-regressor iterative-imputed frame
X = mult_kn[features]
y = mult_kn['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.4692125886304267


In [30]:
# Per-class precision / recall / F1 for the KNN-regressor iterative-imputation run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.42      0.28      0.34      2284
         1.0       0.33      0.23      0.27      3524
         2.0       0.27      0.07      0.12      3560
         3.0       0.53      0.87      0.65      6710

    accuracy                           0.47     16078
   macro avg       0.39      0.36      0.35     16078
weighted avg       0.41      0.47      0.41     16078



## Nearest neighbors imputation

The KNNImputer class provides imputation for filling in missing values using the k-Nearest Neighbors approach. By default, a euclidean distance metric that supports missing values, nan_euclidean_distances, is used to find the nearest neighbors. Each missing feature is imputed using values from n_neighbors nearest neighbors that have a value for the feature. The feature of the neighbors are averaged uniformly or weighted by distance to each neighbor. If a sample has more than one feature missing, then the neighbors for that sample can be different depending on the particular feature being imputed. When the number of available neighbors is less than n_neighbors and there are no defined distances to the training set, the training set average for that feature is used during imputation. If there is at least one neighbor with a defined distance, the weighted or unweighted average of the remaining neighbors will be used during imputation. If a feature is always missing in training, it is removed during transform

### k = 2

In [5]:
# k-nearest-neighbours imputation: each NaN becomes the unweighted mean of its 2 neighbours.
# NOTE(review): the imputer is fit on every column of `data` (PatientID and the Severity
# label included) and on all rows before the train/test split -- confirm this is intended.
knn2 = data.copy()
imp_knn2 = KNNImputer(n_neighbors=2, weights="uniform")
knn2 = pd.DataFrame(
    imp_knn2.fit_transform(knn2),   # imputed values come back as a NumPy array
    columns=data.columns,
)

knn2.to_csv("knn2.csv")

In [4]:
#knn2.to_csv("knn2.csv")

In [6]:
knn2.isna().sum().sum()   # total NaNs remaining in the imputed frame

0

In [7]:
# Predictors and target taken from the k=2 KNN-imputed frame
X = knn2[features]
y = knn2['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.4684040303520338


In [8]:
# Per-class precision / recall / F1 for the k=2 KNN-imputation run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.42      0.29      0.34      2284
         1.0       0.33      0.23      0.27      3524
         2.0       0.28      0.08      0.12      3560
         3.0       0.52      0.86      0.65      6710

    accuracy                           0.47     16078
   macro avg       0.39      0.36      0.35     16078
weighted avg       0.41      0.47      0.41     16078



### k = 5

In [11]:
# k-nearest-neighbours imputation: each NaN becomes the unweighted mean of its 5 neighbours.
# NOTE(review): the imputer is fit on every column of `data` (PatientID and the Severity
# label included) and on all rows before the train/test split -- confirm this is intended.
knn5 = data.copy()
imp_knn5 = KNNImputer(n_neighbors=5, weights="uniform")
knn5 = pd.DataFrame(
    imp_knn5.fit_transform(knn5),   # imputed values come back as a NumPy array
    columns=data.columns,
)

In [12]:
# Persist the k=5 imputed frame so the imputation does not have to be recomputed
knn5.to_csv(path_or_buf="knn5.csv")

In [None]:
knn5.isna().sum().sum()   # total NaNs remaining in the imputed frame

In [13]:
# Predictors and target taken from the k=5 KNN-imputed frame
X = knn5[features]
y = knn5['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.47145167309366837


In [14]:
# Per-class precision / recall / F1 for the k=5 KNN-imputation run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.42      0.28      0.33      2284
         1.0       0.33      0.23      0.27      3524
         2.0       0.29      0.08      0.12      3560
         3.0       0.53      0.87      0.66      6710

    accuracy                           0.47     16078
   macro avg       0.39      0.37      0.35     16078
weighted avg       0.42      0.47      0.41     16078



### k = 10

In [9]:
# k-nearest-neighbours imputation: each NaN becomes the unweighted mean of its 10 neighbours.
# NOTE(review): the imputer is fit on every column of `data` (PatientID and the Severity
# label included) and on all rows before the train/test split -- confirm this is intended.
knn10 = data.copy()
imp_knn10 = KNNImputer(n_neighbors=10, weights="uniform")
knn10 = pd.DataFrame(
    imp_knn10.fit_transform(knn10),   # imputed values come back as a NumPy array
    columns=data.columns,
)

knn10.to_csv("knn10.csv")

In [None]:
#knn7.to_csv("knn7.csv")

In [10]:
knn10.isna().sum().sum()   # total NaNs remaining in the imputed frame

0

In [11]:
# Predictors and target taken from the k=10 KNN-imputed frame
X = knn10[features]
y = knn10['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.4689638014678443


In [12]:
# Per-class precision / recall / F1 for the k=10 KNN-imputation run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.42      0.28      0.34      2284
         1.0       0.33      0.23      0.27      3524
         2.0       0.27      0.08      0.12      3560
         3.0       0.52      0.87      0.65      6710

    accuracy                           0.47     16078
   macro avg       0.39      0.36      0.35     16078
weighted avg       0.41      0.47      0.41     16078



### Importing data created using MICE in R, with PMM (predictive mean matching) for continuous and logreg for categorical variables; max iterations = 5, 5 imputations

In [15]:
# Load the MICE-imputed table exported from R; the first CSV column is used as the row index
mice_csv = "MICE_m5_it5.csv"
mice = pd.read_csv(mice_csv, index_col=0)

In [16]:
mice.head(5)   # preview the first five rows of the MICE-imputed data

Unnamed: 0,PatientID,Sex,Age,Current_smoker,Former_smoker,Sedentary,Height,Weight,Cervical_perimeter,Abdominal_perimeter,...,Nocturnal_perspiration,Shortness_of_breath_on_exertion,Nocturia,Drowsiness_accident,Near_miss_accident,Respiratory_arrest,Epworth_scale,Pichots_scale,Depression_scale,Severity
1,1,2,35.167693,0,1,0,166,100.0,35.0,114.0,...,1,1,1,0,0,1,6,24,10,1
2,3,1,35.014374,0,0,0,178,96.0,44.0,109.0,...,1,0,1,0,0,0,9,2,1,3
3,4,1,48.80219,0,1,1,182,81.0,38.0,96.0,...,0,0,1,0,0,1,10,10,5,2
4,14,2,56.410678,0,0,0,165,88.0,38.0,110.0,...,1,0,1,0,0,0,4,15,6,3
5,15,1,71.600274,0,0,0,175,77.0,40.0,101.0,...,0,0,0,0,0,0,9,3,2,2


In [17]:
mice.isna().sum().sum()   # total NaNs remaining in the MICE-imputed frame

0

In [22]:
# Column lists for the MICE file. They match the lists defined near the top of the notebook
# except that the thyroid column is spelled 'Hypo.er.thyroidism' instead of
# 'Hypo(er)thyroidism' (presumably the R export rewrote the parentheses -- verify against the CSV).
# NOTE: this rebinds the notebook-wide `features` / `continuous` / `categorical` names, so any
# earlier cell re-run after this one will look up the dotted spelling.
features = [
    'Sex', 'Age', 'Current_smoker', 'Former_smoker', 'Sedentary',
    'Height', 'Weight', 'Cervical_perimeter', 'Abdominal_perimeter',
    'Systolic_BP', 'Diastolic_BP', 'Maxillofacial_profile', 'BMI',
    'High_BP', 'Asthma', 'Rhinitis', 'COPD', 'Respiratory_fail',
    'Myocardial_infarct', 'Coronary_fail', 'Arrhythmias', 'Stroke',
    'Heart_fail', 'Arteriopathy', 'Gastric_reflux', 'Glaucoma', 'Diabetes',
    'Hypercholesterolemia', 'Hypertriglyceridemia', 'Hypo.er.thyroidism',
    'Depression', 'Obesity', 'Dysmorphology', 'Restless_Leg_Syndrome',
    'Snoring', 'Diurnal_somnolence', 'Driving_drowsiness', 'Morning_fatigue',
    'Morning_headache', 'Memory_problem', 'Nocturnal_perspiration',
    'Shortness_of_breath_on_exertion', 'Nocturia', 'Drowsiness_accident',
    'Near_miss_accident', 'Respiratory_arrest',
    'Epworth_scale', 'Pichots_scale', 'Depression_scale',
]

# Numeric measurement / score columns
continuous = [
    'Age', 'Height', 'Weight',
    'Cervical_perimeter', 'Abdominal_perimeter',
    'Systolic_BP', 'Diastolic_BP', 'BMI',
    'Epworth_scale', 'Pichots_scale', 'Depression_scale',
]

# Everything that is not continuous is handled as categorical (order of `features` is kept).
categorical = [name for name in features if name not in continuous]

In [23]:
# Predictors and target taken from the MICE-imputed frame
X = mice[features]
y = mice['Severity']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# 100-tree forest with balanced class weights; fit, then predict the held-out rows
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.47082970518721234


In [24]:
# Per-class precision / recall / F1 for the MICE-imputation run
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.43      0.29      0.34      2284
           1       0.34      0.24      0.28      3524
           2       0.30      0.08      0.13      3560
           3       0.52      0.86      0.65      6710

    accuracy                           0.47     16078
   macro avg       0.40      0.37      0.35     16078
weighted avg       0.42      0.47      0.41     16078

