## This is to test how accurate a random forest (RF) is when using a dataset with no missing values

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the source table; the first CSV column becomes the row index.
data = pd.read_csv(
    filepath_or_buffer="OSA_simple_nadrop.csv",
    index_col=0,
)
data.shape  # (rows, columns), shown as the cell output

(80390, 51)

In [3]:
# Complete-case subset: keep only the rows in which no column is missing.
no_nan_rows = data.dropna(axis=0, how="any")

In [4]:
# (rows, columns) remaining once every row with a missing value is removed.
no_nan_rows.shape

(21818, 51)

In [6]:
# Persist the complete-case subset to disk. The row index is written too
# (pandas default), matching the index_col=0 layout used when loading.
no_nan_rows.to_csv(path_or_buf="OSA_complete_patients.csv")

In [5]:
# Predictor columns used to build the feature matrix. The order of this list
# determines the column order of X and the labels of the importance table.
features = [
    'Sex',
    'Age',
    'Current_smoker',
    'Former_smoker',
    'Sedentary',
    'Height',
    'Weight',
    'Cervical_perimeter',
    'Abdominal_perimeter',
    'Systolic_BP',
    'Diastolic_BP',
    'Maxillofacial_profile',
    'BMI',
    'High_BP',
    'Asthma',
    'Rhinitis',
    'COPD',
    'Respiratory_fail',
    'Myocardial_infarct',
    'Coronary_fail',
    'Arrhythmias',
    'Stroke',
    'Heart_fail',
    'Arteriopathy',
    'Gastric_reflux',
    'Glaucoma',
    'Diabetes',
    'Hypercholesterolemia',
    'Hypertriglyceridemia',
    'Hypo(er)thyroidism',
    'Depression',
    'Obesity',
    'Dysmorphology',
    'Restless_Leg_Syndrome',
    'Snoring',
    'Diurnal_somnolence',
    'Driving_drowsiness',
    'Morning_fatigue',
    'Morning_headache',
    'Memory_problem',
    'Nocturnal_perspiration',
    'Shortness_of_breath_on_exertion',
    'Nocturia',
    'Drowsiness_accident',
    'Near_miss_accident',
    'Respiratory_arrest',
    'Epworth_scale',
    'Pichots_scale',
    'Depression_scale',
]

In [7]:
# Feature matrix and label vector, both taken from the complete-case rows.
X = no_nan_rows.loc[:, features]
y = no_nan_rows.loc[:, 'Severity']

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [8]:
# Random forest of 100 trees. class_weight='balanced' weights each class
# inversely to its frequency in the training labels.
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

# Fit on the training partition, then predict the held-out partition.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.46241979835013747


In [9]:
# Per-class precision / recall / F1 for the most recent predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.43      0.23      0.30       572
           1       0.33      0.32      0.32      1007
           2       0.33      0.13      0.19      1029
           3       0.53      0.82      0.64      1756

    accuracy                           0.46      4364
   macro avg       0.41      0.37      0.36      4364
weighted avg       0.43      0.46      0.42      4364



In [10]:
# Label the fitted forest's importances with the feature names (same order as
# the columns of X) and show them from most to least important.
importances = pd.Series(clf.feature_importances_, index=features)
importances.sort_values(ascending=False)

Age                                0.095776
BMI                                0.075249
Abdominal_perimeter                0.071996
Weight                             0.065480
Height                             0.061235
Cervical_perimeter                 0.060640
Pichots_scale                      0.060355
Epworth_scale                      0.058949
Systolic_BP                        0.052736
Diastolic_BP                       0.048758
Depression_scale                   0.048103
Respiratory_arrest                 0.015716
Nocturnal_perspiration             0.013988
Nocturia                           0.013386
Driving_drowsiness                 0.013338
Sex                                0.012915
Shortness_of_breath_on_exertion    0.012871
Morning_headache                   0.012780
High_BP                            0.012263
Former_smoker                      0.011729
Obesity                            0.011702
Restless_Leg_Syndrome              0.010780
Morning_fatigue                 

#### Balanced subsample

In [10]:
# Same forest as before, except class_weight='balanced_subsample': class
# weights are recomputed from the bootstrap sample drawn for each tree.
# NOTE: this rebinds clf and y_pred, replacing the earlier model's results.
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced_subsample',
    random_state=42,
)

# Fit on the training partition, then predict the held-out partition.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.4598991750687443


In [11]:
# Per-class precision / recall / F1 for the most recent predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.45      0.26      0.33       572
           1       0.33      0.29      0.31      1007
           2       0.30      0.13      0.18      1029
           3       0.53      0.82      0.64      1756

    accuracy                           0.46      4364
   macro avg       0.40      0.37      0.36      4364
weighted avg       0.42      0.46      0.42      4364

