In [1]:
# import libraries

# pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper fuctions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier 

# Import our Random Forest 
from sklearn.ensemble import RandomForestClassifier

# Import the trees from sklearn
from sklearn import tree

# Helper functions to visualize our trees
from sklearn.tree import plot_tree, export_text

In [2]:
df = pd.read_csv('Sleep_Data_Sampled.csv')
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,35,Doctor,6.65,7,50,7,Normal Weight,120/80,71,7100,Healthy
1,2,Male,42,Teacher,6.9,8,52,4,Normal,135/90,66,7000,Healthy
2,3,Male,34,Software Engineer,6.95,7,66,6,Overweight,126/83,74,6100,Healthy
3,4,Male,32,Doctor,6.9,6,52,7,Normal,120/80,71,6500,Healthy
4,5,Male,37,Lawyer,6.85,7,60,6,Normal,125/80,71,6500,Healthy


In [3]:
print(df.columns)

Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
       'Sleep Disorder'],
      dtype='object')


In [6]:
df.isnull().sum()

Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
dtype: int64

In [8]:
df = pd.get_dummies(df, columns=['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder'], drop_first=True)

In [9]:
print(df.columns)

Index(['Person ID', 'Age', 'Sleep Duration', 'Quality of Sleep',
       'Physical Activity Level', 'Stress Level', 'Blood Pressure',
       'Heart Rate', 'Daily Steps', 'Gender_Male', 'Occupation_Doctor',
       'Occupation_Engineer', 'Occupation_Lawyer', 'Occupation_Manager',
       'Occupation_Nurse', 'Occupation_Sales Representative',
       'Occupation_Salesperson', 'Occupation_Scientist',
       'Occupation_Software Engineer', 'Occupation_Teacher',
       'BMI Category_Normal Weight', 'BMI Category_Obese',
       'BMI Category_Overweight', 'Sleep Disorder_Insomnia',
       'Sleep Disorder_Sleep Apnea'],
      dtype='object')


In [13]:
# establish variables

independent_variables = ['Person ID', 'Age', 'Sleep Duration',
       'Physical Activity Level', 'Stress Level',
       'Heart Rate', 'Daily Steps', 'Gender_Male', 'Occupation_Doctor',
       'Occupation_Engineer', 'Occupation_Lawyer', 'Occupation_Manager',
       'Occupation_Nurse', 'Occupation_Sales Representative',
       'Occupation_Salesperson', 'Occupation_Scientist',
       'Occupation_Software Engineer', 'Occupation_Teacher',
       'BMI Category_Normal Weight', 'BMI Category_Obese',
       'BMI Category_Overweight', 'Sleep Disorder_Insomnia',
       'Sleep Disorder_Sleep Apnea']

dependent_variable = 'Quality of Sleep'

In [14]:
# define X and y
X = df[independent_variables]

y = df[dependent_variable]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=45)

In [15]:
# initialize an empty random forest model
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(oob_score=True)

model.fit(X_train, y_train)

In [16]:
# now lets evaluate our model
y_pred = model.predict(X_test)

# eval model 
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred, average='weighted')
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print('F1 Score: %f' % f1)

# # calculate predicted probabilities
# y_pred_proba = model.predict_proba(X_test)

# # keep only the proba for True
# y_pred_proba = y_pred_proba[:,1]

# # compute auc score
# auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
# print('AUC Score: %f' % auc)

# produce classification peport
print(classification_report(y_test, y_pred))

Accuracy Score: 0.994667
Precision Score: 0.994685
Recall Score: 0.994667
F1 Score: 0.994665
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         8
           5       1.00      0.98      0.99        65
           6       1.00      1.00      1.00      1042
           7       0.99      0.99      0.99       505
           8       0.99      1.00      0.99      1171
           9       1.00      1.00      1.00       209

    accuracy                           0.99      3000
   macro avg       1.00      0.99      1.00      3000
weighted avg       0.99      0.99      0.99      3000



In [17]:
print('oob accuracy', model.oob_score_)
print('oob error rate', 1 - model.oob_score_)

oob accuracy 0.99475
oob error rate 0.005249999999999977


In [18]:
print(np.sqrt(64))
print(np.log2(64))

8.0
6.0


In [19]:
feature_imp = pd.Series(model.feature_importances_,index=independent_variables).sort_values(ascending=False)
feature_imp

Sleep Duration                     0.271844
Stress Level                       0.175918
Age                                0.115467
Heart Rate                         0.098933
Physical Activity Level            0.077621
Daily Steps                        0.074700
Person ID                          0.048517
Sleep Disorder_Insomnia            0.037678
Occupation_Salesperson             0.018701
BMI Category_Overweight            0.012831
Sleep Disorder_Sleep Apnea         0.011826
Gender_Male                        0.010124
Occupation_Nurse                   0.008398
Occupation_Teacher                 0.008201
Occupation_Doctor                  0.006762
Occupation_Engineer                0.006549
Occupation_Lawyer                  0.004576
BMI Category_Obese                 0.003908
BMI Category_Normal Weight         0.002582
Occupation_Scientist               0.002295
Occupation_Sales Representative    0.001816
Occupation_Software Engineer       0.000603
Occupation_Manager              