In [41]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [42]:
df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
df.head(30)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
5,6,Male,28,Software Engineer,5.9,4,30,8,Obese,140/90,85,3000,Insomnia
6,7,Male,29,Teacher,6.3,6,40,7,Obese,140/90,82,3500,Insomnia
7,8,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,
8,9,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,
9,10,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,


In [43]:
# Display the first few rows of the dataset to understand its structure
print(df.head())

# Check for missing values and handle them if necessary
print(df.isnull().sum())

   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
0         126/83          77         4200            NaN  
1         125/80          75      

In [44]:
df.dropna(subset=['Sleep Disorder'], inplace=True)

# Check the shape of the dataset after dropping rows
print("Shape of the dataset after dropping rows with NaN in sleep disorder column:", df.shape)

Shape of the dataset after dropping rows with NaN in sleep disorder column: (155, 13)


In [45]:
print(df['Sleep Disorder'])

3      Sleep Apnea
4      Sleep Apnea
5         Insomnia
6         Insomnia
16     Sleep Apnea
          ...     
369    Sleep Apnea
370    Sleep Apnea
371    Sleep Apnea
372    Sleep Apnea
373    Sleep Apnea
Name: Sleep Disorder, Length: 155, dtype: object


In [46]:
df.isnull().sum()

Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Blood Pressure             0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
dtype: int64

In [47]:
df.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,244.76129,46.632258,6.812258,6.870968,60.896774,5.767742,71.787097,6765.806452
std,91.282997,7.839311,0.773534,1.337325,20.634158,1.946757,5.187381,1893.921881
min,4.0,28.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,199.5,43.0,6.3,6.0,45.0,4.0,68.0,6000.0
50%,255.0,45.0,6.5,7.0,45.0,7.0,72.0,6000.0
75%,304.5,51.0,7.4,8.0,75.0,7.0,75.0,7000.0
max,374.0,59.0,8.3,9.0,90.0,8.0,86.0,10000.0


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 155 entries, 3 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                155 non-null    int64  
 1   Gender                   155 non-null    object 
 2   Age                      155 non-null    int64  
 3   Occupation               155 non-null    object 
 4   Sleep Duration           155 non-null    float64
 5   Quality of Sleep         155 non-null    int64  
 6   Physical Activity Level  155 non-null    int64  
 7   Stress Level             155 non-null    int64  
 8   BMI Category             155 non-null    object 
 9   Blood Pressure           155 non-null    object 
 10  Heart Rate               155 non-null    int64  
 11  Daily Steps              155 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 17.0+ KB


In [49]:
categorical_cols = ['Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder']

print("Categorical Columns:")
for col in categorical_cols:
    print(col)

Categorical Columns:
Gender
Occupation
BMI Category
Blood Pressure
Sleep Disorder


In [50]:
# Separate features and target variable
X = df.drop(columns=['Person ID', 'Sleep Disorder'])
y = df['Sleep Disorder']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numerical and categorical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps for numerical and categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [51]:
print(preprocessor)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 Index(['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level',
       'Stress Level', 'Heart Rate', 'Daily Steps'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['Gender', 'Occupation', 'BMI Category', 'Blood Pressure'], dtype='object'))])


In [53]:
# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [54]:
# Initialize the Random Forest classifier
model = RandomForestClassifier(random_state=42)

# Train the model using the preprocessed training data
model.fit(X_train_preprocessed, y_train)

# Predict on the preprocessed test data
y_pred = model.predict(X_test_preprocessed)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8709677419354839


In [55]:

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred, pos_label='Insomnia')

# Calculate recall
recall = recall_score(y_test, y_pred, pos_label='Insomnia')

# Calculate F1-score
f1 = f1_score(y_test, y_pred, pos_label='Insomnia')

# Encode target labels for ROC-AUC score calculation
le = LabelEncoder()
y_test_encoded = le.fit_transform(y_test)
y_pred_encoded = le.transform(y_pred)

# Calculate ROC-AUC score
roc_auc = roc_auc_score(y_test_encoded, y_pred_encoded)

# Print the evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC-AUC score:", roc_auc)

Accuracy: 0.8709677419354839
Precision: 0.8571428571428571
Recall: 0.8571428571428571
F1-score: 0.8571428571428571
ROC-AUC score: 0.8697478991596639


In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the model
model = RandomForestClassifier()

# Define the parameters grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_preprocessed, y_train)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

Best parameters found:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 300}
Best accuracy found:  0.8876666666666667


# Conclusion
The evaluation of the model's performance on the test dataset yields promising results. The accuracy of the model stands at approximately 87%, indicating that the model correctly predicts the 'Sleep Disorder' for the majority of cases. Further evaluation metrics, including precision, recall, and F1-score, all hover around 85.7%, which suggests a balanced performance in identifying true positives and minimizing false positives and negatives for the 'Insomnia' class. The ROC-AUC score of approximately 0.87 signifies that the model has a strong ability to distinguish between the classes.

The hyperparameter tuning using GridSearchCV has identified the best parameters for the Random Forest model, which could further enhance its performance. Overall, these results demonstrate that the model is effective and reliable for predicting sleep disorders, particularly insomnia, making it a valuable tool for healthcare professionals. Future work may focus on exploring additional features, trying different algorithms, and further fine-tuning to achieve even better performance.