# Sleep Efficiency

## Importing required packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

## Getting datasets

In [10]:
df = pd.read_csv("data/proj80/Sleep_Efficiency.csv")
df.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,10,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,24,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,28,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0


## Cleaning dataset

In [11]:
df.isna().sum()

ID                         0
Age                        0
Gender                     0
Bedtime                    0
Wakeup time                0
Sleep duration             0
Sleep efficiency           0
REM sleep percentage       0
Deep sleep percentage      0
Light sleep percentage     0
Awakenings                20
Caffeine consumption      25
Alcohol consumption       16
Smoking status             0
Exercise frequency         6
dtype: int64

In [13]:
df["Awakenings"].fillna(np.mean(df["Awakenings"]), inplace=True)
df["Caffeine consumption"].fillna(np.mean(df["Caffeine consumption"]), inplace=True)
df["Alcohol consumption"].fillna(np.mean(df["Alcohol consumption"]), inplace=True)

In [14]:
df.isna().sum()

ID                        0
Age                       0
Gender                    0
Bedtime                   0
Wakeup time               0
Sleep duration            0
Sleep efficiency          0
REM sleep percentage      0
Deep sleep percentage     0
Light sleep percentage    0
Awakenings                0
Caffeine consumption      0
Alcohol consumption       0
Smoking status            0
Exercise frequency        6
dtype: int64

## Pre-Processing dataset

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      452 non-null    int64  
 1   Age                     452 non-null    int64  
 2   Gender                  452 non-null    object 
 3   Bedtime                 452 non-null    object 
 4   Wakeup time             452 non-null    object 
 5   Sleep duration          452 non-null    float64
 6   Sleep efficiency        452 non-null    float64
 7   REM sleep percentage    452 non-null    int64  
 8   Deep sleep percentage   452 non-null    int64  
 9   Light sleep percentage  452 non-null    int64  
 10  Awakenings              452 non-null    float64
 11  Caffeine consumption    452 non-null    float64
 12  Alcohol consumption     452 non-null    float64
 13  Smoking status          452 non-null    object 
 14  Exercise frequency      446 non-null    fl

In [16]:
df.drop(["Bedtime", "Wakeup time"], axis=1, inplace=True)

In [17]:
labels = {}
for column in df.keys():
    if pd.api.types.is_object_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        labels[column] = le.classes_
        

In [18]:
df.head()

Unnamed: 0,ID,Age,Gender,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,0,6.0,0.88,18,70,10,0.0,0.0,0.0,1,3.0
1,2,69,1,7.0,0.66,24,28,53,3.0,0.0,3.0,1,3.0
2,3,40,0,8.0,0.89,20,70,10,1.0,0.0,0.0,0,3.0
3,4,40,0,6.0,0.51,28,25,52,3.0,50.0,5.0,1,1.0
4,5,57,1,8.0,0.76,27,55,18,3.0,0.0,3.0,0,3.0


In [25]:
df.drop("ID", inplace=True, axis=1)

X = df.drop("Sleep efficiency", axis=1)
y = df["Sleep efficiency"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     452 non-null    int64  
 1   Gender                  452 non-null    int64  
 2   Sleep duration          452 non-null    float64
 3   Sleep efficiency        452 non-null    float64
 4   REM sleep percentage    452 non-null    int64  
 5   Deep sleep percentage   452 non-null    int64  
 6   Light sleep percentage  452 non-null    int64  
 7   Awakenings              452 non-null    float64
 8   Caffeine consumption    452 non-null    float64
 9   Alcohol consumption     452 non-null    float64
 10  Smoking status          452 non-null    int64  
 11  Exercise frequency      446 non-null    float64
dtypes: float64(6), int64(6)
memory usage: 42.5 KB


In [28]:
labels

{'Gender': array(['Female', 'Male'], dtype=object),
 'Smoking status': array(['No', 'Yes'], dtype=object)}

## Training model

In [27]:
xreg = XGBRegressor()
xreg.fit(X_train.values, y_train.values)
xreg.score(X_test.values, y_test.values)

0.8526822137342804

## Prediction

In [31]:
xreg.predict([[18, 1, 6, 20, 60, 20, 3, 0.0, 0.0, 0, 2]])

array([0.8254941], dtype=float32)