In [95]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings('ignore')

In [96]:
df = pd.read_csv('./data/Sleep_Efficiency.csv')

In [97]:
df.head(10)

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,10,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,24,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,28,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0
5,6,36,Female,2021-07-01 21:00:00,2021-07-01 04:30:00,7.5,0.9,28,60,17,0.0,,0.0,No,1.0
6,7,27,Female,2021-07-21 21:00:00,2021-07-21 03:00:00,6.0,0.54,28,25,52,2.0,50.0,0.0,Yes,1.0
7,8,53,Male,2021-08-16 00:30:00,2021-08-16 10:30:00,10.0,0.9,28,57,20,0.0,50.0,0.0,Yes,3.0
8,9,41,Female,2021-04-05 02:30:00,2021-04-05 08:30:00,6.0,0.79,28,60,17,3.0,50.0,0.0,No,1.0
9,10,11,Female,2021-09-16 01:00:00,2021-09-16 10:00:00,9.0,0.55,18,35,45,4.0,0.0,3.0,Yes,0.0


In [98]:
df.drop(['ID', 'Bedtime', 'Wakeup time'], axis=1, inplace=True)

In [99]:
df.head(10)

Unnamed: 0,Age,Gender,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,65,Female,6.0,0.88,18,70,10,0.0,0.0,0.0,Yes,3.0
1,69,Male,7.0,0.66,24,28,53,3.0,0.0,3.0,Yes,3.0
2,40,Female,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,40,Female,6.0,0.51,28,25,52,3.0,50.0,5.0,Yes,1.0
4,57,Male,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0
5,36,Female,7.5,0.9,28,60,17,0.0,,0.0,No,1.0
6,27,Female,6.0,0.54,28,25,52,2.0,50.0,0.0,Yes,1.0
7,53,Male,10.0,0.9,28,57,20,0.0,50.0,0.0,Yes,3.0
8,41,Female,6.0,0.79,28,60,17,3.0,50.0,0.0,No,1.0
9,11,Female,9.0,0.55,18,35,45,4.0,0.0,3.0,Yes,0.0


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     452 non-null    int64  
 1   Gender                  452 non-null    object 
 2   Sleep duration          452 non-null    float64
 3   Sleep efficiency        452 non-null    float64
 4   REM sleep percentage    452 non-null    int64  
 5   Deep sleep percentage   452 non-null    int64  
 6   Light sleep percentage  452 non-null    int64  
 7   Awakenings              432 non-null    float64
 8   Caffeine consumption    427 non-null    float64
 9   Alcohol consumption     436 non-null    float64
 10  Smoking status          452 non-null    object 
 11  Exercise frequency      446 non-null    float64
dtypes: float64(6), int64(4), object(2)
memory usage: 42.5+ KB


In [101]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [102]:
X = df.drop('Sleep efficiency', axis=1)
y = df['Sleep efficiency']

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [104]:
num_features = X_train.select_dtypes(include='number').columns.tolist()
cat_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [110]:
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])


preprocessor = ColumnTransformer(transformers=[
    ('number',      num_pipe, num_features),
    ('category',    cat_pipe, cat_features)
])

In [114]:
from xgboost import XGBRegressor

In [115]:
preprocessor.fit_transform(X_train)

array([[-0.93870413,  0.66069529,  1.82364858, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.80654183,  0.08234753, -0.73081279, ...,  1.        ,
         0.        ,  1.        ],
       [-1.62162647,  0.08234753,  0.80186403, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.26182338, -0.49600023, -0.73081279, ...,  1.        ,
         0.        ,  1.        ],
       [-0.02814102, -0.49600023,  1.31275631, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.11006287,  0.66069529,  0.29097176, ...,  0.        ,
         1.        ,  0.        ]])

In [117]:
full_model_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    #('pca', PCA(n_components=5)),
    ('reg', XGBRegressor())
])

full_model_pipe.fit(X_train, y_train)

In [119]:
y_pred = full_model_pipe.predict(X_train)
full_model_pipe.score(X_test, y_test)

0.8676343342388083

In [120]:
from sklearn.metrics import mean_squared_error

In [122]:
mean_squared_error(y_train, y_pred)

4.789290381329873e-06