In [69]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Load the preprocessed dataset
data = pd.read_csv("dataset/Sleep_Efficiency.csv")

# Separate features and target variable
X = data.drop(columns=["Sleep efficiency"])
y = data["Sleep efficiency"]

In [2]:
data.dtypes

ID                          int64
Age                         int64
Gender                     object
Bedtime                    object
Wakeup time                object
Sleep duration            float64
Sleep efficiency          float64
REM sleep percentage        int64
Deep sleep percentage       int64
Light sleep percentage      int64
Awakenings                float64
Caffeine consumption      float64
Alcohol consumption       float64
Smoking status             object
Exercise frequency        float64
dtype: object

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      452 non-null    int64  
 1   Age                     452 non-null    int64  
 2   Gender                  452 non-null    object 
 3   Bedtime                 452 non-null    object 
 4   Wakeup time             452 non-null    object 
 5   Sleep duration          452 non-null    float64
 6   Sleep efficiency        452 non-null    float64
 7   REM sleep percentage    452 non-null    int64  
 8   Deep sleep percentage   452 non-null    int64  
 9   Light sleep percentage  452 non-null    int64  
 10  Awakenings              432 non-null    float64
 11  Caffeine consumption    427 non-null    float64
 12  Alcohol consumption     438 non-null    float64
 13  Smoking status          452 non-null    object 
 14  Exercise frequency      446 non-null    fl

In [4]:
data.shape

(452, 15)

In [5]:
data.describe(include="all")

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
count,452.0,452.0,452,452,452,452.0,452.0,452.0,452.0,452.0,432.0,427.0,438.0,452,446.0
unique,,,2,424,434,,,,,,,,,2,
top,,,Male,2021-03-11 01:00:00,2021-11-25 06:00:00,,,,,,,,,No,
freq,,,228,3,2,,,,,,,,,298,
mean,226.5,40.285398,,,,7.465708,0.788916,22.615044,52.823009,24.561947,1.641204,23.653396,1.173516,,1.79148
std,130.625419,13.17225,,,,0.866625,0.135237,3.525963,15.654235,15.313665,1.356762,30.202785,1.621377,,1.428134
min,1.0,9.0,,,,5.0,0.5,15.0,18.0,7.0,0.0,0.0,0.0,,0.0
25%,113.75,29.0,,,,7.0,0.6975,20.0,48.25,15.0,1.0,0.0,0.0,,0.0
50%,226.5,40.0,,,,7.5,0.82,22.0,58.0,18.0,1.0,25.0,0.0,,2.0
75%,339.25,52.0,,,,8.0,0.9,25.0,63.0,32.5,3.0,50.0,2.0,,3.0


In [73]:
# Assuming 'data' is your DataFrame containing the 'Gender' column

# Create a dictionary to map gender values
gender_mapping = {'Male': 1, 'Female': 0}

# Map the values in the 'Gender' column using the dictionary
data['Gender'] = data['Gender'].map(gender_mapping)


In [74]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline
# Handling missing values and scaling numerical features
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Encoding categorical features
categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Append preprocessing step to model
# For example, if you're using a linear regression model:
from sklearn.linear_model import LinearRegression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Now, you can fit the model to your training data
model.fit(X_train, y_train)

# Evaluate the model
# For example, if you're using a regression model:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.018636176631965887


In [7]:
# # Drop specified columns from features
# X = data.drop(columns=['Diabetes_012', 'Education', 'Income', 'DiffWalk'])

# # Separate the target variable
# y = data['Diabetes_012']

# # Now you can proceed with preprocessing the data, splitting it into training and testing sets, and further analysis.


In [79]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the training and testing data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Training the Random Forest regressor
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_preprocessed, y_train)

# Predicting on the test set
y_pred = rf_regressor.predict(X_test_preprocessed)

# Evaluating the performance of the regressor
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.002750724945054951


In [80]:
from sklearn.metrics import r2_score

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)


R-squared: 0.8522550609627164


In [41]:
data['Sleep duration']

# Print the head 10 values along with their IDs
print(data[['ID', 'Sleep duration']].head(10))

   ID  Sleep duration
0   1             6.0
1   2             7.0
2   3             8.0
3   4             6.0
4   5             8.0
5   6             7.5
6   7             6.0
7   8            10.0
8   9             6.0
9  10             9.0


In [45]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Assuming 'data' is your DataFrame
data = pd.read_csv("dataset/Sleep_Efficiency.csv")

# Convert 'Gender' column: Male -> 1, Female -> 0
label_encoder = LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])

# Convert 'Smoking status' column: Yes -> 1, No -> 0
data['Smoking status'] = label_encoder.fit_transform(data['Smoking status'])

# Convert 'Sleep duration' to float minutes
data['Sleep duration'] = data['Sleep duration'] * 60  # Assuming 'Sleep duration' is in hours

# Drop 'Bedtime' and 'Wakeup time' columns
data.drop(columns=['Bedtime', 'Wakeup time'], inplace=True)

# Define features (X) and target variable (y)
X = data.drop(columns=['Sleep efficiency'])
y = data['Sleep efficiency']


X.head(5)

Unnamed: 0,ID,Age,Gender,Sleep duration,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,0,360.0,18,70,12,0.0,0.0,0.0,1,3.0
1,2,69,1,420.0,19,28,53,3.0,0.0,3.0,1,3.0
2,3,40,0,480.0,20,70,10,1.0,0.0,0.0,0,3.0
3,4,40,0,360.0,23,25,52,3.0,50.0,5.0,1,1.0
4,5,57,1,480.0,27,55,18,3.0,0.0,3.0,0,3.0


In [47]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Separate features (X) and target variable (y)
X = data.drop(columns=['ID', 'Sleep efficiency']).values  # Convert DataFrame to NumPy array
y = data['Sleep efficiency'].values

# Define numerical and categorical columns
numerical_indices = [0, 3, 4, 5, 6, 7, 8, 9, 10]  # Indices of numerical columns in X
categorical_indices = [2, 11]  # Indices of categorical columns in X

# Define preprocessing steps for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Scale features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('onehot', OneHotEncoder())  # One-hot encode categorical variables
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_indices),
        ('cat', categorical_transformer, categorical_indices)
    ])

# Apply preprocessing to the entire dataset
X_processed = preprocessor.fit_transform(X)

# Print the processed feature matrix shape
print("Processed feature matrix shape:", X_processed.shape)
print(X_processed[:5])


ValueError: all features must be in [0, 10] or [-11, 0]

In [35]:
from sklearn.preprocessing import MinMaxScaler

# Assuming 'data' is your DataFrame

# Replace values below 0.9 with 0 and values over 0.9 with 1 in the 'Sleep efficiency' column
data.loc[data['Sleep efficiency'] < 0.9, 'Sleep efficiency'] = 0
data.loc[data['Sleep efficiency'] >= 0.9, 'Sleep efficiency'] = 1

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Reshape the target variable y to ensure it's a 2D array
y_reshaped = data['Sleep efficiency'].values.reshape(-1, 1)

# Apply MinMaxScaler to y
y_scaled = scaler.fit_transform(y_reshaped)

# Convert the scaled y back to a 1D array
y = y_scaled.flatten()

In [16]:
print(y[:12])  # Print the first 5 elements of y


[0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1.]


In [17]:


# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluating the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7582417582417582


In [27]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluating the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("class rep :", classification_report(y_test, y_pred))

Accuracy: 0.7362637362637363
class rep :               precision    recall  f1-score   support

         0.0       0.78      0.84      0.81        61
         1.0       0.62      0.53      0.57        30

    accuracy                           0.74        91
   macro avg       0.70      0.68      0.69        91
weighted avg       0.73      0.74      0.73        91



In [5]:
from sklearn.model_selection import GridSearchCV

# Define parameter grids for Random Forest and Decision Tree classifiers
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate Random Forest and Decision Tree classifiers
rf_classifier = RandomForestClassifier(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Instantiate GridSearchCV for Random Forest and Decision Tree classifiers
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid_search = GridSearchCV(dt_classifier, dt_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the models
rf_grid_search.fit(X_train, y_train)
dt_grid_search.fit(X_train, y_train)

# Best parameters for Random Forest
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Predicting on the test set using best estimators from grid search
rf_y_pred = rf_grid_search.predict(X_test)
dt_y_pred = dt_grid_search.predict(X_test)

# Evaluate Random Forest Classifier
print("\nRandom Forest Classifier:")
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", rf_accuracy)
print("Classification Report:\n", classification_report(y_test, rf_y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))


Best parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}

Random Forest Classifier:
Accuracy: 0.7227722772277227
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.88      0.81       140
           1       0.57      0.37      0.45        62

    accuracy                           0.72       202
   macro avg       0.67      0.62      0.63       202
weighted avg       0.70      0.72      0.70       202


Decision Tree Classifier:
Accuracy: 0.6584158415841584
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.74      0.75       140
           1       0.45      0.48      0.47        62

    accuracy                           0.66       202
   macro avg       0.61      0.61      0.61       202
weighted avg       

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Define parameter grid for XGBoost classifier
xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Instantiate GridSearchCV for XGBoost classifier
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
xgb_grid_search.fit(X_train, y_train_encoded)

# Best parameters for XGBoost
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

# Predicting on the test set using best estimator from grid search
xgb_y_pred = xgb_grid_search.predict(X_test)

# Evaluate XGBoost Classifier
print("\nXGBoost Classifier:")
xgb_accuracy = accuracy_score(y_test_encoded, xgb_y_pred)
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", classification_report(y_test_encoded, xgb_y_pred))


Best parameters for XGBoost: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'subsample': 0.6}

XGBoost Classifier:
Accuracy: 0.7128712871287128
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81       140
           1       0.56      0.32      0.41        62

    accuracy                           0.71       202
   macro avg       0.65      0.60      0.61       202
weighted avg       0.69      0.71      0.69       202



In [7]:
# Initialize LightGBM Classifier
lgb_classifier = LGBMClassifier(random_state=42)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize LightGBM Classifier
best_lgb_classifier = LGBMClassifier(num_leaves=50, max_depth=7, learning_rate=0.05, n_estimators=300, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Perform GridSearchCV for LightGBM
grid_search_lgb = GridSearchCV(lgb_classifier, param_grid_lgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lgb.fit(X_train_scaled, y_train)

# Get the best parameters for LightGBM
best_params_lgb = grid_search_lgb.best_params_
# print("Best Parameters for LightGBM:", best_params_lgb)

# Train the LightGBM classifier with the best parameters
best_lgb_classifier = LGBMClassifier(**best_params_lgb, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Predictions for the best LightGBM Classifier
best_lgb_y_pred = best_lgb_classifier.predict(X_test_scaled)

# Evaluate the best LightGBM Classifier
print("Best LightGBM Classifier:")
print("Accuracy:", accuracy_score(y_test, best_lgb_y_pred))
print("Classification Report:\n", classification_report(y_test, best_lgb_y_pred))


[LightGBM] [Info] Number of positive: 246, number of negative: 559
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 344
[LightGBM] [Info] Number of data points in the train set: 805, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.305590 -> initscore=-0.820818
[LightGBM] [Info] Start training from score -0.820818
[LightGBM] [Info] Number of positive: 246, number of negative: 559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 344
[LightGBM] [Info] Number of data points in the train set: 805, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.305590 -> initscore=-0.820818
[LightGBM] [Info] 