In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('/content/gdrive/MyDrive/Sleep_health_and_lifestyle_dataset.csv')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
data['Sleep Disorder'].fillna('None', inplace=True)
data[['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Heart Rate']] = imputer.fit_transform(data[['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Heart Rate']])

# One-hot encode categorical variables
data = pd.get_dummies(data, columns=['Gender', 'Occupation', 'BMI Category'], drop_first=True)

# Normalize numerical variables
scaler = StandardScaler()
data[['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Heart Rate']] = scaler.fit_transform(data[['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Heart Rate']])

# Splitting the data (assuming X contains features and y contains target variable)
X = data.drop('Sleep Disorder', axis=1)
y = data['Sleep Disorder']


In [4]:
# Splitting the 'Blood Pressure' column into 'Systolic' and 'Diastolic' components
data[['Systolic', 'Diastolic']] = data['Blood Pressure'].str.split('/', expand=True)

# Converting the new columns to numeric format
data['Systolic'] = pd.to_numeric(data['Systolic'])
data['Diastolic'] = pd.to_numeric(data['Diastolic'])

# Now, we drop the original 'Blood Pressure' column
data.drop('Blood Pressure', axis=1, inplace=True)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
import numpy as np

# Extracting the independent variables and dependent variable
X = data[['Gender_Male', 'Age', 'Occupation_Sales Representative', 'Occupation_Software Engineer', 'Occupation_Teacher', 'Physical Activity Level', 'Stress Level', 'BMI Category_Overweight', 'BMI Category_Obese', 'Systolic', 'Diastolic', 'Heart Rate']]
y = data['Daily Steps']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



regressors = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(objective='reg:squarederror'),
    "AdaBoost Regressor": AdaBoostRegressor(),
}

# Train each regressor and make predictions
y_preds = {}
for name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    y_preds[name] = regressor.predict(X_test)

# Compute mean of predictions from all regressors except CatBoost
all_preds = np.array(list(y_preds.values()))
y_pred_mean = np.mean(all_preds, axis=0)

# Round off the predicted 'Daily Steps' values to the nearest 100 factor
y_pred_rounded = np.round(y_pred_mean, -2)

# Evaluate the performance of the regressors
maes = {}
for name, y_pred in y_preds.items():
    mae = mean_absolute_error(y_test, y_pred)
    maes[name] = mae

# Print MAE for each regressor
for name, mae in maes.items():
    print(f"Mean Absolute Error for {name}: {mae}")


Mean Absolute Error for Linear Regression: 440.1070366393474
Mean Absolute Error for Lasso: 441.2399676235358
Mean Absolute Error for Ridge: 435.6088616725896
Mean Absolute Error for K-Neighbors Regressor: 124.53333333333333
Mean Absolute Error for Decision Tree: 26.666666666666668
Mean Absolute Error for Random Forest Regressor: 69.22666666666667
Mean Absolute Error for XGBRegressor: 43.9827734375
Mean Absolute Error for AdaBoost Regressor: 364.2874104840585


Decision Tree Produced the Least Mean Absolute Error

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# Define the Decision Tree regressor
regressor_dt = DecisionTreeRegressor(random_state=42)

# Define the parameter grid
params = {
    'criterion': ['friedman_mse', 'absolute_error', 'poisson', 'squared_error'],  # Criterion for splitting
    'splitter': ['best', 'random'],  # Split strategy
    'max_features': ['auto', 'sqrt', 'log2', None],  # Number of features to consider
}

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=regressor_dt, param_grid=params, scoring='neg_mean_absolute_error', cv=5, verbose=1)

# Perform GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Mean Absolute Error:", -grid_search.best_score_)

# Get the best model
best_dt = grid_search.best_estimator_

# Predict using the best model
y_pred = best_dt.predict(X_test)

# Calculate Mean Absolute Error on test set
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error on Test Set:", mae)

Fitting 5 folds for each of 32 candidates, totalling 160 fits




Best Parameters: {'criterion': 'absolute_error', 'max_features': 'sqrt', 'splitter': 'random'}
Best Mean Absolute Error: 82.56497175141244
Mean Absolute Error on Test Set: 62.666666666666664


HyperParameter tuning has increased MAE so we drop it

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Define the DecisionTreeRegressor
regressor_dt = DecisionTreeRegressor(random_state=42)

# Train the model
regressor_dt.fit(X_train, y_train)

# Make predictions on the test set
y_pred = regressor_dt.predict(X_test)

# Compute Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Print the Mean Absolute Error
print("Mean Absolute Error for DecisionTreeRegressor:", mae)

Mean Absolute Error for DecisionTreeRegressor: 6.666666666666667


In [41]:
import pandas as pd

# Take user input for each feature
gender = int(input("Enter 1 for Male, 0 for Female: "))
age = int(input("Enter Age: "))
sales_rep = int(input("Enter 1 if Sales Representative, 0 otherwise: "))
software_engineer = int(input("Enter 1 if Software Engineer, 0 otherwise: "))
teacher = int(input("Enter 1 if Teacher, 0 otherwise: "))
physical_activity_level = int(input("Enter Physical Activity Level: "))
stress_level = int(input("Enter Stress Level: "))
overweight = int(input("Enter 1 if Overweight, 0 otherwise: "))
obese = int(input("Enter 1 if Obese, 0 otherwise: "))
systolic_bp = int(input("Enter Systolic Blood Pressure: "))
diastolic_bp = int(input("Enter Diastolic Blood Pressure: "))
heart_rate = int(input("Enter Heart Rate: "))

# Create a DataFrame with user input
user_df = pd.DataFrame({
    'Gender_Male': [gender],
    'Age': [age],
    'Occupation_Sales Representative': [sales_rep],
    'Occupation_Software Engineer': [software_engineer],
    'Occupation_Teacher': [teacher],
    'Physical Activity Level': [physical_activity_level],
    'Stress Level': [stress_level],
    'BMI Category_Overweight': [overweight],
    'BMI Category_Obese': [obese],
    'Systolic': [systolic_bp],
    'Diastolic': [diastolic_bp],
    'Heart Rate': [heart_rate]
})

# Make prediction for Daily Steps using RandomForestRegressor
y_pred_rf = regressor_dt.predict(user_df)

# Round off the predicted 'Daily Steps' values to the nearest 100 factor
y_pred_rounded = np.round(y_pred_rf, -2)

# Print the predicted daily steps
print("\n\nPredicted Daily Steps Required (Rounded off to nearest 100):", int(y_pred_rounded))


Enter 1 for Male, 0 for Female: 1
Enter Age: 30
Enter 1 if Sales Representative, 0 otherwise: 0
Enter 1 if Software Engineer, 0 otherwise: 1
Enter 1 if Teacher, 0 otherwise: 0
Enter Physical Activity Level: 30
Enter Stress Level: 30
Enter 1 if Overweight, 0 otherwise: 1
Enter 1 if Obese, 0 otherwise: 0
Enter Systolic Blood Pressure: 160
Enter Diastolic Blood Pressure: 68
Enter Heart Rate: 70
Predicted Daily Steps Required (Rounded off to nearest 100): 10000


  print("Predicted Daily Steps Required (Rounded off to nearest 100):", int(y_pred_rounded))
