## Check if the virtual environment is activated

In [1]:
import sys

# Print the location of the current Python interpreter
print(f"Python interpreter location: {sys.executable}")

Python interpreter location: C:\Users\mjb19\coding_projects\university_student_sleep_predictor\.venv\Scripts\python.exe


## Import the Required Libraries

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import warnings
warnings.filterwarnings('ignore')

import re

## Import the dataset

In [3]:
dataset_path = Path.cwd().parent / "dataset" / "student_sleep_patterns.csv"
dataset_path

WindowsPath('C:/Users/mjb19/coding_projects/university_student_sleep_predictor/dataset/student_sleep_patterns.csv')

In [4]:
# Load the dataset
df = pd.read_csv(dataset_path)
df

Unnamed: 0,Student_ID,Age,Gender,University_Year,Sleep_Duration,Study_Hours,Screen_Time,Caffeine_Intake,Physical_Activity,Sleep_Quality,Weekday_Sleep_Start,Weekend_Sleep_Start,Weekday_Sleep_End,Weekend_Sleep_End
0,1,24,Other,2nd Year,7.7,7.9,3.4,2,37,10,14.16,4.05,7.41,7.06
1,2,21,Male,1st Year,6.3,6.0,1.9,5,74,2,8.73,7.10,8.21,10.21
2,3,22,Male,4th Year,5.1,6.7,3.9,5,53,5,20.00,20.47,6.88,10.92
3,4,24,Other,4th Year,6.3,8.6,2.8,4,55,9,19.82,4.08,6.69,9.42
4,5,20,Male,4th Year,4.7,2.7,2.7,0,85,3,20.98,6.12,8.98,9.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,24,Male,2nd Year,5.1,9.3,1.9,4,110,4,17.42,8.43,6.93,10.78
496,497,20,Male,2nd Year,8.9,7.7,3.5,3,40,4,1.22,15.54,5.85,7.23
497,498,21,Male,3rd Year,5.7,6.4,3.9,1,68,10,9.94,2.25,5.46,10.72
498,499,18,Female,2nd Year,4.9,0.5,3.5,0,12,2,19.10,15.49,8.35,7.20


## Get info about the dataset

In [5]:
print("Dataset Information:\n")
print(df.info())
print("\nSummary Statistics:\n")
print(df.describe())
print("\nChecking for Missing Values:\n")
print(df.isnull().sum())

Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Student_ID           500 non-null    int64  
 1   Age                  500 non-null    int64  
 2   Gender               500 non-null    object 
 3   University_Year      500 non-null    object 
 4   Sleep_Duration       500 non-null    float64
 5   Study_Hours          500 non-null    float64
 6   Screen_Time          500 non-null    float64
 7   Caffeine_Intake      500 non-null    int64  
 8   Physical_Activity    500 non-null    int64  
 9   Sleep_Quality        500 non-null    int64  
 10  Weekday_Sleep_Start  500 non-null    float64
 11  Weekend_Sleep_Start  500 non-null    float64
 12  Weekday_Sleep_End    500 non-null    float64
 13  Weekend_Sleep_End    500 non-null    float64
dtypes: float64(7), int64(5), object(2)
memory usage: 54.8+ KB
None

Summ

## Convert "Gender" and "University_Year" into numerical features

In [6]:
# Ordinal Encoding for University_Year
ordinal_encoder = OrdinalEncoder(categories=[["1st Year", "2nd Year", "3rd Year", "4th Year"]])
df["University_Year"] = ordinal_encoder.fit_transform(df[["University_Year"]]).astype(np.int16)

# One-Hot Encoding for Gender
df = pd.get_dummies(df, columns=["Gender"], drop_first=True)

print(df)

     Student_ID  Age  University_Year  Sleep_Duration  Study_Hours  \
0             1   24                1             7.7          7.9   
1             2   21                0             6.3          6.0   
2             3   22                3             5.1          6.7   
3             4   24                3             6.3          8.6   
4             5   20                3             4.7          2.7   
..          ...  ...              ...             ...          ...   
495         496   24                1             5.1          9.3   
496         497   20                1             8.9          7.7   
497         498   21                2             5.7          6.4   
498         499   18                1             4.9          0.5   
499         500   21                2             7.9         11.6   

     Screen_Time  Caffeine_Intake  Physical_Activity  Sleep_Quality  \
0            3.4                2                 37             10   
1            1.9 

## Visualize the dataset

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['Sleep_Quality'], kde=True, bins=10, color='skyblue')
plt.title('Distribution of Sleep Quality')
plt.xlabel('Sleep Quality')
plt.ylabel('Frequency')
plt.show()

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
sns.pairplot(df[['Sleep_Duration', 'Study_Hours', 'Screen_Time', 'Sleep_Quality']])
plt.show()

## Deal with missing values, skewness, outliers etc.

In [None]:
# Fill missing values if any (placeholder logic for now)
df.fillna(df.median(), inplace=True)

# Handle outliers using IQR method
def handle_outliers(column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
    df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])

for col in ['Sleep_Duration', 'Study_Hours', 'Screen_Time', 'Caffeine_Intake', 'Physical_Activity']:
    handle_outliers(col)

# Step 5: Perform feature analysis and engineering
# Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, columns=['Gender', 'University_Year'], drop_first=True)

# Drop Student_ID as it is not predictive
df.drop(columns=['Student_ID'], inplace=True)

# Check multicollinearity
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap After Encoding')
plt.show()

# Step 6: Construct an ML model to predict Sleep_Quality
X = df.drop(columns=['Sleep_Quality'])
y = df['Sleep_Quality']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit a Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Step 7: Analyze the model's performance
y_pred = rf_model.predict(X_test)
print("Model Performance:")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.2f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

# Step 8: Fine-tune the model
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters from Grid Search:\n", grid_search.best_params_)

# Evaluate the tuned model
tuned_model = grid_search.best_estimator_
tuned_y_pred = tuned_model.predict(X_test)
print("Tuned Model Performance:")
print(f"Mean Squared Error: {mean_squared_error(y_test, tuned_y_pred):.2f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, tuned_y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, tuned_y_pred):.2f}")

# Step 9: Interpret the model's decisions
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': tuned_model.feature_importances_})
feature_importances.sort_values(by='Importance', ascending=False, inplace=True)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances)
plt.title('Feature Importance')
plt.show()