# Predicting Stress Levels from Smart Watch Data

## Data Wrangling and Cleaning

In [4]:
import pandas as pd
import numpy as np

watch_data = pd.read_csv("unclean_smartwatch_health_data.csv").dropna()

watch_data_filtered = watch_data[watch_data['Sleep Duration (hours)'] != "ERROR"]

watch_data_filtered = watch_data_filtered.drop('User ID', axis=1)

watch_data_filtered

#Activity Level reports have 3 main categories, but spelling varies across entries

# Strip leading/trailing whitespace and map based on first letter
def clean_activity_level(level):
    level = level.strip().lower()
    if level.startswith('h'):
        return 'Highly_Active'
    elif level.startswith('a'):
        return 'Active'
    elif level.startswith('s'):
        return 'Sedentary'
    return level

watch_data_filtered['Activity Level'] = watch_data_filtered['Activity Level'].apply(clean_activity_level)

watch_data_filtered


Unnamed: 0,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Activity Level,Stress Level
0,58.939776,98.809650,5450.390578,7.167235622316564,Highly_Active,1
3,40.000000,96.894213,13797.338044,7.367789630207228,Active,3
5,96.285938,94.202910,10205.992256,8.378342673824589,Highly_Active,10
6,47.272257,95.389760,3208.781177,7.871146008904113,Sedentary,2
7,81.733497,95.981343,6051.249857,5.224139066195455,Sedentary,1
...,...,...,...,...,...,...
9994,77.912299,98.640583,10061.145291,5.428634630125767,Sedentary,10
9995,78.819386,98.931927,2948.491953,7.402748595032027,Active,7
9996,48.632659,95.773035,4725.623070,6.3821659358529015,Sedentary,2
9997,73.834442,97.945874,2571.492060,6.91654920303435,Sedentary,4


## Data Preprocessing

### One Hot Encoding for Categorical Data

In [None]:
watch_data_encoded = pd.get_dummies(
    watch_data_filtered,
    columns=['Activity Level'],
    prefix='activity',
    prefix_sep='_',
)


### Feature Normalization

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

#Set Sleep duration to float seperately b/c of some issues otherwise
watch_data_encoded = watch_data_encoded[watch_data_encoded["Sleep Duration (hours)"] != ""]
watch_data_encoded['Sleep Duration (hours)'] = watch_data_encoded['Sleep Duration (hours)'].astype(float)

#Standardize all predictive features

watch_data_encoded["Heart Rate (BPM)"] =( watch_data_encoded["Heart Rate (BPM)"] - watch_data_encoded["Heart Rate (BPM)"].mean() ) / watch_data_encoded["Heart Rate (BPM)"].std()
watch_data_encoded["Blood Oxygen Level (%)"] =( watch_data_encoded["Blood Oxygen Level (%)"] - watch_data_encoded["Blood Oxygen Level (%)"].mean() ) / watch_data_encoded["Blood Oxygen Level (%)"].std()
watch_data_encoded["Step Count"] =( watch_data_encoded["Step Count"] - watch_data_encoded["Step Count"].mean() ) / watch_data_encoded["Step Count"].std()
watch_data_encoded["Sleep Duration (hours)"] =( watch_data_encoded["Sleep Duration (hours)"] - watch_data_encoded["Sleep Duration (hours)"].mean() ) / watch_data_encoded["Sleep Duration (hours)"].std()

watch_data_encoded

Unnamed: 0,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Stress Level,activity_Active,activity_Highly_Active,activity_Sedentary
0,-0.896965,0.558714,-0.224498,0.439027,1,0,1,0
3,-1.893082,-0.543046,0.981444,0.571755,3,1,0,0
5,1.067217,-2.091086,0.462577,1.240543,10,0,1,0
6,-1.510606,-1.408409,-0.548359,0.904878,2,0,0,1
7,0.301847,-1.068130,-0.137687,-0.846922,1,0,0,1
...,...,...,...,...,...,...,...,...
9994,0.100875,0.461467,0.441650,-0.711586,10,0,0,1
9995,0.148582,0.629048,-0.585964,0.594891,7,1,0,0
9996,-1.439057,-1.187949,-0.329210,-0.080535,2,0,0,1
9997,-0.113595,0.061870,-0.640432,0.273122,4,0,0,1


### Feature and Label Setup

In [None]:
y = watch_data_encoded["Stress Level"]
X = watch_data_encoded.loc[:, watch_data_encoded.columns != 'Stress Level']

print("Feature shape: ", X.shape)
print("Label shape: ", y.shape)

Feature shape:  (8325, 7)
Label shape:  (8325,)


0        1
3        3
5       10
6        2
7        1
        ..
9994    10
9995     7
9996     2
9997     4
9999     5
Name: Stress Level, Length: 8325, dtype: object

## Ridge Regression Modelling

### Setup