In [1]:
# import required modules
import os
import shutil
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from stress.stress import (
    unzip, combine,
    merge, convert_to_gmt,
    label_data, train_model)

# UNZIP THE DATASETS

In [2]:
# Unpack 'unzip.zip' with the project helper (presumably extracted under MAIN_PATH — confirm in stress.unzip).
# NOTE(review): MAIN_PATH is an absolute, machine-specific path; adjust it before running elsewhere.
MAIN_PATH = 'C:/Users/HP/Documents/CE888'
unzip(MAIN_PATH, 'unzip.zip')

Done extracting


In [3]:
# Import necessary libraries
import os
import pandas as pd

# Define paths
DATA_PATH = "C:/Users/HP/Documents/CE888/Stress_dataset"
SAVE_PATH = "C:/Users/HP/Documents/CE888/data"

# Create the directory that will hold the combined dataset.
# exist_ok=True keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError as soon as the directory exists from an earlier run.
os.makedirs(SAVE_PATH, exist_ok=True)

# Columns of the per-signal output tables
final_columns = {
    'ACC': ['id', 'X', 'Y', 'Z', 'datetime'],
    'EDA': ['id', 'EDA', 'datetime'],
    'HR': ['id', 'HR', 'datetime'],
    'TEMP': ['id', 'TEMP', 'datetime'],
}

# Value columns carried by each raw signal file
names = {
    'ACC.csv': ['X', 'Y', 'Z'],
    'EDA.csv': ['EDA'],
    'HR.csv': ['HR'],
    'TEMP.csv': ['TEMP'],
}

# Signals to include in the final dataset.
# NOTE(review): 'TEMP.csv' is described in final_columns/names but is not
# listed here; the TEMP column is entirely null further down the notebook,
# presumably for this reason — confirm whether omitting it is intentional.
desired_signals = ['ACC.csv', 'EDA.csv', 'HR.csv']

# Combine the desired signals with the project helper and store the result under SAVE_PATH
combine(DATA_PATH, SAVE_PATH, final_columns, names, desired_signals)

In [4]:
COMBINED_DATA_PATH = "C:/Users/HP/Documents/CE888/data"
SAVE_PATH = "C:/Users/HP/Documents/CE888/data_merged"
# exist_ok=True makes the cell safe to re-run; os.mkdir would raise
# FileExistsError once the directory has been created.
os.makedirs(SAVE_PATH, exist_ok=True)

# Merge the per-signal files from COMBINED_DATA_PATH into a single file under SAVE_PATH.
# NOTE(review): the next cell reads 'merged_data.parquet' from this directory,
# so the merged output is presumably parquet rather than csv — confirm in stress.merge.
merge(COMBINED_DATA_PATH, SAVE_PATH)

Reading data ...
Merging data ...
Saving data ...


In [5]:
# This is RAM intensive: the whole merged dataset is loaded into memory.
df = pd.read_parquet('C:/Users/HP/Documents/CE888/data_merged/merged_data.parquet')
# Shuffle with a fixed seed so that the saved subset (and every result derived
# from it) is identical on each run; an unseeded sample() changes every time.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Keep the first 4,000,000 shuffled rows as the working subset.
df.iloc[:4000000].to_parquet('merged_data_subset2.parquet', index=False)

In [6]:
# Reload the saved subset so the following cells work from the on-disk copy
df = pd.read_parquet('merged_data_subset2.parquet')

In [7]:
# Preview the first rows of the subset ('datetime' is still a numeric value at this point)
df.head()

Unnamed: 0,id,X,Y,Z,datetime,EDA,TEMP,HR
0,7A,-55.0,9.0,11.0,1587666000.0,0.310405,,81.58
1,7A,-59.0,-10.0,26.0,1587308000.0,10.792609,,87.02
2,8B,26.0,-29.0,-49.0,1594932000.0,1.38196,,87.02
3,6B,-51.0,-17.0,43.0,1593430000.0,8.096954,,91.1
4,6B,-58.0,10.0,1.0,1593457000.0,4.221127,,118.8


In [8]:
# Convert the 'datetime' column from Unix epoch seconds to pandas timestamps.
# unit='s' lets pandas do the scaling in one vectorised call instead of a
# Python-level loop over every row, and avoids the float rounding noise that
# multiplying by 1e9 introduces in the nanosecond digits.
# The dtype guard makes the cell safe to re-run: an already converted column
# is left untouched instead of being scaled a second time.
if not pd.api.types.is_datetime64_any_dtype(df['datetime']):
    df['datetime'] = pd.to_datetime(df['datetime'], unit='s')

# Store participant ids as strings
df['id'] = df['id'].astype(str)



In [9]:
# Summarise the frame: number of rows, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000000 entries, 0 to 3999999
Data columns (total 8 columns):
 #   Column    Dtype         
---  ------    -----         
 0   id        object        
 1   X         float64       
 2   Y         float64       
 3   Z         float64       
 4   datetime  datetime64[ns]
 5   EDA       float64       
 6   TEMP      float64       
 7   HR        float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 244.1+ MB


In [10]:
# Display the first five rows to check the converted 'datetime' column
df.head()

Unnamed: 0,id,X,Y,Z,datetime,EDA,TEMP,HR
0,7A,-55.0,9.0,11.0,2020-04-23 18:16:24.750000128,0.310405,,81.58
1,7A,-59.0,-10.0,26.0,2020-04-19 14:53:37.000000000,10.792609,,87.02
2,8B,26.0,-29.0,-49.0,2020-07-16 20:37:10.593750016,1.38196,,87.02
3,6B,-51.0,-17.0,43.0,2020-06-29 11:18:33.343749888,8.096954,,91.1
4,6B,-58.0,10.0,1.0,2020-06-29 18:59:54.656250112,4.221127,,118.8


In [11]:
print("Reading 2 ...")
# Path to the Excel file containing the survey results
survey_path = 'C:/Users/HP/Documents/CE888/SurveyResults.xlsx'

# Load only the columns needed for labelling; 'ID' is read as a string so it
# is not coerced to a numeric type.
survey_df = pd.read_excel(survey_path, usecols=['ID', 'Start time', 'End time', 'date', 'Stress level'], dtype={'ID': str})

# Turn the literal 'na' entries in 'Stress level' into real missing values.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on survey_df['Stress level'] operates on an intermediate Series, which newer
# pandas versions warn about and which has no effect under Copy-on-Write.
survey_df['Stress level'] = survey_df['Stress level'].replace('na', np.nan)

# Remove every row that still contains a missing value
survey_df.dropna(inplace=True)

# Display the resulting DataFrame
survey_df

Reading 2 ...


Unnamed: 0,ID,Start time,End time,date,Stress level
0,5C,08:00:00,09:00:00,2020-04-15,1.0
1,5C,17:31:00,17:58:00,2020-04-14,1.0
2,E4,15:32:00,15:37:00,2020-04-18,2.0
3,E4,14:05:00,14:11:00,2020-04-18,2.0
4,7A,13:52:00,14:03:00,2020-04-18,2.0
...,...,...,...,...,...
353,83,23:05:00,23:50:00,2020-12-12,2.0
354,83,00:12:00,02:01:00,2020-12-13,2.0
355,83,20:34:00,20:48:00,2020-12-11,2.0
356,83,20:54:00,21:13:00,2020-12-11,2.0


In [12]:
# Build full start/end timestamps by joining the survey date with each clock time.
date_text = survey_df['date'].map(str)
for clock_col, stamp_col in (('Start time', 'Start datetime'),
                             ('End time', 'End datetime')):
    survey_df[stamp_col] = pd.to_datetime(date_text + ' ' + survey_df[clock_col].map(str))

# The separate date/time columns are no longer needed once the combined
# timestamps exist; drop them from survey_df directly.
survey_df.drop(columns=['Start time', 'End time', 'date'], inplace=True)



In [13]:
# Inspect the survey frame after the start/end timestamps were assembled
survey_df

Unnamed: 0,ID,Stress level,Start datetime,End datetime
0,5C,1.0,2020-04-15 08:00:00,2020-04-15 09:00:00
1,5C,1.0,2020-04-14 17:31:00,2020-04-14 17:58:00
2,E4,2.0,2020-04-18 15:32:00,2020-04-18 15:37:00
3,E4,2.0,2020-04-18 14:05:00,2020-04-18 14:11:00
4,7A,2.0,2020-04-18 13:52:00,2020-04-18 14:03:00
...,...,...,...,...
353,83,2.0,2020-12-12 23:05:00,2020-12-12 23:50:00
354,83,2.0,2020-12-13 00:12:00,2020-12-13 02:01:00
355,83,2.0,2020-12-11 20:34:00,2020-12-11 20:48:00
356,83,2.0,2020-12-11 20:54:00,2020-12-11 21:13:00


In [14]:
# Convert the survey timestamps to GMT+00:00 with the project helper.
# NOTE(review): the result is assigned back to survey_df, so re-running this cell
# passes already converted data to convert_to_gmt again — presumably shifting the
# times a second time. Re-run the survey-loading cells first if this must be repeated.
survey_df = convert_to_gmt(survey_df)

Converting ...
Adjust daylight savings
Concatenate dataframes


In [15]:
# Inspect the survey frame after the GMT conversion
survey_df

Unnamed: 0,ID,Stress level,Start datetime,End datetime
0,5C,1.0,2020-04-15 13:00:00,2020-04-15 14:00:00
1,5C,1.0,2020-04-14 22:31:00,2020-04-14 22:58:00
2,E4,2.0,2020-04-18 20:32:00,2020-04-18 20:37:00
3,E4,2.0,2020-04-18 19:05:00,2020-04-18 19:11:00
4,7A,2.0,2020-04-18 18:52:00,2020-04-18 19:03:00
...,...,...,...,...
240,83,2.0,2020-12-13 05:05:00,2020-12-13 05:50:00
241,83,2.0,2020-12-13 06:12:00,2020-12-13 08:01:00
242,83,2.0,2020-12-12 02:34:00,2020-12-12 02:48:00
243,83,2.0,2020-12-12 02:54:00,2020-12-12 03:13:00


In [16]:
# Label the sensor rows in df using the survey intervals in survey_df.
# NOTE(review): the return value is discarded; presumably label_data writes its
# result to disk (the following cell loads 'merged_data_labeled2.csv') — confirm.
label_data(df, survey_df)

Labelling ...
Processing ID 7A ...
Found 772195 rows for ID 7A
Found 35 survey rows for ID 7A
7A is missing label 2.0 at 2020-07-07 19:16:00 to 2020-07-07 19:27:00
7A is missing label 2.0 at 2020-07-07 19:50:00 to 2020-07-07 20:09:00
7A is missing label 0.0 at 2020-07-07 20:24:00 to 2020-07-07 20:57:00
Processing ID 8B ...
Found 265336 rows for ID 8B
Found 16 survey rows for ID 8B
8B is missing label 2.0 at 2020-07-13 16:59:00 to 2020-07-13 17:05:00
Processing ID 6B ...
Found 564294 rows for ID 6B
Found 13 survey rows for ID 6B
Processing ID 6D ...
Found 232625 rows for ID 6D
Found 4 survey rows for ID 6D
6D is missing label 1.0 at 2020-06-03 07:00:00 to 2020-06-03 09:00:00
Processing ID 7E ...
Found 492170 rows for ID 7E
Found 7 survey rows for ID 7E
Processing ID 5C ...
Found 965611 rows for ID 5C
Found 12 survey rows for ID 5C
5C is missing label 1.0 at 2020-04-15 13:00:00 to 2020-04-15 14:00:00
5C is missing label 0.0 at 2020-06-12 07:00:00 to 2020-06-12 08:00:00
Processing ID 15 .

In [17]:
PATH = 'C:/Users/HP/Documents/CE888/complete_data'
# Make sure the directory exists without failing when it is already there.
# With plain os.mkdir this cell could never succeed: the CSV read below needs
# the directory to exist already, and in that case os.mkdir raises FileExistsError.
os.makedirs(PATH, exist_ok=True)
# Load the labelled dataset.
# NOTE(review): 'merged_data_labeled2.csv' must already be present in PATH —
# presumably written by the labelling step; confirm where label_data saves it.
data = pd.read_csv(f'{PATH}/merged_data_labeled2.csv')

In [18]:
# Display the labelled dataset (the notebook shows a truncated view of large frames)
data

Unnamed: 0,X,Y,Z,EDA,HR,TEMP,id,datetime,label
0,-55.0,20.0,32.0,4.985559,100.08,,7A,2020-04-18 18:52:13.375000064,2.0
1,-47.0,2.0,44.0,3.255478,76.78,,7A,2020-04-18 19:02:15.281250048,2.0
2,-47.0,3.0,44.0,3.388761,76.05,,7A,2020-04-18 19:01:35.000000000,2.0
3,-43.0,0.0,48.0,5.478307,84.53,,7A,2020-04-18 18:54:06.531249920,2.0
4,-61.0,-23.0,11.0,5.773067,82.32,,7A,2020-04-18 18:55:10.218749952,2.0
...,...,...,...,...,...,...,...,...,...
391855,23.0,51.0,29.0,2.331232,102.38,,15,2020-07-31 20:09:27.093750016,2.0
391856,-60.0,24.0,8.0,2.127644,108.78,,15,2020-07-31 20:09:01.687500032,2.0
391857,-3.0,3.0,67.0,1.591848,104.58,,15,2020-07-31 20:01:32.218749952,2.0
391858,20.0,46.0,41.0,1.602099,98.68,,15,2020-07-31 20:02:31.093750016,2.0


In [19]:
# Number of rows per stress label, to check the class balance
data.label.value_counts()

2.0    229832
0.0    104043
1.0     57985
Name: label, dtype: int64

In [20]:
# Preview the first rows of the labelled data
data.head()

Unnamed: 0,X,Y,Z,EDA,HR,TEMP,id,datetime,label
0,-55.0,20.0,32.0,4.985559,100.08,,7A,2020-04-18 18:52:13.375000064,2.0
1,-47.0,2.0,44.0,3.255478,76.78,,7A,2020-04-18 19:02:15.281250048,2.0
2,-47.0,3.0,44.0,3.388761,76.05,,7A,2020-04-18 19:01:35.000000000,2.0
3,-43.0,0.0,48.0,5.478307,84.53,,7A,2020-04-18 18:54:06.531249920,2.0
4,-61.0,-23.0,11.0,5.773067,82.32,,7A,2020-04-18 18:55:10.218749952,2.0


In [21]:
# Derive 'Magnitude', the Euclidean norm of the accelerometer axes:
# sqrt(X^2 + Y^2 + Z^2), computed row by row.
x_sq, y_sq, z_sq = (np.square(data[axis]) for axis in ('X', 'Y', 'Z'))
data['Magnitude'] = np.sqrt(x_sq + y_sq + z_sq)


In [22]:
# Confirm that the 'Magnitude' column was added
data.head()

Unnamed: 0,X,Y,Z,EDA,HR,TEMP,id,datetime,label,Magnitude
0,-55.0,20.0,32.0,4.985559,100.08,,7A,2020-04-18 18:52:13.375000064,2.0,66.700825
1,-47.0,2.0,44.0,3.255478,76.78,,7A,2020-04-18 19:02:15.281250048,2.0,64.412732
2,-47.0,3.0,44.0,3.388761,76.05,,7A,2020-04-18 19:01:35.000000000,2.0,64.451532
3,-43.0,0.0,48.0,5.478307,84.53,,7A,2020-04-18 18:54:06.531249920,2.0,64.443774
4,-61.0,-23.0,11.0,5.773067,82.32,,7A,2020-04-18 18:55:10.218749952,2.0,66.113539


In [23]:
# Check the dtype and non-null count of every column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391860 entries, 0 to 391859
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   X          391860 non-null  float64
 1   Y          391860 non-null  float64
 2   Z          391860 non-null  float64
 3   EDA        391860 non-null  float64
 4   HR         391860 non-null  float64
 5   TEMP       0 non-null       float64
 6   id         391860 non-null  object 
 7   datetime   391860 non-null  object 
 8   label      391860 non-null  float64
 9   Magnitude  391860 non-null  float64
dtypes: float64(8), object(2)
memory usage: 29.9+ MB


In [24]:
# Preview the first rows again before checking for missing values
data.head()

Unnamed: 0,X,Y,Z,EDA,HR,TEMP,id,datetime,label,Magnitude
0,-55.0,20.0,32.0,4.985559,100.08,,7A,2020-04-18 18:52:13.375000064,2.0,66.700825
1,-47.0,2.0,44.0,3.255478,76.78,,7A,2020-04-18 19:02:15.281250048,2.0,64.412732
2,-47.0,3.0,44.0,3.388761,76.05,,7A,2020-04-18 19:01:35.000000000,2.0,64.451532
3,-43.0,0.0,48.0,5.478307,84.53,,7A,2020-04-18 18:54:06.531249920,2.0,64.443774
4,-61.0,-23.0,11.0,5.773067,82.32,,7A,2020-04-18 18:55:10.218749952,2.0,66.113539


In [25]:
# Count the missing values in each column
data.isnull().sum()

X                 0
Y                 0
Z                 0
EDA               0
HR                0
TEMP         391860
id                0
datetime          0
label             0
Magnitude         0
dtype: int64

In [26]:
# Remove the columns that are not used as model features: the participant id,
# the timestamp, and TEMP (which contains no values).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
data.drop(columns=['id', 'datetime', 'TEMP'], errors='ignore', inplace=True)

In [39]:
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Target vector: the stress label; feature matrix: every remaining column
y = data['label']
X = data.drop(columns='label')

In [29]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
# NOTE(review): rows are split at random, so samples from the same participant and
# recording can end up in both train and test — presumably inflating the reported
# scores; consider a participant-wise (grouped) split — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Class counts in the training split
y_train.value_counts()

2.0    183857
0.0     83372
1.0     46259
Name: label, dtype: int64

In [31]:
# Class counts in the held-out test split
y_test.value_counts()

2.0    45975
0.0    20671
1.0    11726
Name: label, dtype: int64

# LOGISTIC REGRESSION

In [32]:
# Logistic regression baseline: L2 penalty, class-balanced weights, fixed seed.
logistic_params = {
    'penalty': 'l2',
    'class_weight': 'balanced',
    'max_iter': 5000,
    'n_jobs': -1,
    'random_state': 42,
}
linear_model = LogisticRegression(**logistic_params)

# train_model returns three values, kept here as the model, a text report
# and the scores used in the final comparison table.
linear_outputs = train_model(linear_model, X_train, X_test, y_train, y_test)
linear_model, linear_report, linear_scores = linear_outputs
print(linear_report)

              precision    recall  f1-score   support

         0.0       0.43      0.72      0.54     20671
         1.0       0.23      0.26      0.25     11726
         2.0       0.77      0.50      0.61     45975

    accuracy                           0.53     78372
   macro avg       0.48      0.50      0.46     78372
weighted avg       0.60      0.53      0.54     78372



# CATBOOST

In [33]:
# CatBoost: set aside 10% of the training rows as a validation set that is
# handed to train_model alongside the usual train/test data.
X_train_cat, X_valid_cat, y_train_cat, y_valid_cat = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Shallow trees with a large iteration budget; the 'Iter' overfitting detector
# is configured with a wait of 20 iterations.
catboost_params = {
    'iterations': 30000,
    'depth': 4,
    'random_seed': 426,
    'random_strength': 5,
    'od_type': 'Iter',
    'od_wait': 20,
}
cat = CatBoostClassifier(**catboost_params)

catboost_inputs = {
    'model': cat,
    'X_train': X_train_cat,
    'X_test': X_test,
    'y_train': y_train_cat,
    'y_test': y_test,
    'X_valid': X_valid_cat,
    'y_valid': y_valid_cat,
}
cat, cat_report, cat_scores = train_model(**catboost_inputs)

Learning rate set to 0.032967
0:	learn: 1.0773318	test: 1.0773458	best: 1.0773458 (0)	total: 242ms	remaining: 2h 48s
500:	learn: 0.6203173	test: 0.6186065	best: 0.6186065 (500)	total: 25.9s	remaining: 25m 26s
1000:	learn: 0.5707429	test: 0.5707055	best: 0.5707055 (1000)	total: 52.3s	remaining: 25m 14s
1500:	learn: 0.5444686	test: 0.5457486	best: 0.5457486 (1500)	total: 1m 18s	remaining: 24m 48s
2000:	learn: 0.5258315	test: 0.5284219	best: 0.5284219 (2000)	total: 1m 44s	remaining: 24m 24s
2500:	learn: 0.5117314	test: 0.5156714	best: 0.5156714 (2500)	total: 2m 10s	remaining: 23m 57s
3000:	learn: 0.5000581	test: 0.5050607	best: 0.5050607 (3000)	total: 2m 36s	remaining: 23m 30s
3500:	learn: 0.4903531	test: 0.4962768	best: 0.4962768 (3500)	total: 3m 2s	remaining: 23m 2s
4000:	learn: 0.4821367	test: 0.4890199	best: 0.4890199 (4000)	total: 3m 28s	remaining: 22m 35s
4500:	learn: 0.4746389	test: 0.4823586	best: 0.4823586 (4500)	total: 3m 54s	remaining: 22m 7s
5000:	learn: 0.4676913	test: 0.4762

In [34]:
# Show the per-class precision/recall/F1 report for the CatBoost model
print(cat_report)

              precision    recall  f1-score   support

         0.0       0.86      0.78      0.82     20671
         1.0       0.89      0.65      0.76     11726
         2.0       0.85      0.94      0.89     45975

    accuracy                           0.86     78372
   macro avg       0.87      0.79      0.82     78372
weighted avg       0.86      0.86      0.85     78372



# GRADIENT BOOSTING

In [35]:
# Histogram-based gradient boosting: depth-4 trees, up to 10000 iterations,
# with 10% of the training data reserved internally for early stopping
# (patience of 20 iterations).
hist_gb_params = {
    'max_iter': 10000,
    'max_depth': 4,
    'early_stopping': 'auto',
    'n_iter_no_change': 20,
    'validation_fraction': 0.1,
    'random_state': 4288,
}
gradient_boost = HistGradientBoostingClassifier(**hist_gb_params)

gradient_inputs = {
    'model': gradient_boost,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
gradient_boost, gradient_report, gradient_scores = train_model(**gradient_inputs)

In [36]:
# Show the per-class precision/recall/F1 report for the gradient boosting model
print(gradient_report)

              precision    recall  f1-score   support

         0.0       0.89      0.84      0.86     20671
         1.0       0.90      0.76      0.82     11726
         2.0       0.89      0.95      0.91     45975

    accuracy                           0.89     78372
   macro avg       0.89      0.85      0.87     78372
weighted avg       0.89      0.89      0.89     78372



# DECISION TREE

In [37]:
# Decision tree with default hyper-parameters and a fixed seed.
tree_outputs = train_model(
    DecisionTreeClassifier(random_state=42),
    X_train,
    X_test,
    y_train,
    y_test,
)
decision_tree, decision_report, decision_scores = tree_outputs
print(decision_report)

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85     20671
         1.0       0.79      0.80      0.80     11726
         2.0       0.90      0.90      0.90     45975

    accuracy                           0.87     78372
   macro avg       0.85      0.85      0.85     78372
weighted avg       0.87      0.87      0.87     78372



# RANDOM FOREST

In [41]:
# Random forest with default hyper-parameters, all CPU cores and a fixed seed.
forest_params = {'n_jobs': -1, 'random_state': 42}
forest_outputs = train_model(
    RandomForestClassifier(**forest_params),
    X_train,
    X_test,
    y_train,
    y_test,
)
random_forest, random_report, random_scores = forest_outputs
print(random_report)

              precision    recall  f1-score   support

         0.0       0.92      0.85      0.89     20671
         1.0       0.95      0.77      0.85     11726
         2.0       0.89      0.97      0.93     45975

    accuracy                           0.91     78372
   macro avg       0.92      0.86      0.89     78372
weighted avg       0.91      0.91      0.91     78372



In [45]:
# Comparison table: one column per model, one row per evaluation metric.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1']
model_scores = {
    'LogisticRegression': linear_scores,
    'CatBoostClassifier': cat_scores,
    'GradientBoostingClassifier': gradient_scores,
    'DecisionTreeClassifier': decision_scores,
    'RandomForestClassifier': random_scores,
}
scores = pd.DataFrame(columns=list(model_scores), index=metric_names)


def to_percent(value):
    """Express a fractional score as a percentage rounded to two decimals."""
    return round(value*100, 2)


# Fill each model's column, then convert it: every value is multiplied by 100
# first and rounded to two decimal places afterwards.
for model_name, metric_values in model_scores.items():
    scores[model_name] = metric_values
    scores[model_name] = scores[model_name].apply(to_percent)

scores

Unnamed: 0,LogisticRegression,CatBoostClassifier,GradientBoostingClassifier,DecisionTreeClassifier,RandomForestClassifier
Accuracy,52.52,85.51,88.87,87.12,90.68
Precision,47.57,86.58,89.23,84.71,92.05
Recall,49.67,79.18,84.62,85.04,86.3
F1,46.43,82.14,86.65,84.87,88.78
