In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow import keras
from keras_tuner.tuners import RandomSearch
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import uniform, randint
import joblib
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Read the training export and the held-out evaluation export into DataFrames.
Train_Data = pd.read_csv(filepath_or_buffer='/Combined_train.csv')
Evaluation_Data = pd.read_csv(filepath_or_buffer='/Combined_evaluate.csv')

In [None]:
# Keep every column except the first one in both frames.
# The slice is positional and rebinds the same names, so running this cell a
# second time without reloading the CSVs removes one more column.
Train_Data, Evaluation_Data = (
    frame.iloc[:, 1:] for frame in (Train_Data, Evaluation_Data)
)

In [None]:
# Show the training frame as this cell's output to check the columns left after the drop above.
Train_Data

Unnamed: 0,BVP,EDA,HR,TEMP,Label,Student_ID
0,-26.61,0.169126,83.00,28.39,0,1
1,44.90,0.172969,83.00,28.43,0,1
2,17.57,0.172969,72.67,28.43,0,1
3,91.18,0.176813,87.00,28.47,0,1
4,-136.83,0.169126,80.00,28.49,0,1
...,...,...,...,...,...,...
98875,121.64,0.466812,85.88,30.07,0,30
98876,24.11,0.462967,85.83,30.09,0,30
98877,-158.79,0.466812,85.87,30.09,0,30
98878,-65.75,0.477066,85.85,30.01,0,30


# Data Splitting and Scaling

In [None]:
def Data_Creator(student_data, window_size, step_size, split_ratio,
                 feature_columns=("BVP", "EDA", "HR", "TEMP"), label_column="Label"):
    """Split one student's rows into train/validation/test sets and standardize them.

    The rows are split in their existing order (no shuffling): the leading
    ``1 - split_ratio`` share becomes the training set and the remaining
    ``split_ratio`` share is cut in half, the first half being the validation
    set and the second half the test set. Features are standardized with
    statistics computed on the training set only.

    Parameters
    ----------
    student_data : pandas.DataFrame
        Rows to split; must contain ``feature_columns`` and ``label_column``.
    window_size, step_size
        Accepted for call compatibility; this function does not use them.
    split_ratio : float
        Share of rows held out from training (validation + test together).
    feature_columns : sequence of str, optional
        Names of the feature columns. Selecting by name (instead of the first
        four positions) keeps the features correct even if the column layout
        of the frame changes, and matches how the evaluation cell picks them.
    label_column : str, optional
        Name of the target column.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)`` -- note that the
        test split comes before the validation split in the returned order.
    """
    features = student_data[list(feature_columns)].to_numpy()
    labels = student_data[label_column].to_numpy()

    # NOTE(review): the encoder is refitted on every call, so the integer codes
    # depend on which label values this particular frame contains -- confirm
    # that every student carries the full label set.
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)

    # Hold out the tail of the recording, then halve the hold-out into
    # validation (first half) and test (second half); order is preserved.
    X_train, X_temp, y_train, y_temp = train_test_split(
        features, labels, test_size=split_ratio, shuffle=False
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, shuffle=False
    )

    # Standardize features (mean 0, variance 1) using training statistics only,
    # so nothing from the validation/test rows leaks into the scaling.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_train, X_test, X_val, y_train, y_test, y_val

# Decision Tree Classifier
Hyper Parameter Tuning and Cross Validation

In [None]:
# Define parameter grid for hyperparameter tuning
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate the DecisionTreeClassifier.
# The seed makes the fits repeatable: the tree permutes candidate features at
# every split (and the 'random' splitter draws thresholds), so an unseeded
# estimator gives different models and scores on every run.
tree_classifier = DecisionTreeClassifier(random_state=42)

# Instantiate the RandomizedSearchCV tuner
tuner = RandomizedSearchCV(
    tree_classifier,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=3,               # Cross Validation Folds
    verbose=2,
    random_state=42,
    n_jobs=-1
)


# Iterate through each student's data
for student_id, student_data in Train_Data.groupby("Student_ID"):
    X_train, X_test, X_val, y_train, y_test, y_val = Data_Creator(student_data, 10, 5, 0.3)

    # Search for the best hyperparameters (refits the tuner from scratch on this student)
    tuner.fit(X_train, y_train)

    # Get the best model and print the best hyperparameters
    DT_model = tuner.best_estimator_
    print("Best hyperparameters:", tuner.best_params_)

    # Evaluate the best model on the validation set
    val_acc = DT_model.score(X_val, y_val)
    print("Validation accuracy:", val_acc)

    # Evaluate the best model on the test set
    test_acc = DT_model.score(X_test, y_test)
    print("Test accuracy:", test_acc)

# Save the trained model using joblib.
# NOTE(review): DT_model is rebound on every iteration, so the file written here
# holds only the estimator fitted on the last Student_ID group -- confirm that
# this is the model meant to be kept.
model_filename = "decision_tree_model.joblib"
joblib.dump(DT_model, model_filename)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best hyperparameters: {'splitter': 'random', 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 20, 'criterion': 'entropy'}
Validation accuracy: 0.49075975359342916
Test accuracy: 0.5532786885245902
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best hyperparameters: {'splitter': 'random', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20, 'criterion': 'entropy'}
Validation accuracy: 0.37523452157598497
Test accuracy: 0.5767790262172284
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best hyperparameters: {'splitter': 'random', 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10, 'criterion': 'gini'}
Validation accuracy: 0.5201612903225806
Test accuracy: 0.6720321931589537
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best hyperparameters: {'splitter': 'random', 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10, 'criterion': 'gin

['decision_tree_model.joblib']

# Multi-Layer Perceptron (MLP)
Hyper Parameter Tuning and Cross Validation

In [None]:
# Define a function to build the model
def build_model(hp):
    """Build and compile a binary-classification MLP from a Keras Tuner search space.

    Searched hyperparameters: ``num_layers`` (1-3 hidden blocks), and for each
    block ``units_<i>`` (32-128 in steps of 32) and ``dropout_<i>`` (0-0.5 in
    steps of 0.1), plus a log-sampled ``learning_rate`` between 1e-4 and 1e-2.

    The input width is read from the notebook-level ``X_train``, so that array
    must exist before this function is called.
    """
    layers = keras.layers

    network = keras.Sequential()
    network.add(layers.Input(shape=(X_train.shape[1],)))

    # Stack the tunable hidden blocks: Dense(relu) followed by Dropout.
    hidden_block_count = hp.Int('num_layers', 1, 3)
    for block_idx in range(hidden_block_count):
        width = hp.Int(f'units_{block_idx}', 32, 128, step=32)
        network.add(layers.Dense(units=width, activation='relu'))

        drop_rate = hp.Float(f'dropout_{block_idx}', 0, 0.5, step=0.1)
        network.add(layers.Dropout(drop_rate))

    # Single sigmoid unit: the network outputs one probability per sample.
    network.add(layers.Dense(1, activation='sigmoid'))

    # The optimizer step size is part of the search space as well.
    step_size = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
    adam = keras.optimizers.Adam(learning_rate=step_size)

    network.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Iterate through each student's data
for student_id, student_data in Train_Data.groupby("Student_ID"):
    X_train, X_test, X_val, y_train, y_test, y_val = Data_Creator(student_data, 10, 5, 0.3)

    # Instantiate a fresh RandomSearch tuner for this student.
    # A tuner's `max_trials` budget is used up by its first search() call, so a
    # single tuner shared by the whole loop only searches on the first student:
    # every later search() returns immediately and get_best_models() reloads the
    # first student's checkpoint. Building the tuner here (after the split) also
    # means build_model sees this student's X_train rather than a leftover one.
    # `overwrite=True` discards trials stored by a previous run so that
    # Restart & Run All really repeats the search instead of reusing old results.
    # This runs the full trial budget once per student, which is slower than
    # the shared tuner.
    tuner = RandomSearch(
        build_model,
        objective='val_accuracy',
        max_trials=10,
        executions_per_trial=2,
        directory='tuner_directory',
        project_name=f'stress_prediction_student_{student_id}',
        overwrite=True
    )

    # Search for the best hyperparameters
    tuner.search(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

    # Get the best model and print the summary
    best_model = tuner.get_best_models(num_models=1)[0]
    best_model.summary()

    # Train the best model with more epochs
    best_model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_val, y_val))

    # Evaluate the best model on the test set
    test_loss, test_acc = best_model.evaluate(X_test, y_test)
    print("Test accuracy:", test_acc)

# Save the best model.
# NOTE(review): best_model is rebound on every iteration, so only the network
# trained on the last Student_ID group is written to disk.
best_model.save('MLP_model.h5')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Test accuracy: 1.0
Model: "sequential"
___________

# Gradient Boosting Classifier
Hyper Parameter Tuning and Cross Validation

In [None]:
# Define the hyperparameter search space
# (scipy's uniform(loc, scale) samples from [loc, loc + scale])
param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'subsample': uniform(0.5, 0.5),
    'max_features': ['sqrt', 'log2', None]
}

# Instantiate the Gradient Boosting classifier
clf = GradientBoostingClassifier(random_state=42)

# Instantiate the RandomizedSearchCV tuner.
# Seeding the search makes the candidates drawn from the distributions above
# repeatable, in line with the seeded estimator and the Decision Tree search.
tuner = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=3,             # Cross Validation Folds
    random_state=42,
    n_jobs=-1
)

for student_id, student_data in Train_Data.groupby("Student_ID"):
    X_train, X_test, X_val, y_train, y_test, y_val = Data_Creator(student_data, 10, 5, 0.3)

    # Search for the best hyperparameters (refits the tuner from scratch on this student)
    tuner.fit(X_train, y_train)

    # Get the best estimator
    best_clf = tuner.best_estimator_

    # Print the best hyperparameters
    print("Best hyperparameters:", tuner.best_params_)

    # Evaluate the best model on the test set
    test_acc = best_clf.score(X_test, y_test)
    print("Test accuracy:", test_acc)

# Save the trained model using joblib.
# NOTE(review): best_clf is rebound on every iteration, so the file written here
# holds only the estimator fitted on the last Student_ID group -- confirm that
# this is the model meant to be kept.
model_filename = "GradientBoostClassifer_model.joblib"
joblib.dump(best_clf, model_filename)

Best hyperparameters: {'learning_rate': 0.19739493188944465, 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 16, 'min_samples_split': 15, 'n_estimators': 66, 'subsample': 0.9988840459634154}
Test accuracy: 0.6270491803278688
Best hyperparameters: {'learning_rate': 0.012445480151862066, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 7, 'n_estimators': 195, 'subsample': 0.9334206082814627}
Test accuracy: 0.5374531835205992
Best hyperparameters: {'learning_rate': 0.10284013852454546, 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 11, 'min_samples_split': 6, 'n_estimators': 98, 'subsample': 0.593140574841214}
Test accuracy: 0.6016096579476862
Best hyperparameters: {'learning_rate': 0.1948758223573631, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 9, 'min_samples_split': 19, 'n_estimators': 153, 'subsample': 0.7729764529630861}
Test accuracy: 1.0
Best hyperparameters: {'learning_rate': 0.11098155036543327, 'max_depth'

['GradientBoostClassifer_model.joblib']

In [None]:
# Per-class metrics of the most recently fitted Gradient Boosting model on the
# test split left over from the loop above.
preds = best_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall trade places, "support" counts predictions instead of
# true samples, and the confusion matrix comes out transposed.
print(classification_report(y_test, preds))
print("Confusion Metrix", confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      9234
           1       0.89      0.93      0.91      5598

    accuracy                           0.93     14832
   macro avg       0.92      0.93      0.92     14832
weighted avg       0.93      0.93      0.93     14832

Confusion Metrix [[8580  654]
 [ 407 5191]]


# Evaluating Models on the Last 5 Students' Data

In [None]:
# Separate features and labels
feature_columns = ['BVP', 'EDA', 'HR', 'TEMP']
eval_features = Evaluation_Data[feature_columns]
eval_labels = Evaluation_Data['Label']

# `label_encoder` and `scaler` are only ever created as locals inside
# Data_Creator, so they are rebuilt here; otherwise this cell works only when
# the kernel happens to still hold them from an earlier session.

# Encode evaluation labels (0: no stress, 1: stressed).
# The encoder is fitted on all training labels so every class seen in training
# has a code, even when an individual student only contains one class.
label_encoder = LabelEncoder().fit(Train_Data['Label'])
eval_labels_encoded = label_encoder.transform(eval_labels)

# Standardize evaluation features.
# The models kept after the per-student loops were fitted on the last
# Student_ID group (groupby iterates the ids in ascending order), using
# features standardized on that group's training split. The same split is
# rebuilt here (0.3 hold-out, no shuffling, as in the Data_Creator calls above)
# so the evaluation rows go through the preprocessing those models were fitted with.
# NOTE(review): keep the 0.3 below in sync with the split ratio used for training.
last_student_id = Train_Data['Student_ID'].max()
last_student_data = Train_Data[Train_Data['Student_ID'] == last_student_id]
reference_train_rows, _ = train_test_split(
    last_student_data[feature_columns].to_numpy(), test_size=0.3, shuffle=False
)
scaler = StandardScaler().fit(reference_train_rows)
X_eval = scaler.transform(eval_features.to_numpy())

In [None]:

# Evaluate Decision Tree on the evaluation set.
# The score must be taken on the evaluation rows: scoring X_train/y_train here
# reports training accuracy under the "Evaluation accuracy" label.
eval_acc = DT_model.score(X_eval, eval_labels_encoded)
print("Evaluation accuracy:", eval_acc)

preds = DT_model.predict(X_eval)

# scikit-learn metrics take (y_true, y_pred); reversing them swaps precision
# with recall and transposes the confusion matrix.
print(classification_report(eval_labels_encoded, preds))
print("Confusion Metrix", confusion_matrix(eval_labels_encoded, preds))

Evaluation accuracy: 0.9858186506231199
              precision    recall  f1-score   support

           0       0.43      0.62      0.51      7331
           1       0.56      0.37      0.45      9459

    accuracy                           0.48     16790
   macro avg       0.50      0.50      0.48     16790
weighted avg       0.50      0.48      0.47     16790

Confusion Metrix [[4543 2788]
 [5947 3512]]




In [None]:
# Score the MLP on the standardized evaluation set.
# evaluate() yields the loss first and then the compiled 'accuracy' metric.
test_loss, test_acc = best_model.evaluate(x=X_eval, y=eval_labels_encoded)
print("Evaluation accuracy:", test_acc)

Evaluation accuracy: 0.6247766613960266


In [None]:
# Evaluate Gradient Boosting Classifier on the evaluation set

eval_acc = best_clf.score(X_eval, eval_labels_encoded)
print("Evaluation accuracy:", eval_acc)

preds = best_clf.predict(X_eval)

# scikit-learn metrics take (y_true, y_pred). Passing the predictions first
# makes "support" count predicted rather than true samples (a class the model
# never predicts shows support 0) and transposes the confusion matrix.
print(classification_report(eval_labels_encoded, preds))
print("Confusion Metrix", confusion_matrix(eval_labels_encoded, preds))

Evaluation accuracy: 0.6247766527695057
              precision    recall  f1-score   support

           0       1.00      0.62      0.77     16790
           1       0.00      0.00      0.00         0

    accuracy                           0.62     16790
   macro avg       0.50      0.31      0.38     16790
weighted avg       1.00      0.62      0.77     16790

Confusion Metrix [[10490  6300]
 [    0     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
