### Model Exploration

### Cross validation

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Read the dataset and preprocess it
file_path = r'C:\Users\fahad\Documents\PostGrad\CE888\Stress_dataset\Stress-Predict-Dataset-main\participants\combined_data.csv'
df = pd.read_csv(file_path)
df = df.drop(['Timestamp', 'participant_number'], axis=1)
df = df.fillna(df.mean())
df = df.groupby('stress_class').filter(lambda x: len(x) > 1)

# Split the dataset into features and target
X = df.drop('stress_class', axis=1)
y = df['stress_class']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define the classifiers
classifiers = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
]

# Initialize StratifiedKFold for cross-validation
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Train and evaluate the classifiers
for name, classifier in classifiers:
    print(f"Classifier: {name}")
    classifier.fit(X_train_smote, y_train_smote)
    y_pred = classifier.predict(X_test)
    
    # Perform cross-validation
    scores = cross_val_score(classifier, X_train_smote, y_train_smote, cv=cv, scoring='accuracy')
    avg_score = scores.mean()
    
    conf_mat = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    # Extract the false negatives from the confusion matrix
    false_negatives = conf_mat[1][0]
    
    print("Confusion Matrix:")
    print(conf_mat)
    print("Classification Report:")
    print(class_report)
    print(f"Average Cross-Validation Score: {avg_score:.4f}")
    print(f"False Negatives: {false_negatives}")
    print("\n" + "=" * 80 + "\n")





Classifier: Random Forest
Confusion Matrix:
[[12661  2668]
 [ 1340  2451]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.83      0.86     15329
         1.0       0.48      0.65      0.55      3791

    accuracy                           0.79     19120
   macro avg       0.69      0.74      0.71     19120
weighted avg       0.82      0.79      0.80     19120

Average Cross-Validation Score: 0.8230
False Negatives: 1340


Classifier: Gradient Boosting
Confusion Matrix:
[[9368 5961]
 [1237 2554]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.61      0.72     15329
         1.0       0.30      0.67      0.42      3791

    accuracy                           0.62     19120
   macro avg       0.59      0.64      0.57     19120
weighted avg       0.77      0.62      0.66     19120

Average Cross-Validation Score: 0.6509
False Negatives: 1237


Classifier: Logistic Reg

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Confusion Matrix:
[[11174  4155]
 [ 1464  2327]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.73      0.80     15329
         1.0       0.36      0.61      0.45      3791

    accuracy                           0.71     19120
   macro avg       0.62      0.67      0.63     19120
weighted avg       0.78      0.71      0.73     19120

Average Cross-Validation Score: 0.7782
False Negatives: 1464


Classifier: XGBoost




Confusion Matrix:
[[10859  4470]
 [  882  2909]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.71      0.80     15329
         1.0       0.39      0.77      0.52      3791

    accuracy                           0.72     19120
   macro avg       0.66      0.74      0.66     19120
weighted avg       0.82      0.72      0.75     19120

Average Cross-Validation Score: 0.7538
False Negatives: 882




### Main Model

In [8]:

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Set the file path
file_path = r'C:\Users\fahad\Documents\PostGrad\CE888\Stress_dataset\Stress-Predict-Dataset-main\participants\combined_data.csv'

# Read the CSV file
df = pd.read_csv(file_path)

# Drop Timestamp and participant_number columns as they're not useful for prediction
df = df.drop(['Timestamp', 'participant_number'], axis=1)

# Handle missing values by filling them with the mean value of the respective column
df = df.fillna(df.mean())

# Remove classes with only one sample
df = df.groupby('stress_class').filter(lambda x: len(x) > 1)

# Separate the features (X) and target variable (y)
X = df.drop('stress_class', axis=1)
y = df['stress_class']

# Perform feature scaling to standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform feature selection using ANOVA F-value (f_classif) and keep all features for now
selector = SelectKBest(score_func=f_classif, k='all')
X_selected = selector.fit_transform(X_scaled, y)

# Split the data into training and test sets using stratified sampling to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)

# Oversample the minority class using SMOTE to balance the class distribution
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Set the best hyperparameters for the Random Forest classifier
best_params_rf = {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 40}

# Create the Random Forest Classifier with the best hyperparameters
rf_clf = RandomForestClassifier(**best_params_rf, random_state=42)

# Set the best hyperparameters for the XGBoost classifier
best_params_xgb = {'max_depth': 5, 'n_estimators': 100, 'learning_rate': 0.1}

# Create the XGBoost Classifier with the best hyperparameters
xgb_clf = XGBClassifier(**best_params_xgb, random_state=42)

# Create a voting classifier with the two classifiers, using soft voting to average probabilities
voting_clf = VotingClassifier(estimators=[('rf', rf_clf), ('xgb', xgb_clf)], voting='soft')

# Train the voting classifier on the oversampled training data
voting_clf.fit(X_train_smote, y_train_smote)

#In the last part of the code, we are calculating the ROC curve and AUC score to evaluate the performance of the classifier. 
#The AUC score represents the probability that a randomly chosen positive instance is ranked higher than a randomly chosen negative instance. A higher AUC score indicates a better classifier.
#We then find the optimal threshold that maximizes the difference between
#the True Positive Rate (TPR) and the False Positive Rate (FPR). 
#This threshold helps us balance the trade-off between TPR and FPR. By updating the predicted classes 
#based on the optimal threshold, we can improve the performance of the classifier.
#Finally, we print the updated classification report, confusion matrix, and various performance metrics
#like accuracy, precision, recall, and F1 score to assess the model's performance after adjusting the threshold.

# Calculate the predicted probabilities for the test set
y_pred_prob = voting_clf.predict_proba(X_test)[:, 1]

# Calculate the ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC AUC Score: {roc_auc:.2f}")

#Find the optimal threshold by maximizing the difference between TPR (True Positive Rate) and FPR (False Positive Rate)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

#Update the predicted classes based on the optimal threshold
y_pred = (y_pred_prob >= optimal_threshold).astype(int)

#Print the updated classification report
print("Updated Classification Report:")
print(classification_report(y_test, y_pred))

#Calculate and print the updated confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print("Updated Confusion Matrix:")
print(conf_mat)

#calculate and print various performance metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision:.2f}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Recall: {recall:.2f}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score: {f1:.2f}")




ROC AUC Score: 0.85
Updated Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.77      0.84     15329
         1.0       0.45      0.76      0.57      3791

    accuracy                           0.77     19120
   macro avg       0.69      0.77      0.70     19120
weighted avg       0.83      0.77      0.79     19120

Updated Confusion Matrix:
[[11785  3544]
 [  896  2895]]
Accuracy: 0.77
Precision: 0.83
Recall: 0.77
F1 Score: 0.79
