In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import pickle
import joblib
import os

In [2]:
# Load the combined feature table.
data = pd.read_csv("output/combined_output.csv")  # Replace with your dataset

# Discard every ROW in which any of the listed feature columns is 0 (or already
# missing). Zeros are first mapped to NaN so that a single dropna() handles
# both cases.
# NOTE(review): presumably a 0 here marks a frame where feature extraction
# failed -- confirm against the code that produces the CSV.
numeric_cols = ['EAR', 'MAR', 'Pupil Circularity', 'MOE']
data[numeric_cols] = data[numeric_cols].replace(0, np.nan)
data = data.dropna(subset=numeric_cols)

# Features: everything except the target ("State") and the "Time"/"Video"
# columns, which are excluded from the model inputs.
X = data.drop(["State", "Time", "Video"], axis=1)
# Target column (used as-is; no label encoding is applied here).
y = data["State"]

# Split BEFORE scaling so the test rows never influence the scaler statistics.
# With the same random_state the row partition is identical to splitting the
# already-scaled array, so y_train / y_test are unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that same
# transformation to the held-out rows (prevents train/test leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep X as a scaled ndarray (as before) for any later cell that uses the full
# feature matrix; it is now scaled with the training-set statistics.
X = scaler.transform(X)


In [3]:


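# NOTE: this whole cell is disabled (commented out). It is a standalone XGBoost
# grid search using the same parameter grid as the "XGBoost" entry of `models`
# in the next cell, which the model-comparison loop runs.
# # Define the parameter grid for XGBoost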
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [2, 3, 4, 5],
#     'learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'subsample': [0.7, 0.8, 1.0],
#     'colsample_bytree': [0.7, 0.8, 1.0]
# }

# xgb = XGBClassifier(eval_metric="logloss", random_state=42)

# # GridSearchCV for hyperparameter tuning
# grid_search = GridSearchCV(
#     estimator=xgb,
#     param_grid=param_grid,
#     scoring='accuracy',
#     cv=5,
#     verbose=2,
#     n_jobs=-1
# )

# grid_search.fit(X_train, y_train)

# print("Best parameters found: ", grid_search.best_params_)
# print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# # Evaluate on test set
# best_xgb = grid_search.best_estimator_
# y_pred = best_xgb.predict(X_test)

# print("Test Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
# print("F1 Score: {:.4f}".format(f1_score(y_test, y_pred, average="weighted")))
# print("Recall: {:.4f}".format(recall_score(y_test, y_pred, average="weighted")))
# print("Precision: {:.4f}".format(precision_score(y_test, y_pred, average="weighted")))
# print(classification_report(y_test, y_pred))

# # Save the best model
# with open("xgb_best_model.pkl", "wb") as f:
#     pickle.dump(best_xgb, f)
# print("Best XGBoost model saved as xgb_best_model.pkl")

In [4]:
# Hyperparameter grid for XGBoost, kept separate because it is by far the
# largest search space (3 * 4 * 4 * 3 * 3 = 432 combinations).
xgb_search_space = {
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4, 5],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0],
}

# Candidate classifiers, keyed by display name. Each value is a pair of
# (unfitted estimator, parameter grid to search). Insertion order is the
# order in which the models are trained.
models = {
    "KNN": (
        KNeighborsClassifier(),
        {"n_neighbors": [3, 5, 7]},
    ),
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {"max_depth": [2, 4, 6, 8]},
    ),
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {"n_estimators": [50, 100], "max_depth": [4, 6, 8]},
    ),
    "SVM": (
        SVC(probability=True, random_state=42),
        {"kernel": ["linear", "rbf"], "C": [0.1, 1, 10]},
    ),
    "XGBoost": (
        XGBClassifier(eval_metric="logloss", random_state=42),
        xgb_search_space,
    ),
}

In [5]:
# Grid-search every candidate in `models`, evaluate the best estimator on the
# held-out test split, pickle it, and collect one summary row per model.

# The pickles and the comparison CSV are written under "model/"; create the
# directory up front so a fresh checkout does not fail with FileNotFoundError
# after the first (potentially long) grid search has already finished.
os.makedirs("model", exist_ok=True)

results = []
for name, (model, param_grid) in models.items():
    print(f"\nTraining {name}...")
    # 5-fold cross-validated search over the model's grid, selected by accuracy.
    grid = GridSearchCV(
        model, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Test-set metrics; the averaged scores are weighted by class support.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    precision = precision_score(y_test, y_pred, average="weighted")

    # Calculate inference time with a second predict() call over the test set.
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    _ = best_model.predict(X_test)
    total_inference_time = time.perf_counter() - start_time
    inference_time_per_sample = total_inference_time / len(X_test)

    print(f"Best Params: {grid.best_params_}")
    print(
        f"Accuracy: {accuracy:.4f}, F1: {f1:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}"
    )
    print(classification_report(y_test, y_pred))

    # Save model, e.g. "Random Forest" -> model/random_forest_model.pkl
    with open(f"model/{name.replace(' ', '_').lower()}_model.pkl", "wb") as f:
        pickle.dump(best_model, f)

    results.append(
        {
            "Model": name,
            "Accuracy": accuracy,
            "F1 Score": f1,
            "Recall": recall,
            "Precision": precision,
            "Inference Time (s)": inference_time_per_sample,
            "Best Params": grid.best_params_,
        }
    )

# Save results
results_df = pd.DataFrame(results)
results_df.to_csv("model/model_comparison.csv", index=False)
print("\nModel comparison saved as model/model_comparison.csv")


Training KNN...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Params: {'n_neighbors': 3}
Accuracy: 0.9740, F1: 0.9739, Recall: 0.9740, Precision: 0.9738
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       481
           1       0.94      0.91      0.92        97

    accuracy                           0.97       578
   macro avg       0.96      0.95      0.95       578
weighted avg       0.97      0.97      0.97       578


Training Decision Tree...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Params: {'max_depth': 6}
Accuracy: 0.9671, F1: 0.9671, Recall: 0.9671, Precision: 0.9670
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       481
           1       0.91      0.90      0.90        97

    accuracy                           0.97       578
   macro avg       0.94      0.94      0.94       578
weighted avg       0.97      0.97      0.97    