In [1]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import os

In [2]:
def get_video_files(directory):
    video_files = []
    subfolders = ['deceptive', 'truthful']
    for subfolder in subfolders:
        path = os.path.join(directory, subfolder)
        files = [os.path.join(subfolder, f) for f in os.listdir(path) if f.endswith('_metrics.csv')]
        video_files.extend(files)
    return video_files

In [33]:
def load_data_from_files(file_list, directory):
    data_frames = []
    for file in file_list:
        path = os.path.join(directory, file)
        df = pd.read_csv(path)
        pd.get_dummies(df['Emotion']) # Must convert to one-hot encoding here to avoid issues with missing values/columns
        data_frames.append(df)
    return pd.concat(data_frames, ignore_index=True)

In [34]:
video_metric_directory = '../video_metrics/'
video_files = get_video_files(video_metric_directory)
train_files, test_files = train_test_split(video_files, test_size=0.2, random_state=42)

train_data = load_data_from_files(train_files, video_metric_directory)
test_data = load_data_from_files(test_files, video_metric_directory)

  return pd.concat(data_frames, ignore_index=True)
  return pd.concat(data_frames, ignore_index=True)


In [35]:
train_data.drop(columns=['Video_Path'], inplace=True)
test_data.drop(columns=['Video_Path'], inplace=True)

train_data = train_data.fillna(train_data.median())
test_data = test_data.fillna(test_data.median())

  train_data = train_data.fillna(train_data.median())
  test_data = test_data.fillna(test_data.median())


In [36]:
X_train = train_data.drop(columns=['Label'])
y_train = train_data['Label']
X_test = test_data.drop(columns=['Label'])
y_test = test_data['Label']

# XGBoost

In [37]:
# Initialize and train the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [48]:
# Define the parameters to search using linspace for continuous ranges
params = {
    'max_depth': np.arange(3, 20),
    'gamma': np.linspace(0.1, 5, 20),
    'subsample': np.linspace(0.5, 1.0, 11),
    'colsample_bytree': np.linspace(0.5, 1.0, 11),
    'n_estimators': np.linspace(100, 1000, 10).astype(int),
    'learning_rate': np.linspace(0.01, 0.5, 20)
}

# Setup the random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=xgb_model, param_distributions=params, n_iter=10, scoring='accuracy', cv=4, verbose=2,  n_jobs=7)

In [49]:
# Fit the random search model
random_cv.fit(X_train, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


In [50]:
# Best parameters and score
print("Best parameters:", random_cv.best_params_)
print("Best accuracy:", random_cv.best_score_)

# Evaluate on the test set
print("Test set accuracy:", random_cv.score(X_test, y_test))

# Best parameters: {'subsample': 0.95, 'n_estimators': 900, 'max_depth': 10, 'learning_rate': 0.3452631578947368, 'gamma': 4.742105263157895, 'colsample_bytree': 0.6}
# Best accuracy: 0.617920060158935
# Test set accuracy: 0.5191781529089876

Best parameters: {'subsample': 0.95, 'n_estimators': 900, 'max_depth': 10, 'learning_rate': 0.3452631578947368, 'gamma': 4.742105263157895, 'colsample_bytree': 0.6}
Best accuracy: 0.617920060158935
Test set accuracy: 0.5191781529089876


In [51]:
# Evaluate the best model on the test set
best_model = random_cv.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Best accuracy:", random_cv.best_score_)
print("Test set accuracy:", accuracy)
print("Classification Report:\n", class_report)

Best accuracy: 0.617920060158935
Test set accuracy: 0.5191781529089876
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.31      0.40     10063
           1       0.50      0.74      0.60      9308

    accuracy                           0.52     19371
   macro avg       0.53      0.53      0.50     19371
weighted avg       0.53      0.52      0.50     19371



In [17]:
import pickle
best_model = random_cv.best_estimator_

filename = 'finalized_xgb_classifier.pkl'
pickle.dump(best_model, open(filename, 'wb'))
