In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from modules.utils import load_data, load_config, test_model
from modules.trainer_tester import TrainerTester

from modules.monash_data_pipeline import GeneralizationTestingDataPipeline 

from models.au_mfcc.model import DepressionDetectionModel as au_mfcc_model
from models.au_only.model import DepressionDetectionModel as au_model
from models.mfcc_only.model import DepressionDetectionModel as mfcc_model

from mlxtend.evaluate import mcnemar_table
from statsmodels.stats.contingency_tables import mcnemar
from sklearn.metrics import classification_report, confusion_matrix

# Loading AU-MFCC Paired Data

The dataloaders here will be used for the various modalities moving forward for consistency.

This means that there will be some discrepancies for the individual (AU-only and MFCC-only) modalities, such as having a few fewer samples.

In [2]:
# Run on the GPU whenever CUDA is usable; otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Start from the AU-MFCC training config and switch on the gender flag
# (the gender analysis further down reads a "Gender" field from each test sample)
AU_MFCC_CONFIG_PATH = "./models/au_mfcc/logs/config.json"
config = load_config(AU_MFCC_CONFIG_PATH)
config["DataPipeline"]["include_gender"] = True

**DAIC-WOZ Dataset**

In [3]:
# Load DAIC-WOZ Data
dw_dataloaders, split_dfs = load_data(config)

# Tally the labels of the test split.
# Any "Category" value other than 0 is counted as class 1.
dw_test_dataset = dw_dataloaders["test"].dataset
cat_0 = 0
cat_1 = 0

for sample_no in range(len(dw_test_dataset)):
    if dw_test_dataset[sample_no]["Category"] == 0:
        cat_0 += 1
    else:
        cat_1 += 1

print("\nDAIC-WOZ Data Distribution")
print(f"0:{cat_0}, 1:{cat_1}")
# Guard the ratio: a split without class-1 samples would otherwise
# abort the cell with ZeroDivisionError.
if cat_1:
    print(f"ratio: {cat_0/cat_1}")
else:
    print("ratio: undefined (no class 1 samples)")


Reading DAIC-WOZ Data
Unsuccessful frames in data to be removed: 3.835461318536759%

Unique videos: 189
Total processed frames: 2362555
Avg frames per video: 12500.291005291005
Memory used: 0.5368733964860439 GB

Preparing Dataloader
Preparing Data


Train: 100%|██████████| 107/107 [00:28<00:00,  3.70it/s]
Val: 100%|██████████| 35/35 [00:09<00:00,  3.60it/s]
Test: 100%|██████████| 47/47 [00:13<00:00,  3.61it/s]


DAIC-WOZ Data Distribution
0:240, 1:93
ratio: 2.5806451612903225





**Behavioural Dataset (For Further Generalization Testing)**

In [4]:
# Load Behavioural dataset
# The DAIC-WOZ train split table is handed to the pipeline below
# (how the pipeline uses it is defined in modules.monash_data_pipeline).
daic_woz_train_split_df = pd.read_csv("../data/au_mfcc/DAIC-WOZ_Participant_Voiced/train_split_Depression_AVEC2017.csv")

# All preprocessing settings are taken from the AU-MFCC config so that this
# dataset is prepared with the same parameters as the DAIC-WOZ data.
gen_data_pipeline = GeneralizationTestingDataPipeline(
    au_dir="../data/au_mfcc/OpenFaceAnnotations_Participant_Voiced/",
    mfcc_dir="../data/au_mfcc/MFCCAnnotations/60_60_60_60/",
    keep_AU_cols=config["DAIC_WOZ_READER"]["keep_AU_cols"],
    daic_woz_train_split_df=daic_woz_train_split_df,
    au_separate=config["DataPipeline"]["au_separate"],
    au_fixed_length=config["DataPipeline"]["au_fixed_length"],
    mfcc_fixed_length=config["DataPipeline"]["mfcc_fixed_length"],
    daic_woz_mfcc_dir=config["DataPipeline"]["mfcc_dir"],
    segment_config=config["DataPipeline"]["segment_config"],
    batch_size=config["DataPipeline"]["batch_size"]
)

gen_dataloaders = gen_data_pipeline.dataloaders

# Tally the labels of the test split.
# Any "Category" value other than 0 is counted as class 1.
gen_test_dataset = gen_dataloaders["test"].dataset
cat_0 = 0
cat_1 = 0

for sample_no in range(len(gen_test_dataset)):
    if gen_test_dataset[sample_no]["Category"] == 0:
        cat_0 += 1
    else:
        cat_1 += 1

print("\nBehavioural Dataset Data Distribution")
print(f"0:{cat_0}, 1:{cat_1}")
# Guard the ratio: a dataset without class-1 samples would otherwise
# abort the cell with ZeroDivisionError.
if cat_1:
    print(f"ratio: {cat_0/cat_1}")
else:
    print("ratio: undefined (no class 1 samples)")

Preparing Data


TEST: 100%|██████████| 7/7 [00:01<00:00,  5.37it/s]


Behavioural Dataset Data Distribution
0:15, 1:32
ratio: 0.46875





**Testing function**

In [5]:
def test(model, version):
    """Evaluate one model on the DAIC-WOZ test split and on the behavioural dataset.

    For each dataset a TrainerTester is built around ``model`` and its
    ``test`` method is called with the checkpoint path
    ``./models/{version}/checkpoints/final_model.pth``. The confusion matrix
    and classification report from the returned metrics are printed.

    Args:
        model: Model instance to evaluate.
        version: Model directory name (e.g. "au_mfcc", "au_only", "mfcc_only").
            Its underscore-separated tokens decide which inputs are fed:
            "au" selects batch['AUs'], "mfcc" selects batch['MFCCs'].

    Returns:
        None. Results are printed only.

    Relies on the notebook globals ``device``, ``dw_dataloaders`` and
    ``gen_dataloaders``.
    """
    modalities = version.split("_")
    checkpoint_path = f"./models/{version}/checkpoints/final_model.pth"
    criterion = nn.BCEWithLogitsLoss()

    def input_adapter(batch: dict) -> dict:
        # Keep only the modalities this model version consumes, moved to the device
        inputs = {}
        if "au" in modalities:
            inputs['au_input'] = batch['AUs'].to(device)
        if "mfcc" in modalities:
            inputs['mfcc_input'] = batch['MFCCs'].to(device)
        return inputs

    def evaluate(dataloaders):
        # Build a tester around the given dataloaders and run it with the checkpoint path
        runner = TrainerTester(
            model=model,
            dataloaders=dataloaders,
            device=device,
            criterion=criterion,
            optimizer=None,
            input_adapter=input_adapter,
        )
        return runner.test(checkpoint_path)

    # DAIC-WOZ test split
    print("DAIC-WOZ TEST SPLIT DATA")
    dw_metrics = evaluate(dw_dataloaders)
    print("\nConfusion Matrix\n", dw_metrics["confusion_matrix"])
    print("\nClassification Report\n", dw_metrics["classification_report"], "\n")

    # Behavioural (generalization) dataset
    # NOTE(review): the header below is misspelled ("BEHAVOIURAL"); it is kept
    # unchanged so it matches the outputs already recorded in this notebook.
    print("BEHAVOIURAL DATASET DATA")
    gen_metrics = evaluate(gen_dataloaders)
    print("\nConfusion Matrix\n", gen_metrics["confusion_matrix"])
    print("\nClassification Report\n", gen_metrics["classification_report"])

# AU_MFCC

In [6]:
# Model
# Use a dedicated name for this model's config instead of rebinding the shared
# `config` global: the data-loading cells read `config` (with include_gender
# enabled), so overwriting it here would change what they load on a re-run.
au_mfcc_config = load_config("./models/au_mfcc/logs/config.json")
au_mfcc_model_trained = au_mfcc_model(**au_mfcc_config["Model"])

test(au_mfcc_model_trained, "au_mfcc")

DAIC-WOZ TEST SPLIT DATA
Loaded model states from ./models/au_mfcc/checkpoints/final_model.pth


  checkpoint = torch.load(path, map_location=self.device)



Confusion Matrix
 [[228  12]
 [ 77  16]]

Classification Report
               precision    recall  f1-score   support

           0       0.75      0.95      0.84       240
           1       0.57      0.17      0.26        93

    accuracy                           0.73       333
   macro avg       0.66      0.56      0.55       333
weighted avg       0.70      0.73      0.68       333
 

BEHAVOIURAL DATASET DATA
Loaded model states from ./models/au_mfcc/checkpoints/final_model.pth

Confusion Matrix
 [[ 5 10]
 [10 22]]

Classification Report
               precision    recall  f1-score   support

           0       0.33      0.33      0.33        15
           1       0.69      0.69      0.69        32

    accuracy                           0.57        47
   macro avg       0.51      0.51      0.51        47
weighted avg       0.57      0.57      0.57        47



  checkpoint = torch.load(path, map_location=self.device)


# AU Only

No missing samples when using AU features from AU-MFCC paired version.

In [7]:
# Model
# Use a dedicated name for this model's config instead of rebinding the shared
# `config` global: the data-loading cells read `config` (the AU-MFCC config),
# so overwriting it with the AU-only config would change what they load on a re-run.
au_only_config = load_config("./models/au_only/logs/config.json")
au_model_trained = au_model(**au_only_config["Model"])

test(au_model_trained, "au_only")

DAIC-WOZ TEST SPLIT DATA
Loaded model states from ./models/au_only/checkpoints/final_model.pth


  checkpoint = torch.load(path, map_location=self.device)



Confusion Matrix
 [[182  58]
 [ 73  20]]

Classification Report
               precision    recall  f1-score   support

           0       0.71      0.76      0.74       240
           1       0.26      0.22      0.23        93

    accuracy                           0.61       333
   macro avg       0.49      0.49      0.48       333
weighted avg       0.59      0.61      0.60       333
 

BEHAVOIURAL DATASET DATA
Loaded model states from ./models/au_only/checkpoints/final_model.pth

Confusion Matrix
 [[13  2]
 [27  5]]

Classification Report
               precision    recall  f1-score   support

           0       0.33      0.87      0.47        15
           1       0.71      0.16      0.26        32

    accuracy                           0.38        47
   macro avg       0.52      0.51      0.36        47
weighted avg       0.59      0.38      0.33        47



  checkpoint = torch.load(path, map_location=self.device)


# MFCC Only

Note: One fewer depressed sample is evaluated here than with the MFCC-only dataloader, because the AU-MFCC paired dataloader is used.

In [8]:
# Model
# Use a dedicated name for this model's config instead of rebinding the shared
# `config` global: the data-loading cells read `config` (the AU-MFCC config),
# so overwriting it with the MFCC-only config would change what they load on a re-run.
mfcc_only_config = load_config("./models/mfcc_only/logs/config.json")
mfcc_model_trained = mfcc_model(**mfcc_only_config["Model"])

test(mfcc_model_trained, "mfcc_only")

DAIC-WOZ TEST SPLIT DATA
Loaded model states from ./models/mfcc_only/checkpoints/final_model.pth


  checkpoint = torch.load(path, map_location=self.device)



Confusion Matrix
 [[194  46]
 [ 70  23]]

Classification Report
               precision    recall  f1-score   support

           0       0.73      0.81      0.77       240
           1       0.33      0.25      0.28        93

    accuracy                           0.65       333
   macro avg       0.53      0.53      0.53       333
weighted avg       0.62      0.65      0.63       333
 

BEHAVOIURAL DATASET DATA
Loaded model states from ./models/mfcc_only/checkpoints/final_model.pth

Confusion Matrix
 [[ 4 11]
 [ 8 24]]

Classification Report
               precision    recall  f1-score   support

           0       0.33      0.27      0.30        15
           1       0.69      0.75      0.72        32

    accuracy                           0.60        47
   macro avg       0.51      0.51      0.51        47
weighted avg       0.57      0.60      0.58        47



  checkpoint = torch.load(path, map_location=self.device)


# Statistical Tests

In [9]:
def model_predictions(model, version, dataloaders):
    """Return the test predictions of ``model`` on ``dataloaders``.

    Args:
        model: Model instance to run.
        version: Model directory name (e.g. "au_mfcc"). Its underscore-separated
            tokens decide which inputs are fed: "au" selects batch['AUs'],
            "mfcc" selects batch['MFCCs'].
        dataloaders: Dataloaders handed to TrainerTester.

    Returns:
        Whatever ``TrainerTester.test_predictions()`` returns; callers in this
        notebook unpack it as ``(preds, labels)``.

    NOTE(review): no checkpoint path is passed anywhere in this function, so the
    predictions presumably come from whatever weights ``model`` currently
    holds (e.g. those loaded by an earlier ``test()`` call) — confirm.
    """
    modalities = version.split("_")
    # (version token, model keyword argument, batch key)
    modality_routes = (
        ("au", "au_input", "AUs"),
        ("mfcc", "mfcc_input", "MFCCs"),
    )

    def input_adapter(batch: dict) -> dict:
        # Keep only the modalities this model version consumes, moved to the device
        return {
            input_name: batch[batch_key].to(device)
            for token, input_name, batch_key in modality_routes
            if token in modalities
        }

    runner = TrainerTester(
        model=model,
        dataloaders=dataloaders,
        device=device,
        criterion=nn.BCEWithLogitsLoss(),
        optimizer=None,
        input_adapter=input_adapter,
    )
    return runner.test_predictions()


def mcnemear_test(true_labels, model_1_preds, model_2_preds):
    """Return the p-value of McNemar's test between two models' predictions.

    Args:
        true_labels: Ground-truth labels.
        model_1_preds: Predictions of the first model.
        model_2_preds: Predictions of the second model.

    Returns:
        The ``pvalue`` attribute of the statsmodels McNemar result, computed
        with ``exact=True``.
    """
    # 2x2 table of where each model is right or wrong relative to the targets
    table = mcnemar_table(
        y_target=true_labels,
        y_model1=model_1_preds,
        y_model2=model_2_preds,
    )
    return mcnemar(table, exact=True).pvalue


def performance_comparison(trained_models, versions, dataloaders):
    """Print McNemar p-values for every pair of models on the same data.

    Predictions are collected from each model via ``model_predictions`` and
    every unordered pair of models is compared with ``mcnemear_test``.

    Args:
        trained_models: Model instances, in the same order as ``versions``.
        versions: Model version names (e.g. "au_only"); also used as labels
            in the printed output.
        dataloaders: Dataloaders every model is evaluated on.

    Returns:
        None. One line per model pair is printed.

    Raises:
        ValueError: If a model's returned labels differ from the labels
            returned for the first model. McNemar's test compares the models
            sample by sample, so all predictions must refer to the same
            samples in the same order.
    """
    # Get true labels and predictions of each model
    true_labels = None
    predictions = []
    for i in range(len(versions)):
        preds, labels = model_predictions(trained_models[i], versions[i], dataloaders)
        labels = np.array(labels)

        if true_labels is None:
            true_labels = labels
        elif not np.array_equal(true_labels, labels):
            # Misaligned samples would make the paired test meaningless
            raise ValueError(
                f"Labels returned for {versions[i]} do not match those returned for "
                f"{versions[0]}; predictions are not paired sample-by-sample."
            )

        predictions.append(np.array(preds))

    # Perform McNemar test on pair-wise combinations of models
    for i in range(len(versions)):
        for j in range(i+1, len(versions)):
            p_value = mcnemear_test(true_labels, predictions[i], predictions[j])
            print(f"p-value of {versions[i]} & {versions[j]}: {p_value}")


In [10]:
# Pair each version name with its model so the two lists cannot drift apart
model_registry = [
    ("au_only", au_model_trained),
    ("mfcc_only", mfcc_model_trained),
    ("au_mfcc", au_mfcc_model_trained),
]
versions = [name for name, _ in model_registry]
trained_models = [net for _, net in model_registry]

# Pairwise McNemar tests on the DAIC-WOZ dataloaders
print("DAIC-WOZ Dataset McNemar Test")
performance_comparison(trained_models=trained_models, versions=versions, dataloaders=dw_dataloaders)

DAIC-WOZ Dataset McNemar Test
p-value of au_only & mfcc_only: 0.18759872474577577
p-value of au_only & au_mfcc: 1.0902049191089246e-05
p-value of mfcc_only & au_mfcc: 0.0001980394513374506


In [11]:
# Pairwise McNemar tests on the behavioural (generalization) dataloaders
print("Behavioural Dataset McNemar Test")
performance_comparison(
    trained_models=trained_models,
    versions=versions,
    dataloaders=gen_dataloaders,
)

Behavioural Dataset McNemar Test
p-value of au_only & mfcc_only: 0.09873714670538905
p-value of au_only & au_mfcc: 0.12207812070846558
p-value of mfcc_only & au_mfcc: 1.0


# Gender Analysis

In [28]:
# Read the "Gender" field of every DAIC-WOZ test sample, in dataset order
# NOTE(review): presumably this field exists only because include_gender was
# enabled in the config before the data was loaded — confirm.
dw_test_samples = dw_dataloaders["test"].dataset
genders = np.array(
    [dw_test_samples[idx]["Gender"] for idx in range(len(dw_test_samples))]
)

In [29]:
# Predictions of the third registered model (index 2) on the DAIC-WOZ dataloaders
au_mfcc_outputs = model_predictions(trained_models[2], versions[2], dw_dataloaders)
preds, labels = (np.array(values) for values in au_mfcc_outputs)

# Gender value 0 is treated as male, 1 as female
# NOTE(review): indexing assumes `genders` is in the same sample order as
# `preds`/`labels` — confirm the test dataloader is not shuffled.
male_idx = np.nonzero(genders == 0)[0]
female_idx = np.nonzero(genders == 1)[0]

male_preds, male_labels = preds[male_idx], labels[male_idx]
female_preds, female_labels = preds[female_idx], labels[female_idx]

In [33]:
# Males: confusion matrix and per-class metrics on the male subset
cm, report = (
    confusion_matrix(y_true=male_labels, y_pred=male_preds),
    classification_report(y_true=male_labels, y_pred=male_preds),
)

print("\nConfusion Matrix\n", cm)
print("\nClassification Report\n", report)



Confusion Matrix
 [[132   7]
 [ 40  16]]

Classification Report
               precision    recall  f1-score   support

           0       0.77      0.95      0.85       139
           1       0.70      0.29      0.41        56

    accuracy                           0.76       195
   macro avg       0.73      0.62      0.63       195
weighted avg       0.75      0.76      0.72       195



In [32]:
# Females: confusion matrix and per-class metrics on the female subset
cm, report = (
    confusion_matrix(y_true=female_labels, y_pred=female_preds),
    classification_report(y_true=female_labels, y_pred=female_preds),
)

print("\nConfusion Matrix\n", cm)
print("\nClassification Report\n", report)


Confusion Matrix
 [[96  5]
 [37  0]]

Classification Report
               precision    recall  f1-score   support

           0       0.72      0.95      0.82       101
           1       0.00      0.00      0.00        37

    accuracy                           0.70       138
   macro avg       0.36      0.48      0.41       138
weighted avg       0.53      0.70      0.60       138

