# Checking Discrepancies

The code below explores the discrepancy between the duration covered by the frames in the AU_CLNF files and the duration of the audio recording and transcript for participants 402 and 420.

Conclusion: the AU_CLNF files for both participants are cut short, i.e. they end before the corresponding transcript does.

In [1]:
import sys
import os
import numpy as np
import pandas as pd

# Add the `depression_detection` directory, resolved two levels above the
# current working directory, to the import search path so the local import
# below can be found. Presumably daic_woz_reader.py lives in that directory;
# confirm against the repository layout.
sys.path.append(os.path.abspath(os.path.join('..', '..', 'depression_detection')))
from daic_woz_reader import DAIC_WOZ_READER

## Full DAIC-WOZ

In [None]:
# Reader over the raw DAIC-WOZ directory with speaking_only switched off.
# NOTE(review): presumably this keeps every AU frame rather than only the frames
# where the participant is speaking; confirm in DAIC_WOZ_READER.
dw_reader_full = DAIC_WOZ_READER(
    directory="../../data/DAIC-WOZ/",
    speaking_only=False,
    keep_AU_cols=[
        # "_r" columns (presumably AU intensity; confirm against the AU_CLNF header)
        "AU01_r", "AU02_r", "AU04_r", "AU05_r", "AU06_r", "AU09_r", "AU10_r",
        "AU12_r", "AU14_r", "AU15_r", "AU17_r", "AU20_r", "AU25_r", "AU26_r",
        # "_c" columns (presumably AU presence; confirm against the AU_CLNF header)
        "AU04_c", "AU12_c", "AU15_c", "AU23_c", "AU28_c", "AU45_c",
    ],
    # Label -> [low, high] PHQ-8 range (assumed inclusive; confirm in the reader).
    phq8_categories={0: [0, 9], 1: [10, 24]},
    remove_unsuccessful=False,
)

Reading Participant ID: 492...

In [40]:
# Participant 402: compare the time covered by the AU frames (distinct timestamps
# divided by an assumed 30 frames per second) with the last timestamp in the data.
ts_402 = dw_reader_full.au_df.loc[dw_reader_full.au_df["Participant_ID"] == "402", "timestamp"]
print("402 Frames Duration:", ts_402.nunique() / 30)
print("402 Final Timestamp:", ts_402.tail(1))
# The two literals are hand-typed from the values printed by the lines above
# (the duration is truncated to one decimal place).
print("Diff:", str(758.8 - 833.267))

402 Frames Duration: 758.8666666666667
402 Final Timestamp: 2600742    833.267
Name: timestamp, dtype: float64
Diff: -74.4670000000001


In [41]:
# Participant 420: compare the time covered by the AU frames (distinct timestamps
# divided by an assumed 30 frames per second) with the last timestamp in the data.
ts_420 = dw_reader_full.au_df.loc[dw_reader_full.au_df["Participant_ID"] == "420", "timestamp"]
print("420 Frames Duration:", ts_420.nunique() / 30)
print("420 Final Timestamp:", ts_420.tail(1))
# The two literals are hand-typed from the values printed by the lines above.
print("Diff:", str(674.6 - 706.367))

420 Frames Duration: 674.6
420 Final Timestamp: 3100472    706.367
Name: timestamp, dtype: float64
Diff: -31.76699999999994


## Speaking Only

In [None]:
# Same reader configuration as the full-dataset reader, but with speaking_only
# switched on.
# NOTE(review): presumably this restricts the AU frames to the intervals where the
# participant is speaking; confirm in DAIC_WOZ_READER.
dw_reader = DAIC_WOZ_READER(
    directory="../../data/DAIC-WOZ/",
    speaking_only=True,
    keep_AU_cols=[
        # "_r" columns (presumably AU intensity; confirm against the AU_CLNF header)
        "AU01_r", "AU02_r", "AU04_r", "AU05_r", "AU06_r", "AU09_r", "AU10_r",
        "AU12_r", "AU14_r", "AU15_r", "AU17_r", "AU20_r", "AU25_r", "AU26_r",
        # "_c" columns (presumably AU presence; confirm against the AU_CLNF header)
        "AU04_c", "AU12_c", "AU15_c", "AU23_c", "AU28_c", "AU45_c",
    ],
    # Label -> [low, high] PHQ-8 range (assumed inclusive; confirm in the reader).
    phq8_categories={0: [0, 9], 1: [10, 24]},
    remove_unsuccessful=False,
)

Reading Participant ID: 492...

In [14]:
# Pull the speaking-only reader's attributes into short names for the cells below.
split_dfs, au_df = dw_reader.split_dfs, dw_reader.au_df

In [44]:
# Seconds of AU data for participant 402 in the speaking-only frame:
# distinct timestamps divided by an assumed 30 frames per second.
au_df.loc[au_df["Participant_ID"] == "402", "timestamp"].nunique() / 30

395.8333333333333

In [16]:
# Seconds of AU data for participant 420 in the speaking-only frame:
# distinct timestamps divided by an assumed 30 frames per second.
au_df.loc[au_df["Participant_ID"] == "420", "timestamp"].nunique() / 30

275.26666666666665

## Transcript

In [None]:
# Load participant 402's transcript. Rows are split on runs of whitespace and only
# the first three fields (start, stop, speaker) are kept; the file's own header
# row is skipped and replaced with explicit column names.
df = pd.read_csv(
    "../../data/DAIC-WOZ/402_P/402_TRANSCRIPT.csv",
    sep=r"\s+",
    engine="python",
    header=None,
    skiprows=1,
    names=["start_time", "stop_time", "speaker"],
    usecols=[0, 1, 2],
)

# Total time the participant is speaking according to the transcript:
# sum of (stop - start) over the participant's rows.
participant_turns = df[df["speaker"] == "Participant"]
times = participant_turns["stop_time"] - participant_turns["start_time"]
sum(times)

447.60000000000014

In [None]:
# Keep only participant turns that start before 833.267, the final AU timestamp
# printed for participant 402 above. The resulting total (395.30) is close to the
# speaking-only AU duration for 402 (395.83).
starts_before_au_end = df["start_time"] < 833.267
df = df[(df["speaker"] == "Participant") & starts_before_au_end]

# df now contains participant rows only, so no further speaker filter is needed.
times = df["stop_time"] - df["start_time"]
sum(times)

395.3000000000002

In [None]:
# Load participant 420's transcript. Rows are split on runs of whitespace and only
# the first three fields (start, stop, speaker) are kept; the file's own header
# row is skipped and replaced with explicit column names.
df = pd.read_csv(
    "../../data/DAIC-WOZ/420_P/420_TRANSCRIPT.csv",
    sep=r"\s+",
    engine="python",
    header=None,
    skiprows=1,
    names=["start_time", "stop_time", "speaker"],
    usecols=[0, 1, 2],
)

# Total time the participant is speaking according to the transcript:
# sum of (stop - start) over the participant's rows.
participant_turns = df[df["speaker"] == "Participant"]
times = participant_turns["stop_time"] - participant_turns["start_time"]
sum(times)

383.82999999999976

In [None]:
# Keep only participant turns that start before 706.367, the final AU timestamp
# printed for participant 420 above. The resulting total (275.19) is close to the
# speaking-only AU duration for 420 (275.27).
starts_before_au_end = df["start_time"] < 706.367
df = df[(df["speaker"] == "Participant") & starts_before_au_end]

# df now contains participant rows only, so no further speaker filter is needed.
times = df["stop_time"] - df["start_time"]
sum(times)

275.19499999999977

# Comparing Processed MFCC & AU Files (with further non-voiced removal)

This extra section checks whether the duration of the extracted MFCCs matches the duration of the corresponding AUs.

In [None]:
# Length of participant 402's MFCC array along its second axis, converted to
# seconds by dividing by 16 + 2/3.
# NOTE(review): 16 + 2/3 is presumably the MFCC frame rate (one frame per 60 ms,
# cf. the "60_60_60_60" directory name); confirm against the extraction settings.
n_mfcc_frames = np.load(f"../../data/au_mfcc/DAIC-WOZ_MFCC/60_60_60_60/{402}_P/{402}_MFCC.npy").shape[1]
mfcc_len = n_mfcc_frames / (16 + 2/3)
mfcc_len

362.21999999999997

In [None]:
# Length of participant 420's MFCC array along its second axis, converted to
# seconds by dividing by 16 + 2/3.
# NOTE(review): 16 + 2/3 is presumably the MFCC frame rate (one frame per 60 ms,
# cf. the "60_60_60_60" directory name); confirm against the extraction settings.
n_mfcc_frames = np.load(f"../../data/au_mfcc/DAIC-WOZ_MFCC/60_60_60_60/{420}_P/{420}_MFCC.npy").shape[1]
mfcc_len = n_mfcc_frames / (16 + 2/3)
mfcc_len

250.55999999999997

In [None]:
# Reader over the processed "DAIC-WOZ_Participant_Voiced" directory.
# speaking_only is off here.
# NOTE(review): presumably these files were already reduced to the participant's
# voiced segments during preprocessing; confirm against the preprocessing step.
dw_reader_participant_voiced = DAIC_WOZ_READER(
    directory="../../data/au_mfcc/DAIC-WOZ_Participant_Voiced/",
    speaking_only=False,
    keep_AU_cols=[
        # "_r" columns (presumably AU intensity; confirm against the AU_CLNF header)
        "AU01_r", "AU02_r", "AU04_r", "AU05_r", "AU06_r", "AU09_r", "AU10_r",
        "AU12_r", "AU14_r", "AU15_r", "AU17_r", "AU20_r", "AU25_r", "AU26_r",
        # "_c" columns (presumably AU presence; confirm against the AU_CLNF header)
        "AU04_c", "AU12_c", "AU15_c", "AU23_c", "AU28_c", "AU45_c",
    ],
    # Label -> [low, high] PHQ-8 range (assumed inclusive; confirm in the reader).
    phq8_categories={0: [0, 9], 1: [10, 24]},
    remove_unsuccessful=False,
)

# AU frame table used by the MFCC comparison below.
au_df2 = dw_reader_participant_voiced.au_df

Reading Participant ID: 492...

In [None]:
# For every participant, compare the duration implied by the MFCC array with the
# duration implied by the AU frame count, print any participant whose two
# durations differ by a second or more, then summarise the differences.
AU_FPS = 30  # AU frame rate assumed throughout this notebook
MFCC_FPS = 16 + 2/3  # presumably one MFCC frame per 60 ms; confirm against the extraction settings

# Count the non-null AU timestamps of all participants in a single pass instead of
# re-filtering the whole frame once per participant. sort=False keeps the
# participants in order of first appearance, the same order .unique() yields.
au_frame_counts = au_df2.groupby("Participant_ID", sort=False)["timestamp"].count()

time_diffs = []
for pid, n_au_frames in au_frame_counts.items():
    au_len = n_au_frames / AU_FPS

    # The second axis of the saved MFCC array is treated as the frame axis.
    mfcc_len = np.load(f"../../data/au_mfcc/DAIC-WOZ_MFCC/60_60_60_60/{pid}_P/{pid}_MFCC.npy").shape[1] / MFCC_FPS

    diff = mfcc_len - au_len
    time_diffs.append(diff)

    # Only report mismatches of at least one second.
    if abs(diff) >= 1:
        print(pid, diff)

print("absolute mean:", np.mean(np.abs(time_diffs)))
print("mean:", np.mean(time_diffs))

absolute mean: 0.08532627865959913
mean: 0.035238095238067625
