In [7]:
import os
import glob
import pandas as pd

In [8]:
# Root folders of the two corpora, given relative to the notebook's working
# directory. File names are appended to these with os.path.join in later cells.
daic_dir = "../datasets/DAIC-WOZ/"
edaic_dir = "../datasets/EDAIC-WOZ/"

In [9]:
# Collect (Participant_ID, PHQ_Binary) pairs from every labelled DAIC-WOZ CSV.
#
# Each file may name its label/score columns either PHQ8_Binary/PHQ8_Score or
# PHQ_Binary/PHQ_Score; the first pair whose binary column is present is used.
PHQ_COLUMN_VARIANTS = (
    ('PHQ8_Binary', 'PHQ8_Score'),
    ('PHQ_Binary', 'PHQ_Score'),
)
# A score at or above this value with a binary label of 0 is treated as a
# labelling inconsistency and the label is corrected to 1.
PHQ_SCORE_CUTOFF = 10

# sorted(): glob returns files in filesystem-dependent order, and the
# de-duplication performed after this loop keeps the first occurrence of each
# participant, so a stable order is needed for reproducible output.
csv_files = sorted(glob.glob(os.path.join(daic_dir, "*.csv")))
daic_dfs = []

for csv_file in csv_files:
    filename = os.path.basename(csv_file)
    # NOTE(review): this split is excluded by name; presumably it carries no
    # labels -- confirm against the dataset documentation.
    if filename == "test_split_Depression_AVEC2017.csv":
        continue

    df = pd.read_csv(csv_file)

    # Resolve the column names for THIS file only. Resetting them on every
    # iteration prevents a name left over from the previous file being reused.
    phq_binary_col = phq_score_col = None
    for binary_name, score_name in PHQ_COLUMN_VARIANTS:
        if binary_name in df.columns:
            phq_binary_col, phq_score_col = binary_name, score_name
            break

    # Any other CSV that happens to live in the folder is reported and skipped
    # instead of aborting the whole cell with a NameError/KeyError.
    if phq_binary_col is None or 'Participant_ID' not in df.columns:
        print(f"Skipping {filename}: missing Participant_ID or PHQ binary label column")
        continue

    # Check for inconsistent labels (score >= cutoff but binary label == 0).
    if phq_score_col in df.columns:
        inconsistent_mask = (df[phq_score_col] >= PHQ_SCORE_CUTOFF) & (df[phq_binary_col] == 0)
        inconsistent_count = inconsistent_mask.sum()

        if inconsistent_count > 0:
            print(f"Found {inconsistent_count} inconsistent labels in {filename}")
            # Report straight from the flagged rows so each line shows the
            # score of the row being fixed, even if an ID repeats in the file.
            flagged = df.loc[inconsistent_mask, ['Participant_ID', phq_score_col]]
            for pid, score in zip(flagged['Participant_ID'], flagged[phq_score_col]):
                print(f"  Participant {pid}: {phq_score_col}={score}, {phq_binary_col}=0 -> fixing to 1")

            # Fix inconsistent labels
            df.loc[inconsistent_mask, phq_binary_col] = 1

    # Keep only the participant ID and the binary label, with the label column
    # standardized to the name PHQ_Binary.
    df_subset = df[['Participant_ID', phq_binary_col]].rename(columns={phq_binary_col: 'PHQ_Binary'})
    daic_dfs.append(df_subset)

# Concatenate all DAIC-WOZ label tables into one frame with one row per
# participant.
if daic_dfs:
    daic_stacked = pd.concat(daic_dfs, ignore_index=True)

    # The same participant can appear in more than one file. Warn when the
    # copies disagree, because the de-duplication below silently keeps
    # whichever copy was read first.
    labels_per_participant = daic_stacked.groupby('Participant_ID')['PHQ_Binary'].nunique()
    conflicting_ids = labels_per_participant[labels_per_participant > 1].index.tolist()
    if conflicting_ids:
        print(f"Warning: conflicting PHQ_Binary labels across files for participants: {conflicting_ids}")

    # Remove duplicates (keep first occurrence)
    daic_combined = daic_stacked.drop_duplicates(subset=['Participant_ID'], keep='first')
    print(f"DAIC-WOZ combined data shape: {daic_combined.shape}")
    print(f"PHQ_Binary distribution in DAIC-WOZ:\n{daic_combined['PHQ_Binary'].value_counts()}")
else:
    # Always bind daic_combined: the merge step later in this cell uses it
    # unconditionally and would otherwise fail with a NameError. The empty
    # frame is typed as int64 so it does not force an object dtype on merge.
    print("Warning: no DAIC-WOZ label files were loaded; continuing with an empty table")
    daic_combined = pd.DataFrame({
        'Participant_ID': pd.Series(dtype='int64'),
        'PHQ_Binary': pd.Series(dtype='int64'),
    })

# Load the EDAIC-WOZ labels from all_data.csv, keeping only the participant ID
# and the binary label.
edaic_csv_path = os.path.join(edaic_dir, "all_data.csv")
if os.path.exists(edaic_csv_path):
    edaic_df = pd.read_csv(edaic_csv_path)
    # Keep only Participant_ID and PHQ_Binary columns
    edaic_subset = edaic_df[['Participant_ID', 'PHQ_Binary']].copy()
    print(f"EDAIC-WOZ data shape: {edaic_subset.shape}")
    print(f"PHQ_Binary distribution in EDAIC-WOZ:\n{edaic_subset['PHQ_Binary'].value_counts()}")
else:
    # Always bind edaic_subset: the merge step later in this cell uses it
    # unconditionally, so a missing file would otherwise surface as a
    # NameError far from its cause. The empty frame is typed as int64 so it
    # does not force an object dtype on merge.
    print(f"Warning: {edaic_csv_path} not found; continuing with an empty EDAIC-WOZ table")
    edaic_subset = pd.DataFrame({
        'Participant_ID': pd.Series(dtype='int64'),
        'PHQ_Binary': pd.Series(dtype='int64'),
    })

# Stack the DAIC-WOZ and EDAIC-WOZ label tables into a single frame and expose
# the label column under its final name, PHQ8_Binary.
print("\nMerging datasets...")
label_frames = [daic_combined, edaic_subset]
final_combined = pd.concat(label_frames, ignore_index=True).rename(
    columns={'PHQ_Binary': 'PHQ8_Binary'}
)

# Summarize the merged table: overall shape and class balance.
label_counts = final_combined['PHQ8_Binary'].value_counts()
print(f"\nFinal combined dataset shape: {final_combined.shape}")
print(f"Final PHQ8_Binary distribution:\n{label_counts}")

# Write the merged labels to CSV without the positional index column.
output_path = "../datasets/dataset.csv"
final_combined.to_csv(output_path, index=False)
print(f"\nMerged dataset saved to: {output_path}")

Found 1 inconsistent labels in train_split_Depression_AVEC2017.csv
  Participant 409: PHQ8_Score=10, PHQ8_Binary=0 -> fixing to 1
DAIC-WOZ combined data shape: (189, 2)
PHQ_Binary distribution in DAIC-WOZ:
PHQ_Binary
0    132
1     57
Name: count, dtype: int64
EDAIC-WOZ data shape: (29, 2)
PHQ_Binary distribution in EDAIC-WOZ:
PHQ_Binary
1    29
Name: count, dtype: int64

Merging datasets...

Final combined dataset shape: (218, 2)
Final PHQ8_Binary distribution:
PHQ8_Binary
0    132
1     86
Name: count, dtype: int64

Merged dataset saved to: ../datasets/dataset.csv
