In [1]:
def concatenate_dataframes(dfs, id_col='FileName'):
    """
    Merge a list of DataFrames side by side, matching rows on an ID column.

    The 'Prediction' column is dropped from every frame when present. The
    first frame keeps its column names; in the frame at position k (k >= 1)
    every column other than the ID column is renamed to ``<name>_exp<k>``.
    Frames are combined with an outer join, so an ID that is missing from a
    frame produces NaN in that frame's columns. The input frames themselves
    are not modified.

    Parameters:
        dfs (list): List of pandas DataFrames to merge. Each must contain
            the ID column.
        id_col (str): Name of the ID column shared by all frames.
            Defaults to 'FileName'.

    Returns:
        pd.DataFrame: A single DataFrame with the merged data.
        list: All column names of the merged frame except the ID column,
            in the order they appear in the merged frame.

    Raises:
        ValueError: If `dfs` is empty, or if any frame lacks the ID column.
    """
    if not dfs:
        raise ValueError("The list of DataFrames is empty.")

    # Validate everything up front so a bad input fails before any merging.
    for position, frame in enumerate(dfs):
        if id_col not in frame.columns:
            raise ValueError(
                f"Each DataFrame must contain a '{id_col}' column "
                f"(missing in DataFrame at position {position})."
            )

    # Drop 'Prediction' if it exists; drop() returns new frames, so the
    # caller's objects stay untouched.
    cleaned = [frame.drop(columns=['Prediction'], errors='ignore') for frame in dfs]

    # The first frame is the base and keeps its original column names.
    result = cleaned[0].copy()

    for idx, frame in enumerate(cleaned[1:], start=1):
        # Suffix every non-ID column so columns from different frames
        # do not collide in the merged result.
        suffixed = frame.rename(
            columns={col: f"{col}_exp{idx}" for col in frame.columns if col != id_col}
        )
        # DataFrame.merge (rather than pd.merge) keeps this function free of
        # any dependency on a module-level `pd` name.
        result = result.merge(suffixed, on=id_col, how='outer')

    column_names = [col for col in result.columns if col != id_col]

    return result, column_names

In [2]:
import pandas as pd
import numpy as np

In [9]:
# Experiment directory names; the order here fixes the order of the loaded
# frames in `dfs`.
experiments_list = [
    'multimodalmodel_w2v2xls2b_robertalarge_lazyload_lr1e4_head1',
    'multimodalmodel_hubertxlarge_robertalarge_lazyload_lr1e4_head1',
    'multimodalmodel_whisperlarge_robertalarge_lazyload_lr1e4_1head',
    'multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4_head1',
    'multimodalmodel_w2v2robust_robertalarge_lazyload_lr1e4_head1',
    'multimodalmodel_whisperlarge_debertaxxlarge_lazyload_lr1e5_lasthidden_head1',
    'multimodalmodel_whisperlarge_hubertxlarge_lazyload_lr1e4_head1',
]

# One DataFrame per experiment, read from that experiment's results/test.csv.
dfs = [
    pd.read_csv(f'../experiments/{exp}/results/test.csv')
    for exp in experiments_list
]
# preds.head()

In [10]:
# Merge the per-experiment frames on FileName. `cols` holds every merged
# column name except FileName.
preds, cols = concatenate_dataframes(dfs)

In [11]:
# Preview the first rows of the merged frame.
preds.head()

Unnamed: 0,FileName,class_0_prob,class_1_prob,class_2_prob,class_3_prob,class_4_prob,class_5_prob,class_6_prob,class_7_prob,class_0_prob_exp1,...,class_6_prob_exp5,class_7_prob_exp5,class_0_prob_exp6,class_1_prob_exp6,class_2_prob_exp6,class_3_prob_exp6,class_4_prob_exp6,class_5_prob_exp6,class_6_prob_exp6,class_7_prob_exp6
0,MSP-PODCAST_test3_0001.wav,-1.7899,3.9195,-0.8753,-0.7989,-0.5224,-0.7707,-0.0343,0.1701,-2.0672,...,-0.3431,1.1875,-1.1112,3.8106,0.5372,-1.6856,-0.5668,-1.4441,0.0636,1.6509
1,MSP-PODCAST_test3_0002.wav,0.3811,-1.1263,-0.6913,-0.3894,-0.4613,-0.3438,0.9502,0.6184,0.0798,...,0.9238,0.8029,1.7396,-1.5681,0.0894,-0.0876,-3.7914,-0.4832,2.4011,0.2819
2,MSP-PODCAST_test3_0003.wav,-2.5936,4.6975,-0.9418,-0.6833,-0.8891,-0.6856,-0.4373,2.1556,-3.5373,...,-0.682,2.9884,-3.5907,4.0058,-0.7071,-1.8682,0.1823,-2.1827,-0.532,2.5335
3,MSP-PODCAST_test3_0004.wav,-0.0474,1.2507,-1.0941,-0.5737,-1.4338,0.1314,1.0995,0.1819,-0.4396,...,0.5,0.046,-0.2366,0.7377,-1.4704,1.3243,-1.6718,-0.2187,1.1227,-0.0456
4,MSP-PODCAST_test3_0005.wav,-2.6105,0.9839,0.7885,0.6326,-0.2861,-0.8477,-0.1272,1.997,-2.7698,...,0.502,1.4913,-3.1744,1.6253,1.0291,1.0523,0.0026,-3.3462,-0.4237,2.0902


In [12]:
# Load the previously saved model from disk.
# NOTE(review): joblib.load deserialises via pickle, which can execute
# arbitrary code — only load files from a trusted source.
import joblib
loaded_model = joblib.load('../submissions/svm_model_stackingv2_augmented.pkl')

In [13]:
# Run the loaded model on every merged column except FileName.
# NOTE(review): presumably the model was fit on these same columns in this
# same order (which follows experiments_list) — confirm.
predictions = loaded_model.predict(preds[cols])

In [14]:
# Attach the model output as a new column. `cols` was computed before this
# assignment, so it does not include 'Prediction'.
preds['Prediction'] = predictions

In [15]:
# # preds = get_probability_vote(dfs)
# preds = get_probability_vote_softmax(dfs)
# Class letters; the letter at position i is the label for integer prediction i.
# NOTE(review): presumably this order matches the label encoding the model
# was trained with — confirm.
classes_ = ['A', 'S', 'H', 'U', 'F', 'D', 'C', 'N']

# Integer prediction -> class letter lookup.
map_argmax = dict(enumerate(classes_))

# Translate each integer prediction into its class letter.
preds["EmoClass"] = preds["Prediction"].map(map_argmax)

In [16]:
# Preview the frame again to check the added Prediction and EmoClass columns.
preds.head()

Unnamed: 0,FileName,class_0_prob,class_1_prob,class_2_prob,class_3_prob,class_4_prob,class_5_prob,class_6_prob,class_7_prob,class_0_prob_exp1,...,class_0_prob_exp6,class_1_prob_exp6,class_2_prob_exp6,class_3_prob_exp6,class_4_prob_exp6,class_5_prob_exp6,class_6_prob_exp6,class_7_prob_exp6,Prediction,EmoClass
0,MSP-PODCAST_test3_0001.wav,-1.7899,3.9195,-0.8753,-0.7989,-0.5224,-0.7707,-0.0343,0.1701,-2.0672,...,-1.1112,3.8106,0.5372,-1.6856,-0.5668,-1.4441,0.0636,1.6509,1,S
1,MSP-PODCAST_test3_0002.wav,0.3811,-1.1263,-0.6913,-0.3894,-0.4613,-0.3438,0.9502,0.6184,0.0798,...,1.7396,-1.5681,0.0894,-0.0876,-3.7914,-0.4832,2.4011,0.2819,6,C
2,MSP-PODCAST_test3_0003.wav,-2.5936,4.6975,-0.9418,-0.6833,-0.8891,-0.6856,-0.4373,2.1556,-3.5373,...,-3.5907,4.0058,-0.7071,-1.8682,0.1823,-2.1827,-0.532,2.5335,1,S
3,MSP-PODCAST_test3_0004.wav,-0.0474,1.2507,-1.0941,-0.5737,-1.4338,0.1314,1.0995,0.1819,-0.4396,...,-0.2366,0.7377,-1.4704,1.3243,-1.6718,-0.2187,1.1227,-0.0456,1,S
4,MSP-PODCAST_test3_0005.wav,-2.6105,0.9839,0.7885,0.6326,-0.2861,-0.8477,-0.1272,1.997,-2.7698,...,-3.1744,1.6253,1.0291,1.0523,0.0026,-3.3462,-0.4237,2.0902,7,N


In [23]:
# Fraction of rows assigned to each predicted class.
preds["EmoClass"].value_counts(normalize=True)

EmoClass
A    0.284687
S    0.145000
H    0.123750
D    0.107188
U    0.103438
N    0.100312
C    0.070938
F    0.064687
Name: proportion, dtype: float64

In [18]:
# Keep only the two columns compared with the baseline and written out below.
# This rebinds `preds`, so the feature columns in `cols` are no longer
# available on it; rerun from the merge cell before predicting again.
preds = preds[['FileName','EmoClass']]

In [19]:
# Load the baseline submission CSV that the cells below compare `preds` against.
baseline = pd.read_csv("../submissions/baseline_reprod_cat.csv")

In [20]:
# Fraction of rows per class in the baseline, for comparison with `preds`.
baseline["EmoClass"].value_counts(normalize=True)

EmoClass
A    0.314688
S    0.188438
H    0.141563
C    0.125000
N    0.122813
D    0.057813
U    0.040313
F    0.009375
Name: proportion, dtype: float64

In [21]:
# Show both column indexes side by side to compare the two layouts.
baseline.columns, preds.columns

(Index(['FileName', 'EmoClass'], dtype='object'),
 Index(['FileName', 'EmoClass'], dtype='object'))

In [22]:
# Count the positions at which baseline and preds carry the same FileName.
# NOTE(review): compare this count against len(preds) to confirm that every
# row lines up — the expected total is not asserted here.
(baseline.FileName == preds.FileName).sum()

3200

In [24]:
# Order rows by FileName with a fresh 0..n-1 index, then write the
# submission file without the index column.
preds = preds.sort_values('FileName', ignore_index=True)
preds.to_csv("../submissions/bimodal_ensemble_v2_augmented.csv", index=False)