In [19]:
def concatenate_dataframes(dfs, id_col='FileName'):
    """
    Merge a list of DataFrames side by side, using a shared ID column as the join key.

    The 'Prediction' column is dropped from every DataFrame (if present). The first
    DataFrame keeps its column names; the columns of the k-th following DataFrame
    (k = 1, 2, ...) get the suffix '_exp{k}' so that identically named columns from
    different experiments do not collide. The ID column is never renamed.

    Frames are combined with an outer merge, so an ID that is missing from some
    frames still produces a row, with NaN in the columns of the frames lacking it.

    Parameters:
        dfs (list): List of pandas DataFrames to merge. Each must have the ID column.
        id_col (str): Name of the ID column to merge on. Defaults to 'FileName'.

    Returns:
        pd.DataFrame: A single DataFrame with the merged data.
        list: All column names of the merged DataFrame except the ID column.

    Raises:
        ValueError: If `dfs` is empty or any DataFrame lacks the ID column.
    """
    if not dfs:
        raise ValueError("The list of DataFrames is empty.")

    for df in dfs:
        if id_col not in df.columns:
            # Name the column that is actually checked, so the message matches the test above.
            raise ValueError(f"Each DataFrame must contain a '{id_col}' column.")

    # Drop 'Prediction' from each DataFrame if it exists (errors='ignore' tolerates its absence)
    dfs = [df.drop(columns=['Prediction'], errors='ignore') for df in dfs]

    # Start with a copy of the first DataFrame so the caller's object is not modified
    result = dfs[0].copy()

    for idx, df in enumerate(dfs[1:], start=1):
        # Suffix every column except the ID column so names stay unique after merging
        renamed_df = df.rename(
            columns={col: f"{col}_exp{idx}" for col in df.columns if col != id_col}
        )

        # Outer merge keeps IDs that appear in only some of the frames
        result = pd.merge(result, renamed_df, on=id_col, how='outer')

    # Everything except the ID column
    column_names = [col for col in result.columns if col != id_col]

    return result, column_names

In [20]:
import pandas as pd
import numpy as np

In [21]:
# Experiment directory names under ../experiments/; each one is expected to
# contain a results/test.csv file.
experiments_list = [
    'multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4',
    'multimodalmodel_w2v2robust_robertalarge_lazyload_lr1e4',
    'multimodalmodel_hubertxlarge_robertalarge_lazyload_lr1e4',
    'multimodalmodel_w2v2xls2b_robertalarge_lazyload_lr1e4',
    'multimodalmodel_whisperlarge_robertalarge_lazyload_lr1e4',
    'multimodalmodel_whisperlarge_debertaxxlarge_lazyload_lr1e5_lasthidden',
]

# One DataFrame per experiment, in the same order as experiments_list.
dfs = [
    pd.read_csv(f'../experiments/{exp}/results/test.csv')
    for exp in experiments_list
]

In [23]:
# Merge the per-experiment frames on 'FileName'; `cols` receives every merged
# column name except 'FileName'.
preds, cols = concatenate_dataframes(dfs)

In [24]:
# Preview the first rows of the merged frame.
preds.head()

Unnamed: 0,FileName,class_0_prob,class_1_prob,class_2_prob,class_3_prob,class_4_prob,class_5_prob,class_6_prob,class_7_prob,class_0_prob_exp1,...,class_6_prob_exp4,class_7_prob_exp4,class_0_prob_exp5,class_1_prob_exp5,class_2_prob_exp5,class_3_prob_exp5,class_4_prob_exp5,class_5_prob_exp5,class_6_prob_exp5,class_7_prob_exp5
0,MSP-PODCAST_test3_0001.wav,-1.8813,3.5432,-1.2339,-1.8614,-3.4532,-3.3043,-1.1342,0.7817,-1.2326,...,-0.328,1.6367,-2.0023,4.4295,-0.7834,-1.5871,-0.2586,-1.4419,-0.47,1.4294
1,MSP-PODCAST_test3_0002.wav,1.8304,-3.2814,0.2281,-0.9293,-3.8059,-1.8861,0.4103,-0.38,-0.4372,...,1.6774,0.8535,0.8554,-1.9537,-0.4652,-0.2294,-0.5644,-0.4889,0.8518,0.4615
2,MSP-PODCAST_test3_0003.wav,-3.0854,2.3483,-1.0679,-1.4489,-2.3664,-1.4042,-0.8041,1.3083,-2.369,...,-0.8434,3.3221,-3.1085,4.2384,-0.5246,-1.54,-1.1229,-1.5478,-0.3813,3.0865
3,MSP-PODCAST_test3_0004.wav,0.3526,-0.0366,-0.5932,-0.4186,-1.2171,-0.0219,0.4367,-0.7974,-0.191,...,0.5284,0.1292,-0.0101,1.8763,-1.3953,0.1967,-1.8027,-0.5087,0.396,0.7764
4,MSP-PODCAST_test3_0005.wav,-1.8076,-0.4622,0.0495,0.1923,-0.4019,-0.7079,0.0873,1.0332,-2.369,...,-0.4595,1.5982,-2.5254,1.6662,0.019,0.2076,-0.6646,-1.256,0.1093,1.6726


In [26]:
# NOTE(review): consider moving this import to the notebook's imports cell.
import joblib
# Load the saved model (the file name suggests a random-forest stacking model — confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
loaded_model = joblib.load('../submissions/random_forest_model_stackingv1.pkl')

In [27]:
# Run the loaded model on the columns listed in `cols`.
# NOTE(review): assumes `cols` matches the feature names/order the model was
# fitted on, and that the outer merge left no NaN values — confirm.
predictions = loaded_model.predict(preds[cols])

In [28]:
# Store the model output as a 'Prediction' column (adds a column to `preds` in place).
preds['Prediction'] = predictions

In [29]:
# Label letters; the letter at position i is the label for predicted index i.
# (Earlier variants of this cell called get_probability_vote(dfs) /
# get_probability_vote_softmax(dfs) instead of using the 'Prediction' column.)
classes_ = ['A', 'S', 'H', 'U', 'F', 'D', 'C', 'N']

# Lookup table: integer index -> label letter.
map_argmax = dict(enumerate(classes_))

# Translate each integer prediction into its label letter.
preds["EmoClass"] = preds["Prediction"].map(map_argmax)

In [30]:
# Preview the frame again to inspect the most recently added columns.
preds.head()

Unnamed: 0,FileName,class_0_prob,class_1_prob,class_2_prob,class_3_prob,class_4_prob,class_5_prob,class_6_prob,class_7_prob,class_0_prob_exp1,...,class_0_prob_exp5,class_1_prob_exp5,class_2_prob_exp5,class_3_prob_exp5,class_4_prob_exp5,class_5_prob_exp5,class_6_prob_exp5,class_7_prob_exp5,Prediction,EmoClass
0,MSP-PODCAST_test3_0001.wav,-1.8813,3.5432,-1.2339,-1.8614,-3.4532,-3.3043,-1.1342,0.7817,-1.2326,...,-2.0023,4.4295,-0.7834,-1.5871,-0.2586,-1.4419,-0.47,1.4294,1,S
1,MSP-PODCAST_test3_0002.wav,1.8304,-3.2814,0.2281,-0.9293,-3.8059,-1.8861,0.4103,-0.38,-0.4372,...,0.8554,-1.9537,-0.4652,-0.2294,-0.5644,-0.4889,0.8518,0.4615,7,N
2,MSP-PODCAST_test3_0003.wav,-3.0854,2.3483,-1.0679,-1.4489,-2.3664,-1.4042,-0.8041,1.3083,-2.369,...,-3.1085,4.2384,-0.5246,-1.54,-1.1229,-1.5478,-0.3813,3.0865,1,S
3,MSP-PODCAST_test3_0004.wav,0.3526,-0.0366,-0.5932,-0.4186,-1.2171,-0.0219,0.4367,-0.7974,-0.191,...,-0.0101,1.8763,-1.3953,0.1967,-1.8027,-0.5087,0.396,0.7764,1,S
4,MSP-PODCAST_test3_0005.wav,-1.8076,-0.4622,0.0495,0.1923,-0.4019,-0.7079,0.0873,1.0332,-2.369,...,-2.5254,1.6662,0.019,0.2076,-0.6646,-1.256,0.1093,1.6726,7,N


In [32]:
# Number of rows assigned to each EmoClass label.
preds["EmoClass"].value_counts()

EmoClass
A    658
S    601
H    450
N    382
U    371
D    317
F    214
C    207
Name: count, dtype: int64

In [37]:
# Keep only the file name and label columns; all other columns are discarded from `preds`.
preds = preds[['FileName','EmoClass']]

In [38]:
# Load the baseline CSV that the following cells compare `preds` against.
baseline = pd.read_csv("../submissions/baseline_reprod_cat.csv")

In [39]:
# Share of rows per EmoClass label in the baseline (normalize=True returns proportions, not counts).
baseline["EmoClass"].value_counts(normalize=True)

EmoClass
A    0.314688
S    0.188438
H    0.141563
C    0.125000
N    0.122813
D    0.057813
U    0.040313
F    0.009375
Name: proportion, dtype: float64

In [40]:
# Display both column indexes together to compare the layout of the two frames.
baseline.columns, preds.columns

(Index(['FileName', 'EmoClass'], dtype='object'),
 Index(['FileName', 'EmoClass'], dtype='object'))

In [41]:
# Count the positions where both frames hold the same FileName. The comparison is
# element-wise, so row order matters.
# NOTE(review): compare the result against len(preds) to confirm full agreement.
(baseline.FileName == preds.FileName).sum()

3200

In [17]:
# Order rows by file name, renumber the index from 0, then write the CSV
# without the index column.
# NOTE(review): this cell's saved execution count (17) is lower than that of the
# cells above it; re-run the notebook top to bottom before relying on the file.
preds = (
    preds
    .sort_values(by='FileName')
    .reset_index(drop=True)
)
preds.to_csv("../submissions/bimodal_ensemble7.csv", index=False)