In [1]:
def concatenate_dataframes(dfs, id_col='FileName'):
    """
    Merge a list of DataFrames column-wise, using one shared ID column as the join key.

    The 'Prediction' column is dropped from every DataFrame when present. The first
    DataFrame keeps its column names; every non-ID column of the DataFrame at position
    ``i`` (i >= 1) gets the suffix ``_exp{i}`` so identically named columns do not
    collide. The frames are combined with successive outer merges on the ID column, so
    an ID missing from one DataFrame produces NaN in that DataFrame's columns.

    Parameters:
        dfs (list): List of pandas DataFrames to merge. Each must contain the ID column.
        id_col (str): Name of the ID column used as the merge key. Defaults to 'FileName'.

    Returns:
        pd.DataFrame: A single DataFrame with the merged data.
        list: All column names of the merged DataFrame except the ID column.

    Raises:
        ValueError: If `dfs` is empty or one of the DataFrames lacks the ID column.
    """
    if not dfs:
        raise ValueError("The list of DataFrames is empty.")

    for position, df in enumerate(dfs):
        if id_col not in df.columns:
            # Name the column that is actually checked and say which input is at fault.
            raise ValueError(
                f"Each DataFrame must contain a '{id_col}' column "
                f"(missing in the DataFrame at position {position})."
            )

    # Drop 'Prediction' from each DataFrame if it exists; drop() returns new frames,
    # so the caller's DataFrames are left untouched.
    dfs = [df.drop(columns=['Prediction'], errors='ignore') for df in dfs]

    # Start with the first DataFrame, whose columns stay unsuffixed.
    result = dfs[0].copy()

    for idx, df in enumerate(dfs[1:], start=1):
        # Suffix every column except the ID so names stay unique after merging.
        renamed_df = df.rename(
            columns={col: f"{col}_exp{idx}" for col in df.columns if col != id_col}
        )

        # Outer merge keeps IDs that appear in only some of the frames.
        # NOTE: an ID that occurs more than once within a frame multiplies rows here.
        result = pd.merge(result, renamed_df, on=id_col, how='outer')

    # Every column of the merged frame except the ID column.
    column_names = [col for col in result.columns if col != id_col]

    return result, column_names

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Experiments whose test-set outputs feed the ensemble. Commented-out entries are
# excluded from this run. The order matters: it determines the `_exp{i}` suffixes
# assigned when the frames are merged.
experiments_list = [
    # 'multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4',
    # 'multimodalmodel_w2v2robust_robertalarge_lazyload_lr1e4',
    # 'multimodalmodel_hubertxlarge_robertalarge_lazyload_lr1e4',
    # 'multimodalmodel_w2v2xls2b_robertalarge_lazyload_lr1e4',
    # 'multimodalmodel_whisperlarge_robertalarge_lazyload_lr1e4',
    # "multimodalmodel_w2v2robust_robertalarge_lazyload_lr1e4_head1",
    "multimodalmodel_hubertxlarge_robertalarge_lazyload_lr1e4_head1",
    # "multimodalmodel_w2v2xls2b_robertalarge_lazyload_lr1e4_head1",
    "multimodalmodel_whisperlarge_robertalarge_lazyload_lr1e4_1head",
    "multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4_head1",
    "multimodalmodel_whisperlarge_debertaxxlarge_lazyload_lr1e5_lasthidden_head1",
    "multimodalmodel_whisperlarge_hubertxlarge_lazyload_lr1e4_head1",
    # "multimodalmodel_whisperlarge_robertalarge_ns3_lazyload_lr1e5_focaloss",
    "multimodalmodel_whisperlarge_robertalarge_lazyload_lr1e4_1head_prodosyembs",
    "multimodalmodel_hubertxlarge_robertalarge_lazyload_lr1e4_head1_rankingloss",
    # "multimodalmodel_w2v2xls2b_robertalarge_lazyload_lr1e4_head1_rankingloss",
    # "multimodalmodel_w2v2robust_robertalarge_lazyload_lr1e4_head1_rankingloss",
    "multimodalmodel_whisperlarge_debertaxxlarge_lazyload_lr1e5_lasthidden_head1_rankingloss",
    "multimodalmodel_whisperlarge_robertalarge_lazyload_lr1e4_1head_rankingloss",
    "multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4_head1_softlossneutral",
    "multimodalmodel_whisperlarge_hubertxlarge_lazyload_lr1e4_head1_rankingloss",
    "multimodalmodel_whisperlarge_robertalarge_lazyload_lr1e4_1head_prodosyembs_rankingloss",
]

# One DataFrame per experiment, read from its results/test.csv, in list order.
dfs = [
    pd.read_csv(f'../experiments/{exp}/results/test.csv')
    for exp in experiments_list
]

In [4]:
# Merge the per-experiment frames on 'FileName'. `cols` receives every column name
# of the merged frame except 'FileName'.
preds, cols = concatenate_dataframes(dfs)

In [5]:
# Preview the first rows of the merged frame.
preds.head()

Unnamed: 0,FileName,class_0_prob,class_1_prob,class_2_prob,class_3_prob,class_4_prob,class_5_prob,class_6_prob,class_7_prob,class_0_prob_exp1,...,class_6_prob_exp10,class_7_prob_exp10,class_0_prob_exp11,class_1_prob_exp11,class_2_prob_exp11,class_3_prob_exp11,class_4_prob_exp11,class_5_prob_exp11,class_6_prob_exp11,class_7_prob_exp11
0,MSP-PODCAST_test3_0001.wav,-2.0672,2.6584,-0.0695,-0.7801,-0.4573,-1.0004,-0.5748,0.6798,-2.3014,...,-0.1512,1.6817,-1.5058,3.75,-1.3535,-1.61,-0.9282,0.0093,-0.5894,1.4406
1,MSP-PODCAST_test3_0002.wav,0.0798,-1.5765,0.0669,-0.6963,-1.4104,-0.0746,0.9615,0.6897,0.6539,...,1.9355,0.6242,0.2399,-1.6866,0.0768,0.8474,-2.076,0.4911,1.6375,1.3953
2,MSP-PODCAST_test3_0003.wav,-3.5373,4.7376,-1.4658,-1.8406,0.1692,-0.786,-1.1933,1.9258,-2.8947,...,-0.7344,2.4194,-3.5061,5.4702,-0.2628,-1.5607,-1.5349,-1.8233,-0.925,4.0346
3,MSP-PODCAST_test3_0004.wav,-0.4396,1.0181,-1.1523,-1.1935,-0.6969,-0.1188,0.8206,0.5401,0.5394,...,0.9972,-0.098,0.1292,2.6163,-2.0621,-0.8646,-2.4348,0.2442,1.7038,0.0062
4,MSP-PODCAST_test3_0005.wav,-2.7698,0.2867,0.5182,0.6662,0.3751,-1.4876,-0.3775,1.2348,-2.7211,...,-0.495,2.5355,-2.8442,1.5173,0.9168,0.7238,-0.781,-1.4479,0.1484,2.4064


In [6]:
# Count missing values per column. The merge is an outer join, so a file absent from
# any experiment would show up here as NaN in that experiment's columns.
preds.isna().sum()

FileName              0
class_0_prob          0
class_1_prob          0
class_2_prob          0
class_3_prob          0
                     ..
class_3_prob_exp11    0
class_4_prob_exp11    0
class_5_prob_exp11    0
class_6_prob_exp11    0
class_7_prob_exp11    0
Length: 97, dtype: int64

In [8]:
import joblib

# Load the five saved models (file indices 0-4).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load
# model files that come from a trusted source.
models = [
    joblib.load(f'../submissions/rf_model_stackingv3_{i}.pkl')
    for i in range(5)
]

In [9]:
def predict_with_ensemble(val_df, feature_names, target_name, trained_models):
    """
    Predict one class index per row by averaging the class probabilities of several models.

    Parameters:
        val_df (pd.DataFrame): Data to predict on; must contain every column in `feature_names`.
        feature_names (list): Columns of `val_df` passed to the models, in this order.
        target_name (str): Not read by this function; kept so existing callers keep working.
        trained_models (iterable): Fitted models, each exposing `predict_proba(X)`.

    Returns:
        np.ndarray: For each row, the column index of the highest averaged probability.

    Raises:
        ValueError: If `trained_models` is empty.
    """
    # Materialize once so generators are supported and emptiness can be checked.
    trained_models = list(trained_models)
    if not trained_models:
        # Without this guard, averaging an empty list yields NaN and the argmax
        # below fails with an unrelated axis error.
        raise ValueError("At least one trained model is required.")

    # Prepare features
    X_val = val_df[feature_names].values

    # One probability matrix per model
    fold_probabilities = [model.predict_proba(X_val) for model in trained_models]

    # Average probabilities across models
    avg_probabilities = np.mean(fold_probabilities, axis=0)

    # Return argmax of averaged probabilities.
    # NOTE(review): this is a column index into predict_proba's output; it equals the
    # class label only if every model orders its classes as 0..n-1 — TODO confirm
    # against how the models were trained.
    return np.argmax(avg_probabilities, axis=1)

In [10]:
# Run the ensemble on all merged feature columns. "target" fills the target_name
# argument, which predict_with_ensemble does not read.
predictions = predict_with_ensemble(preds, cols, "target", models)

In [11]:
# Attach the predicted class index of each row as a new 'Prediction' column
# (modifies preds in place).
preds['Prediction'] = predictions

In [12]:
# # preds = get_probability_vote(dfs)
# preds = get_probability_vote_softmax(dfs)

# Class letters, listed so that list position == predicted class index.
# NOTE(review): presumably the dataset's emotion codes — confirm the order matches
# the label encoding used when the models were trained.
classes_ = ['A', 'S', 'H', 'U', 'F', 'D', 'C', 'N']

# index -> class letter
map_argmax = dict(enumerate(classes_))

# Translate each predicted index into its class letter.
preds["EmoClass"] = preds["Prediction"].map(map_argmax)

In [13]:
# Preview the frame with the newly added 'Prediction' and 'EmoClass' columns.
preds.head()

Unnamed: 0,FileName,class_0_prob,class_1_prob,class_2_prob,class_3_prob,class_4_prob,class_5_prob,class_6_prob,class_7_prob,class_0_prob_exp1,...,class_0_prob_exp11,class_1_prob_exp11,class_2_prob_exp11,class_3_prob_exp11,class_4_prob_exp11,class_5_prob_exp11,class_6_prob_exp11,class_7_prob_exp11,Prediction,EmoClass
0,MSP-PODCAST_test3_0001.wav,-2.0672,2.6584,-0.0695,-0.7801,-0.4573,-1.0004,-0.5748,0.6798,-2.3014,...,-1.5058,3.75,-1.3535,-1.61,-0.9282,0.0093,-0.5894,1.4406,1,S
1,MSP-PODCAST_test3_0002.wav,0.0798,-1.5765,0.0669,-0.6963,-1.4104,-0.0746,0.9615,0.6897,0.6539,...,0.2399,-1.6866,0.0768,0.8474,-2.076,0.4911,1.6375,1.3953,6,C
2,MSP-PODCAST_test3_0003.wav,-3.5373,4.7376,-1.4658,-1.8406,0.1692,-0.786,-1.1933,1.9258,-2.8947,...,-3.5061,5.4702,-0.2628,-1.5607,-1.5349,-1.8233,-0.925,4.0346,1,S
3,MSP-PODCAST_test3_0004.wav,-0.4396,1.0181,-1.1523,-1.1935,-0.6969,-0.1188,0.8206,0.5401,0.5394,...,0.1292,2.6163,-2.0621,-0.8646,-2.4348,0.2442,1.7038,0.0062,6,C
4,MSP-PODCAST_test3_0005.wav,-2.7698,0.2867,0.5182,0.6662,0.3751,-1.4876,-0.3775,1.2348,-2.7211,...,-2.8442,1.5173,0.9168,0.7238,-0.781,-1.4479,0.1484,2.4064,7,N


In [14]:
# Share of each predicted class (normalize=True returns proportions, not counts).
preds["EmoClass"].value_counts(normalize=True)

EmoClass
A    0.228125
S    0.130625
D    0.123125
H    0.116875
U    0.109687
N    0.105625
C    0.096875
F    0.089063
Name: proportion, dtype: float64

In [15]:
# Reduce preds to 'FileName' and 'EmoClass'. This overwrites preds, so the probability
# and 'Prediction' columns are no longer available to later cells.
preds = preds[['FileName','EmoClass']]

In [16]:
# Load the baseline submission file to compare against the ensemble predictions.
baseline = pd.read_csv("../submissions/baseline_reprod_cat.csv")

In [17]:
# Class proportions of the baseline, for comparison with the ensemble's distribution.
baseline["EmoClass"].value_counts(normalize=True)

EmoClass
A    0.314688
S    0.188438
H    0.141563
C    0.125000
N    0.122813
D    0.057813
U    0.040313
F    0.009375
Name: proportion, dtype: float64

In [18]:
# Show both column indexes side by side to check the two frames have the same layout.
baseline.columns, preds.columns

(Index(['FileName', 'EmoClass'], dtype='object'),
 Index(['FileName', 'EmoClass'], dtype='object'))

In [19]:
# Count rows whose FileName agrees between baseline and preds at the same index label.
# The comparison requires identically labeled Series and raises otherwise.
(baseline.FileName == preds.FileName).sum()

3200

In [20]:
# Order rows by file name with a fresh 0..n-1 index, then write the submission CSV
# without the index column.
preds = preds.sort_values('FileName', ignore_index=True)
preds.to_csv("../submissions/bimodal_ensemble_vfinal.csv", index=False)