In [5]:
from collections import Counter

def get_majority_vote(dataframes_list):
    """
    Ensemble per-model predictions by majority vote for each file.

    All input frames are concatenated and, for every distinct 'FileName',
    the most frequent 'EmoClass' label is selected.

    Args:
        dataframes_list (list): List of pandas dataframes, each containing
                               'FileName' and 'EmoClass' columns. Any other
                               columns are ignored.

    Returns:
        pandas.DataFrame: A dataframe with 'FileName' and 'EmoClass' columns,
        one row per distinct file, sorted by 'FileName'. When several labels
        are tied, the label encountered first wins, i.e. frames earlier in
        dataframes_list take precedence.

    Raises:
        KeyError: If 'FileName' or 'EmoClass' is missing from the inputs.
    """
    # Concatenate all dataframes; row order follows dataframes_list order,
    # which is what makes the tie-breaking below deterministic.
    combined_df = pd.concat(dataframes_list, ignore_index=True)

    missing = [col for col in ('FileName', 'EmoClass')
               if col not in combined_df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    def majority_vote(labels):
        # Counter.most_common orders equal counts by first occurrence.
        return Counter(labels.tolist()).most_common(1)[0][0]

    # Aggregate only the label column: this avoids applying a function over
    # the grouping column itself and yields a Series already named 'EmoClass'.
    # groupby sorts by key, so the result is ordered by FileName.
    result_df = (combined_df.groupby('FileName', sort=True)['EmoClass']
                            .agg(majority_vote)
                            .reset_index())

    return result_df

def get_probability_vote(dataframes_list):
    """
    Ensemble per-model predictions by averaging class probabilities per file.

    All input frames are concatenated, every 'class_N_prob' column is averaged
    over the rows sharing a 'FileName', and the position of the largest
    averaged value is taken as the prediction.

    Args:
        dataframes_list (list): List of pandas dataframes containing 'FileName'
                               and probability columns ('class_0_prob', 'class_1_prob', etc.)

    Returns:
        pandas.DataFrame: A dataframe with 'FileName' and 'EmoClass' columns,
        one row per distinct file, sorted by 'FileName'. 'EmoClass' is the
        integer position of the winning column among the probability columns,
        taken in the order they appear in the concatenated frame.

    Raises:
        ValueError: If the inputs contain no 'class_*_prob' column.
    """
    # Concatenate all dataframes
    combined_df = pd.concat(dataframes_list, ignore_index=True)

    # Find probability columns (class_N_prob)
    prob_columns = [col for col in combined_df.columns
                    if col.startswith('class_') and col.endswith('_prob')]
    if not prob_columns:
        raise ValueError("No 'class_*_prob' columns found in the input dataframes")

    # Average the probabilities per file. The grouped result is indexed by
    # FileName (sorted), so names and predictions are read from the SAME
    # object and cannot get out of step with each other, whatever the row
    # order of the inputs is.
    mean_probs = combined_df.groupby('FileName', sort=True)[prob_columns].mean()

    result_df = pd.DataFrame({
        'FileName': mean_probs.index.to_numpy(),
        'EmoClass': mean_probs.to_numpy().argmax(axis=1),
    })

    return result_df

def softmax(x):
    """
    Return the softmax of x, normalised over all of its elements.

    The maximum of x is subtracted before exponentiating so that large
    inputs do not overflow; the result is unchanged by that shift.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    total = weights.sum()
    return weights / total

def get_probability_vote_softmax(dataframes_list):
    """
    Ensemble per-model predictions by averaging softmax-normalised scores.

    All input frames are concatenated. For every 'FileName', each row's
    'class_N_prob' values are passed through softmax, the softmaxed rows are
    averaged, and the position of the largest averaged value is taken as the
    prediction.

    Args:
        dataframes_list (list): List of pandas dataframes containing 'FileName'
                               and probability columns ('class_0_prob', 'class_1_prob', etc.)

    Returns:
        pandas.DataFrame: A dataframe with 'FileName' and 'EmoClass' columns,
        one row per distinct file, sorted by 'FileName'. 'EmoClass' is the
        integer position of the winning column among the probability columns,
        taken in the order they appear in the concatenated frame.

    Raises:
        ValueError: If the inputs contain no 'class_*_prob' column.
    """
    # Concatenate all dataframes
    combined_df = pd.concat(dataframes_list, ignore_index=True)

    # Find probability columns (class_N_prob)
    prob_columns = [col for col in combined_df.columns
                    if col.startswith('class_') and col.endswith('_prob')]
    if not prob_columns:
        raise ValueError("No 'class_*_prob' columns found in the input dataframes")

    def get_softmax_prediction(group):
        # Rows are individual model outputs, columns are classes.
        raw_scores = group[prob_columns].to_numpy()

        # Softmax each model's row separately, then average across models.
        softmax_probs = np.array([softmax(row) for row in raw_scores])
        mean_probs = softmax_probs.mean(axis=0)

        return int(np.argmax(mean_probs))

    # Build (name, prediction) pairs straight from the groupby iteration so
    # that each prediction stays attached to the file it was computed for,
    # whatever the row order of the inputs is. groupby yields keys in sorted
    # order, so the result is ordered by FileName.
    records = [(file_name, get_softmax_prediction(group))
               for file_name, group in combined_df.groupby('FileName', sort=True)]

    result_df = pd.DataFrame(records, columns=['FileName', 'EmoClass'])

    return result_df

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Names of the experiment runs to ensemble. Each run's test-set predictions
# are read from ../experiments/<run>/results/test.csv.
experiments_list = [
    'multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4',
    'multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4_focalloss',
    'multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4_batchbalanced',
    'multimodalmodel_wavlmlarge_robertalarge_lazyload_lr1e4_batchbalanced_focaloss',
    'multimodalmodel_w2v2robust_robertalarge_lazyload_lr1e4',
    'multimodalmodel_hubertxlarge_robertalarge_lazyload_lr1e4',
    'multimodalmodel_w2v2xls2b_robertalarge_lazyload_lr1e4'
]

# One dataframe per experiment, in the same order as experiments_list
# (the order decides ties in the majority vote).
dfs = [
    pd.read_csv(f'../experiments/{exp}/results/test.csv')
    for exp in experiments_list
]

In [8]:
# Ensemble the per-experiment frames: most frequent 'EmoClass' per 'FileName'.
preds = get_majority_vote(dfs)

In [9]:
# # preds = get_probability_vote(dfs)
# preds = get_probability_vote_softmax(dfs)
# classes_ = ['A', 'S', 'H', 'U', 'F', 'D', 'C', 'N']

# map_argmax = dict()
# for i, c in enumerate(classes_):
#     map_argmax[i] = c

# preds["EmoClass"] = preds["EmoClass"].map(map_argmax)

In [10]:
# Preview the first rows of the ensembled predictions.
preds.head()

Unnamed: 0,FileName,EmoClass
0,MSP-PODCAST_test3_0001.wav,S
1,MSP-PODCAST_test3_0002.wav,A
2,MSP-PODCAST_test3_0003.wav,S
3,MSP-PODCAST_test3_0004.wav,S
4,MSP-PODCAST_test3_0005.wav,N


In [11]:
# Label distribution of the ensembled predictions.
preds["EmoClass"].value_counts()

EmoClass
A    1262
H     609
S     403
N     262
D     229
C     185
U     172
F      78
Name: count, dtype: int64

In [12]:
# Load the baseline submission file; the cells that follow compare it with preds.
baseline = pd.read_csv("../submissions/baseline_reprod_cat.csv")

In [16]:
# Label distribution of the baseline submission.
baseline["EmoClass"].value_counts()

EmoClass
A    1007
S     603
H     453
C     400
N     393
D     185
U     129
F      30
Name: count, dtype: int64

In [13]:
# Show both column indexes side by side to compare the two schemas.
baseline.columns, preds.columns

(Index(['FileName', 'EmoClass'], dtype='object'),
 Index(['FileName', 'EmoClass'], dtype='object'))

In [15]:
# Count the rows where baseline and preds hold the same FileName at the same
# position (a row-order check between the two frames).
(baseline.FileName == preds.FileName).sum()

3200

In [17]:
# Put the rows in FileName order with a fresh 0..n-1 index, then write the
# ensemble out as a submission CSV (index column omitted).
preds = (
    preds
    .sort_values(by='FileName')
    .reset_index(drop=True)
)
preds.to_csv("../submissions/bimodal_ensemble7.csv", index=False)