In [1]:
import pandas as pd

# Read the FER2013 CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv('fer2013.csv')

# Count the rows per emotion label to see how imbalanced the classes are.
emotion_counts = df['emotion'].value_counts()
print(emotion_counts)


emotion
3    8989
6    6198
4    6077
2    5121
0    4953
5    4002
1     547
Name: count, dtype: int64


In [2]:
# Number of rows each emotion class is resampled to by the balancing step.
TARGET_COUNT = 5_000


In [3]:
import numpy as np

def _resample_to_target(class_subset, target_count, random_state=42):
    """Return ``class_subset`` resized to exactly ``target_count`` rows.

    Args:
        class_subset: DataFrame holding the rows of a single class.
        target_count: Number of rows the returned frame must contain.
        random_state: Seed forwarded to ``DataFrame.sample`` so reruns
            produce the same selection.

    Returns:
        A DataFrame with ``target_count`` rows: a random subset drawn
        without replacement when the class is too large; every original
        row plus randomly duplicated rows when it is too small; the input
        itself when it already has the right size.
    """
    shortfall = target_count - len(class_subset)

    if shortfall < 0:
        # Downsample without replacement.
        return class_subset.sample(n=target_count, random_state=random_state)

    if shortfall > 0:
        # Upsample: keep every original row and draw only the missing rows
        # with replacement. Drawing all `target_count` rows with replacement
        # would leave many originals unsampled when the class is close to
        # the target size (about e**-1, i.e. ~37%, of distinct rows when
        # class size ~= target), trading real data for duplicates.
        duplicates = class_subset.sample(
            n=shortfall, replace=True, random_state=random_state
        )
        return pd.concat([class_subset, duplicates])

    return class_subset


# Resample every emotion class to TARGET_COUNT rows. `sort=False` visits the
# labels in order of first appearance, like iterating `df['emotion'].unique()`.
balanced_data = [
    _resample_to_target(class_subset, TARGET_COUNT)
    for _, class_subset in df.groupby('emotion', sort=False)
]

# Concatenate all balanced class data, then shuffle the rows reproducibly.
# NOTE(review): upsampled classes contain exact duplicate rows; if this frame
# is later split into train/test sets, duplicates could land on both sides.
balanced_df = (
    pd.concat(balanced_data)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Check new distribution: every class must now have exactly TARGET_COUNT rows.
class_counts = balanced_df['emotion'].value_counts()
assert class_counts.eq(TARGET_COUNT).all(), "every class should have TARGET_COUNT rows"
print(class_counts)


emotion
6    5000
2    5000
4    5000
5    5000
0    5000
3    5000
1    5000
Name: count, dtype: int64


In [4]:
# Write the balanced dataset to disk without the positional index column.
# The per-class row count in the filename is taken from TARGET_COUNT so the
# name cannot drift out of sync with the data if the target is changed.
balanced_df.to_csv(f'fer2013_balanced_{TARGET_COUNT}.csv', index=False)
