# Dataset rename and train/test split
- [ ] read dataset compiled csv file
- [ ] split into train and test sets (10% test): by speaker for CREMA-D and RAVDESS, per emotion category within each of the other datasets
- [ ] rename the files and put them under ./dataset/splitted/train & ./dataset/splitted/test

## Load Metadata csv 

In [1]:
import os
import pandas as pd
import librosa

In [2]:
# Load the combined metadata table (one row per audio file).
df = pd.read_csv("./dataset_info_combined_V4.csv")

In [3]:
# Quick sanity statistics on clip durations, grouped by dataset, by emotion
# category and by sentiment value.
print("----check some statistics----")
print("average length of duration in different datasets: \n", df.groupby('dataset_name')['duration'].mean())
print("total length of duration in different datasets: \n", df.groupby('dataset_name')['duration'].sum())
print("total length of duration in different categories: \n", df.groupby('emotional_category')['duration'].sum())
# This grouping is by sentiment value, not by emotion category, so the label says so.
print("total length of duration in different sentiment values: \n", df.groupby('sentiment_value')['duration'].sum())

----check some statistics----
average length of duration in different datasets: 
 dataset_name
CREMA-D    2.543601
RAVDESS    3.700375
SAVEE      3.839292
TESS       2.055146
Name: duration, dtype: float64
total length of duration in different datasets: 
 dataset_name
CREMA-D    18929.48
RAVDESS     5328.54
SAVEE       1842.86
TESS        5754.41
Name: duration, dtype: float64
total length of duration in different categories: 
 emotional_category
Anger         4988.30
Calmness       728.71
Disgust       5505.19
Fear          4768.54
Happiness     4693.73
Neutrality    4234.03
Sadness       5235.92
Surprise      1700.87
Name: duration, dtype: float64
total length of duration in different categories: 
 sentiment_value
-1    20497.95
 0     4962.74
 1     6394.60
Name: duration, dtype: float64


In [4]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,dataset_name,file_path,sentiment_value,emotional_category,speaker_id,gender,duration,duration_trimmed
0,CREMA-D,./dataset/CREMA-D/1022_ITS_ANG_XX.wav,-1,Anger,1022,Male,2.44,2.44
1,CREMA-D,./dataset/CREMA-D/1037_ITS_ANG_XX.wav,-1,Anger,1037,Female,3.0,3.0
2,CREMA-D,./dataset/CREMA-D/1060_ITS_NEU_XX.wav,0,Neutrality,1060,Female,2.4,2.4
3,CREMA-D,./dataset/CREMA-D/1075_ITS_NEU_XX.wav,0,Neutrality,1075,Female,2.44,2.44
4,CREMA-D,./dataset/CREMA-D/1073_IOM_DIS_XX.wav,-1,Disgust,1073,Female,2.87,2.87


## Train Test Split

In [5]:
# Train/test split on all datasets.
#
# Strategy:
#   * CREMA-D and RAVDESS are split by speaker, so every clip of a held-out
#     speaker goes to the test set and no speaker appears in both splits.
#   * Every other dataset is split randomly inside each emotional category.
import math
from sklearn.model_selection import train_test_split

TEST_RATIO = 0.1  # fraction of speakers / clips held out for testing
SPLIT_SEED = 7    # fixed seed so the split is reproducible across runs
SPEAKER_SPLIT_DATASETS = ['CREMA-D', 'RAVDESS']

# Work on a copy so the loaded frame stays untouched; every row starts as
# 'train' and the selected rows are flipped to 'test' below.
master_data = df.copy()
master_data['split'] = 'train'

for dataset_name, group in master_data.groupby('dataset_name'):
    if dataset_name in SPEAKER_SPLIT_DATASETS:
        unique_speaker_ids = group['speaker_id'].unique()
        # floor() yields 0 for fewer than 10 speakers, which train_test_split
        # rejects, so always hold out at least one speaker.
        test_size = max(1, math.floor(len(unique_speaker_ids) * TEST_RATIO))
        train_speaker_ids, test_speaker_ids = train_test_split(
            unique_speaker_ids, test_size=test_size, random_state=SPLIT_SEED
        )
        # Select rows through this group's own index so a speaker id that also
        # exists in another dataset cannot be flagged there by accident.
        test_indices = group.index[group['speaker_id'].isin(test_speaker_ids)]
        master_data.loc[test_indices, 'split'] = 'test'
    else:
        for category, data in group.groupby('emotional_category'):
            train_indices, test_indices = train_test_split(
                data.index, test_size=TEST_RATIO, random_state=SPLIT_SEED
            )
            master_data.loc[test_indices, 'split'] = 'test'


print("train test splitting: \n", master_data.groupby(['split','dataset_name','emotional_category'])['duration'].count())

train test splitting: 
 split  dataset_name  emotional_category
test   CREMA-D       Anger                  125
                     Disgust                125
                     Fear                   125
                     Happiness              125
                     Neutrality             107
                     Sadness                125
       RAVDESS       Anger                   16
                     Calmness                16
                     Disgust                 16
                     Fear                    16
                     Happiness               16
                     Neutrality               8
                     Sadness                 16
                     Surprise                16
       SAVEE         Anger                    6
                     Disgust                  6
                     Fear                     6
                     Happiness                6
                     Neutrality              12
                     Sad

In [6]:
# Define a function to generate the renamed file path after splitting:
# new_folder_directory + <dataset_name>_<file_name(original_identifier)>_<emotion_category>_<sentiment_label>
def rename_file_path_after_splitting(row):
    """Build the flattened destination path of one audio file after the split.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide 'file_path', 'dataset_name', 'split',
        'emotional_category' and 'sentiment_value'.

    Returns
    -------
    str
        "./dataset/splitted/<split>/<dataset_name>_<identifier>_<emotional_category>_<sentiment_value>.wav",
        where <identifier> is the original path without the "./dataset/" root,
        the "<dataset_name>/" folder and the file extension, with any remaining
        "/" turned into "_".
    """
    dataset_root = "./dataset/"

    # Drop the extension by parsing it instead of assuming it is 4 characters long.
    relative_path, _extension = os.path.splitext(row['file_path'])
    # Strip the root only when it is really there, rather than blindly cutting
    # the first 10 characters of whatever path was given.
    if relative_path.startswith(dataset_root):
        relative_path = relative_path[len(dataset_root):]

    origin_name = relative_path.replace(row['dataset_name'] + '/', '').replace('/', '_')
    prefix = f"./dataset/splitted/{row['split']}/{row['dataset_name']}_"
    suffix = f"_{row['emotional_category']}_{row['sentiment_value']}.wav"
    return prefix + origin_name + suffix

# Compute the destination path row by row and store it in a new 'renamed_file_path' column.
master_data['renamed_file_path'] = master_data.apply(rename_file_path_after_splitting, axis=1)

In [7]:
# Inspect the last rows, which now include the 'split' and 'renamed_file_path' columns.
master_data.tail()

Unnamed: 0,dataset_name,file_path,sentiment_value,emotional_category,speaker_id,gender,duration,duration_trimmed,split,renamed_file_path
12157,TESS,./dataset/TESS/OAF_Sad/OAF_tool_sad.wav,-1,Sadness,OAF,Female,2.56,2.56,train,./dataset/splitted/train/TESS_OAF_Sad_OAF_tool...
12158,TESS,./dataset/TESS/OAF_Sad/OAF_goose_sad.wav,-1,Sadness,OAF,Female,2.52,2.52,train,./dataset/splitted/train/TESS_OAF_Sad_OAF_goos...
12159,TESS,./dataset/TESS/OAF_Sad/OAF_met_sad.wav,-1,Sadness,OAF,Female,2.44,2.44,test,./dataset/splitted/test/TESS_OAF_Sad_OAF_met_s...
12160,TESS,./dataset/TESS/OAF_Sad/OAF_pearl_sad.wav,-1,Sadness,OAF,Female,2.4,2.4,train,./dataset/splitted/train/TESS_OAF_Sad_OAF_pear...
12161,TESS,./dataset/TESS/OAF_Sad/OAF_rain_sad.wav,-1,Sadness,OAF,Female,2.26,2.26,train,./dataset/splitted/train/TESS_OAF_Sad_OAF_rain...


In [8]:
# Persist the table with the new 'split' and 'renamed_file_path' columns.
# Note: this writes to the same path the metadata was loaded from, so the
# input file is overwritten in place.
master_data.to_csv("./dataset_info_combined_V4.csv", index=False)

## Move files (caution: bulk file operation)

In [9]:
'''move the file as train_test_split been updated => some files are moved from train to test''' 
# NOTE(review): everything below is commented out, so running this cell moves nothing.
# Caveats to address before re-enabling it:
#   - `'train' in renamed_file_path` and `.replace("train", "test")` / `.replace("test", "train")`
#     act on the whole path string, so a file name that itself contains "train" or "test"
#     would be mis-detected and rewritten, not only the split folder.
#   - shutil.move is called without checking that the computed source path exists.
# import os
# import shutil

# def move_missing_files(row):
#     renamed_file_path = row['renamed_file_path']
#     if not os.path.exists(renamed_file_path):
#         # Determine the source and destination directories based on the current path
#         if 'train' in renamed_file_path:
#             source_dir = row['renamed_file_path'].replace("train", "test")
#             dest_dir = row['renamed_file_path']
#         else:
#             source_dir = row['renamed_file_path'].replace("test", "train")
#             dest_dir = row['renamed_file_path']
#         # Move the file from the source directory to the destination directory
#         shutil.move(source_dir, dest_dir)

# # Apply the function to each row of the DataFrame
# master_data.apply(move_missing_files, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
12157    None
12158    None
12159    None
12160    None
12161    None
Length: 12162, dtype: object

In [17]:
# import shutil
# master_data.apply(lambda row: shutil.copy(row['file_path'], row['renamed_file_path']), axis=1)

0        ./dataset/splitted/train/CREMA-D_1022_ITS_ANG_...
1        ./dataset/splitted/train/CREMA-D_1037_ITS_ANG_...
2        ./dataset/splitted/train/CREMA-D_1060_ITS_NEU_...
3        ./dataset/splitted/train/CREMA-D_1075_ITS_NEU_...
4        ./dataset/splitted/train/CREMA-D_1073_IOM_DIS_...
                               ...                        
12157    ./dataset/splitted/train/TESS_OAF_Sad_OAF_tool...
12158    ./dataset/splitted/train/TESS_OAF_Sad_OAF_goos...
12159    ./dataset/splitted/test/TESS_OAF_Sad_OAF_met_s...
12160    ./dataset/splitted/train/TESS_OAF_Sad_OAF_pear...
12161    ./dataset/splitted/train/TESS_OAF_Sad_OAF_rain...
Length: 12162, dtype: object