## Build Speech data files

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

%matplotlib inline

In [2]:
# Load the audio feature table and keep only rows labelled with one of the
# eight original class ids (0-7). `.copy()` makes the filtered frame independent,
# so the column assignment below cannot raise SettingWithCopyWarning.
df = pd.read_csv('data/pre-processed/audio_features.csv')
df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])].copy()
print(df.shape)
display(df.head())

# Collapse the eight original ids into six consecutive classes:
#   0 -> 0, 1 and 2 -> 1, 3 and 4 -> 2, 5 -> 3, 6 -> 4, 7 -> 5
df['label'] = df['label'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5})
df.head()

(1402, 10)


Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_impro01_F000,7,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779
1,Ses01F_impro01_F001,7,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779
2,Ses01F_impro01_F002,7,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779
5,Ses01F_impro01_F005,7,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779
6,Ses01F_impro01_F006,4,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779


Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_impro01_F000,5,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779
1,Ses01F_impro01_F001,5,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779
2,Ses01F_impro01_F002,5,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779
5,Ses01F_impro01_F005,5,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779
6,Ses01F_impro01_F006,2,0.087017,0.143311,0.10739,0.094826,0.337838,-0.103407,14.801009,42.235779


In [13]:
# Snapshot of the data before any oversampling.
df.to_csv('data/no_sample_df.csv')

# Minority classes to oversample: label 3 (fear) and label 4 (`sur`).
fear_df = df[df['label'] == 3]
sur_df = df[df['label'] == 4]

# Collect every piece first and concatenate once instead of growing the frame
# in a loop: the original rows, 30 extra copies of the fear rows and 10 extra
# copies of the `sur` rows.
dfs_to_concat = [df] + [fear_df] * 30 + [sur_df] * 10
df_oversampled = pd.concat(dfs_to_concat, ignore_index=True)

# Persist the oversampled frame. Writing `df` here (as before) produced a file
# identical to no_sample_df.csv because `df` itself is never oversampled.
# NOTE(review): the cells below still scale and split `df`, not `df_oversampled`;
# confirm whether training is meant to use the oversampled rows.
df_oversampled.to_csv('data/modified_df.csv')

In [6]:
# Emotion name -> class id lookup.
# NOTE(review): the label remap earlier in this notebook yields ids 0-5 and the
# oversampling cell treats 3 as fear, while this dict maps 'neu' to 3 — confirm
# which encoding is intended before relying on it.
emotion_dict = dict(ang=0, hap=1, sad=2, neu=3)

# Earlier 8-way encoding, kept for reference:
#   ang=0, hap=1, exc=2, sad=3, fru=4, fea=5, sur=6, neu=7, xxx=8, oth=8

# Min-max scale every column from the third one onwards, in place on `df`.
# NOTE(review): the scaler is fit on the full frame before the train/test split
# performed in a later cell, so test rows influence the scaling.
scalar = MinMaxScaler()
feature_cols = df.columns[2:]
df[feature_cols] = scalar.fit_transform(df[feature_cols])
df.head()

Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_impro01_F000,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ses01F_impro01_F001,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ses01F_impro01_F002,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Ses01F_impro01_F005,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Ses01F_impro01_F006,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Hold out 20% of the rows as the test split. The fixed seed keeps the split,
# and therefore the audio and text CSVs derived from it, identical across re-runs.
x_train, x_test = train_test_split(df, test_size=0.20, random_state=42)

x_train.to_csv('data/s2e/audio_train.csv', index=False)
x_test.to_csv('data/s2e/audio_test.csv', index=False)

print(x_train.shape, x_test.shape)

(1121, 10) (281, 10)


## Define preprocessing functions for text

In [16]:
import unicodedata

def unicodeToAscii(s):
    """Return `s` in NFD form with every nonspacing mark (category 'Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lowercase and strip `s`, drop accent marks, and reduce it to letters and . ! ?

    A space is inserted in front of each '.', '!' or '?', then every run of
    characters other than ASCII letters and those three marks is collapsed
    into a single space.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

## Build Text data files

In [18]:
import re
import os
import pickle

# Matches the leading run of word characters on a line; used as the utterance code.
useful_regex = re.compile(r'^(\w+)', re.IGNORECASE)

# Maps utterance code -> transcription text, collected over sessions 1-5.
file2transcriptions = {}

for sess in range(1, 6):
    transcripts_path = 'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)
    # sorted() makes the traversal order (and which duplicate key wins) deterministic.
    transcript_files = sorted(os.listdir(transcripts_path))
    for transcript_name in transcript_files:
        with open(os.path.join(transcripts_path, transcript_name), 'r') as transcript_file:
            all_lines = transcript_file.readlines()

        for line in all_lines:
            code_match = useful_regex.match(line)
            if code_match is None:
                # Blank line or one that does not start with a word character:
                # there is no code to key on, so skip it instead of crashing.
                continue
            audio_code = code_match.group()
            # Keep everything after the FIRST colon so that colons inside the
            # spoken text are preserved; a line without a colon is kept whole.
            # NOTE(review): assumes the code/timestamp prefix contains no colon — confirm.
            _, sep, text = line.partition(':')
            transcription = (text if sep else line).strip()
            # Keys are assumed unique; a repeated code overwrites the earlier entry.
            file2transcriptions[audio_code] = transcription

# Save the mapping for the text-preparation cells.
with open('data/t2e/audiocode2text.pkl', 'wb') as pkl_file:
    pickle.dump(file2transcriptions, pkl_file)
len(file2transcriptions)

10087

In [19]:
# Reload the utterance-code -> transcription mapping from disk. The context
# manager closes the file handle, which the bare open() call left open.
with open('data/t2e/audiocode2text.pkl', 'rb') as pkl_file:
    audiocode2text = pickle.load(pkl_file)

In [20]:
# Prepare text data: pair each clip's id and label with its normalized transcription.
def _text_split(audio_split):
    """Return wav_file/label from `audio_split` plus a `transcription` column."""
    transcriptions = [
        normalizeString(audiocode2text[code]) for code in audio_split['wav_file']
    ]
    return audio_split[['wav_file', 'label']].assign(transcription=transcriptions)


text_train = _text_split(x_train)
text_test = _text_split(x_test)

for frame, out_path in ((text_train, 'data/t2e/text_train.csv'),
                        (text_test, 'data/t2e/text_test.csv')):
    frame.to_csv(out_path, index=False)

print(text_train.shape, text_test.shape)

(1121, 3) (281, 3)
