## Build Speech data files

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

%matplotlib inline

In [2]:
df = pd.read_csv('data/audio_features.csv')
# Keep only rows whose label id is 0-7. `.copy()` gives an independent frame so
# the label rewrite below does not assign into a filtered slice of the original
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])].copy()
print(df.shape)
display(df.head())

# Collapse the 8 original label ids into 6 classes:
#   0 -> 0, {1, 2} -> 1, {3, 4} -> 2, 5 -> 3, 6 -> 4, 7 -> 5
# Per the commented-out emotion_dict later in this notebook the original ids are
# ang, hap, exc, sad, fru, fea, sur, neu, i.e. exc is merged into hap and fru
# into sad.
df['label'] = df['label'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5})
df.head()

(7527, 10)


Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_script02_2_F000,7,0.003671,0.005739,0.004434,0.00364,0.018692,-0.008143,0.023179,0.133057
1,Ses01F_script02_2_F001,7,0.006365,0.011155,0.007913,0.00785,0.444444,-0.01712,0.094578,0.213759
6,Ses01F_script02_2_F006,0,0.039659,0.067939,0.04993,0.04605,0.345018,-0.004605,3.441704,9.317455
7,Ses01F_script02_2_F007,4,0.014478,0.026941,0.018384,0.019687,0.422764,-0.01185,0.568261,1.928247
8,Ses01F_script02_2_F008,0,0.025271,0.054958,0.031571,0.044958,0.470019,-0.00512,2.529399,9.210082


Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_script02_2_F000,5,0.003671,0.005739,0.004434,0.00364,0.018692,-0.008143,0.023179,0.133057
1,Ses01F_script02_2_F001,5,0.006365,0.011155,0.007913,0.00785,0.444444,-0.01712,0.094578,0.213759
6,Ses01F_script02_2_F006,0,0.039659,0.067939,0.04993,0.04605,0.345018,-0.004605,3.441704,9.317455
7,Ses01F_script02_2_F007,2,0.014478,0.026941,0.018384,0.019687,0.422764,-0.01185,0.568261,1.928247
8,Ses01F_script02_2_F008,0,0.025271,0.054958,0.031571,0.044958,0.470019,-0.00512,2.529399,9.210082


In [3]:
# Snapshot of the relabelled data before any oversampling.
df.to_csv('data/no_sample_df.csv')

# Oversample the two rare classes by stacking extra copies of their rows:
# label 3 (fear) gets 30 additional copies, label 4 (surprise) gets 10.
# A single pd.concat replaces the repeated DataFrame.append calls
# (DataFrame.append was removed in pandas 2.0) and avoids re-copying the whole
# frame on every iteration. Row order and index labels match the old loop.
# NOTE(review): the train/test split happens after this step, so copies of the
# same utterance can land in both splits -- confirm this is intended.
fear_df = df[df['label'] == 3]
sur_df = df[df['label'] == 4]
df = pd.concat([df] + [fear_df] * 30 + [sur_df] * 10)

df.to_csv('data/modified_df.csv')

In [4]:
# NOTE(review): this 4-class mapping does not line up with the six label ids
# (0-5) produced by the relabelling step, and no visible cell reads it --
# confirm whether it is still needed.
emotion_dict = {'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3}

# Original 8+1 label ids of the raw data, kept for reference only:
#   'ang': 0, 'hap': 1, 'exc': 2, 'sad': 3, 'fru': 4,
#   'fea': 5, 'sur': 6, 'neu': 7, 'xxx': 8, 'oth': 8

# Rescale every column from the third one onward (everything after `wav_file`
# and `label`) to the [0, 1] range, in place.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows influence the min/max -- confirm this is acceptable.
scalar = MinMaxScaler()
feature_cols = df.columns[2:]
scaled_values = scalar.fit_transform(df[feature_cols])
df[feature_cols] = scaled_values
df.head()

Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,Ses01F_script02_2_F000,5,0.010847,0.01329,0.010715,0.019386,0.024313,0.168625,0.000277,0.000468
1,Ses01F_script02_2_F001,5,0.020306,0.027702,0.020774,0.042489,0.578112,0.166868,0.001141,0.000753
6,Ses01F_script02_2_F006,0,0.137206,0.178822,0.142271,0.252096,0.448783,0.169317,0.041644,0.032933
7,Ses01F_script02_2_F007,2,0.048793,0.069713,0.051051,0.107439,0.549911,0.167899,0.006873,0.006814
8,Ses01F_script02_2_F008,0,0.086686,0.144276,0.089184,0.2461,0.611379,0.169216,0.030604,0.032553


In [5]:
# 80/20 train/test split of the audio feature table.
# - random_state pins the shuffle so the CSVs written below (and the text
#   files derived from x_train / x_test later) are reproducible across reruns.
# - stratify keeps the class proportions equal in both splits, which matters
#   because the label distribution is imbalanced.
x_train, x_test = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
    stratify=df['label'],
)

x_train.to_csv('data/s2e/audio_train.csv', index=False)
x_test.to_csv('data/s2e/audio_test.csv', index=False)

print(x_train.shape, x_test.shape)

(7837, 10) (1960, 10)


## Define preprocessing functions for text

In [6]:
import unicodedata

def unicodeToAscii(s):
    """Return `s` with combining marks (accents, diacritics) removed.

    The string is decomposed with Unicode NFD normalisation, then every
    character whose category is 'Mn' (nonspacing mark) is dropped. Characters
    that have no decomposition are returned unchanged, even if non-ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalise a transcription for text modelling.

    Steps: lowercase and strip the input, remove accents via unicodeToAscii,
    put a space before each of `.`, `!`, `?` so they become separate tokens,
    collapse every run of other non-letter characters into a single space, and
    finally strip again.

    The final strip is needed because the substitutions can reintroduce edge
    whitespace (e.g. "[LAUGHTER]" would otherwise become " laughter ").

    Note: relies on the module-level `re` import, which in this notebook lives
    in the "Build Text data files" cell; run that cell before calling this.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

## Build Text data files

In [7]:
import re
import os
import pickle

# Matches the leading run of word characters on a line, used as the audio code.
useful_regex = re.compile(r'^(\w+)', re.IGNORECASE)

# Maps audio code -> raw transcription text.
file2transcriptions = {}

for sess in range(1, 6):
    transcripts_path = 'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)
    # os.listdir() returns names in arbitrary order; sort so reruns process the
    # files (and resolve any repeated key) in the same order.
    transcript_files = sorted(os.listdir(transcripts_path))
    for transcript_name in transcript_files:
        with open('{}{}'.format(transcripts_path, transcript_name), 'r') as transcript_file:
            all_lines = transcript_file.readlines()

        for line in all_lines:
            code_match = useful_regex.match(line)
            if code_match is None:
                # Blank lines, or lines not starting with a word character,
                # carry no audio code; skip them instead of raising.
                continue
            audio_code = code_match.group()
            # Split on the FIRST colon only, so a colon inside the spoken text
            # does not truncate the transcription.
            # Assumes the "<code> [timestamps]" prefix contains no colon --
            # TODO confirm against the transcript files.
            transcription = line.split(':', 1)[-1].strip()
            # Keys are assumed unique; a repeated key keeps the last value seen.
            file2transcriptions[audio_code] = transcription
# save dict
with open('data/t2e/audiocode2text.pkl', 'wb') as file:
    pickle.dump(file2transcriptions, file)
len(file2transcriptions)

10087

In [8]:
# Reload the audio-code -> transcription mapping written by the previous cell.
# The context manager closes the file handle (the bare open() call leaked it).
# pickle.load can execute arbitrary code, so only load files this notebook wrote.
with open('data/t2e/audiocode2text.pkl', 'rb') as pkl_file:
    audiocode2text = pickle.load(pkl_file)

In [9]:
# Prepare text data
def _attach_transcriptions(split_df):
    """Return wav_file/label from `split_df` plus a normalised transcription.

    Each wav_file code is looked up in `audiocode2text` (a missing code raises
    KeyError) and the text is cleaned with `normalizeString`.
    """
    text_df = split_df[['wav_file', 'label']].copy()
    text_df['transcription'] = [
        normalizeString(audiocode2text[code]) for code in split_df['wav_file']
    ]
    return text_df


# Mirror the audio train/test split so both modalities use the same utterances.
text_train = _attach_transcriptions(x_train)
text_test = _attach_transcriptions(x_test)

text_train.to_csv('data/t2e/text_train.csv', index=False)
text_test.to_csv('data/t2e/text_test.csv', index=False)

print(text_train.shape, text_test.shape)

(7837, 3) (1960, 3)
