## **Data Pre-processing Notebook** ##

### Setting up the environment ###

In [3]:
from google.colab import drive
import re
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import matplotlib.style as ms
from tqdm import tqdm
import pickle
import librosa
import soundfile as sf
import random

**Mounting Google Drive to access Dataset**


In [4]:
#drive.mount('/content/gdrive/', force_remount=True)
%cd /content/drive/MyDrive/SER/IEMOCAP_full_release
%ls

/content/drive/.shortcut-targets-by-id/1N67nrpGp2OxkpPngxjjiYJ-jGZOGMyEv/IEMOCAP_full_release
df_iemocap.csv  [0m[01;34mPreprocessed[0m/  README.txt~  [01;34mSession2[0m/  [01;34mSession4[0m/
[01;34mDocumentation[0m/  README.txt     [01;34mSession1[0m/    [01;34mSession3[0m/  [01;34mSession5[0m/


### **Extract labels from Evaluation file**


/content/drive/.shortcut-targets-by-id/1N67nrpGp2OxkpPngxjjiYJ-jGZOGMyEv/IEMOCAP_full_release

**Extract** labels from Evaluation Files

In [None]:
info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)

start_times, end_times, wav_file_names, emotions, vals, acts, doms = [], [], [], [], [], [], []

for sess in range(1, 6):
    emo_evaluation_dir = 'Session{}/dialog/EmoEvaluation/'.format(sess)
    evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l]
    for file in evaluation_files:
        with open(emo_evaluation_dir + file) as f:
            content = f.read()
        info_lines = re.findall(info_line, content)
        for line in info_lines[1:]:  # the first line is a header
            start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\t')
            start_time, end_time = start_end_time[1:-1].split('-')
            val, act, dom = val_act_dom[1:-1].split(',')
            val, act, dom = float(val), float(act), float(dom)
            start_time, end_time = float(start_time), float(end_time)
            start_times.append(start_time)
            end_times.append(end_time)
            wav_file_names.append(wav_file_name)
            emotions.append(emotion)
            vals.append(val)
            acts.append(act)
            doms.append(dom)

In [None]:
df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])

df_iemocap['start_time'] = start_times
df_iemocap['end_time'] = end_times
df_iemocap['wav_file'] = wav_file_names
df_iemocap['emotion'] = emotions
df_iemocap['val'] = vals
df_iemocap['act'] = acts
df_iemocap['dom'] = doms
df_iemocap.tail()
df_iemocap.to_csv('df_iemocap.csv', index=False)

### **Build Audio Vectors**



Now that the labels have been extracted, we'll use the compiled csv (df_iemocap.csv) to split the original wav files into multiple frames

In [None]:
labels_df = pd.read_csv('Preprocessed/df_iemocap.csv')
iemocap_dir = os.getcwd()+'/'

In [None]:
sr = 44100
audio_vectors = {}
for sess in [2]:  # using all 5 sessions
    wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess)
    orig_wav_files = os.listdir(wav_file_path)
    for orig_wav_file in tqdm(orig_wav_files):
        try:
            orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, sr=sr)
            orig_wav_file, file_format = orig_wav_file.split('.')
            for index, row in labels_df[labels_df['wav_file'].str.contains(orig_wav_file)].iterrows():
                start_time, end_time, truncated_wav_file_name, emotion, val, act, dom = row['start_time'], row['end_time'], row['wav_file'], row['emotion'], row['val'], row['act'], row['dom']
                start_frame = math.floor(start_time * sr)
                end_frame = math.floor(end_time * sr)
                truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1]
                audio_vectors[truncated_wav_file_name] = truncated_wav_vector
        except:
            print('An exception occured for {}'.format(orig_wav_file))
    with open('Preprocessed/audio_vectors_{}.pkl'.format(sess), 'wb') as f:
        pickle.dump(audio_vectors, f)

## Extract Features 

In [None]:
import sys
columns = ['wav_file', 'label', 'sig_mean', 'sig_std', 'rmse_mean', 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 'auto_corr_std']
df_features = pd.DataFrame(columns=columns)

emotion_dict = {'ang': 0,
                'hap': 1,
                'exc': 2,
                'sad': 3,
                'fru': 4,
                'fea': 5,
                'sur': 6,
                'neu': 7,
                'xxx': 8,
                'oth': 8}

for sess in tqdm(range(5, 6)):
    labels_df = pd.read_csv('Preprocessed/df_iemocap.csv')
    iemocap_dir = os.getcwd()+'/'
    data_dir = iemocap_dir+'Preprocessed/'
    labels_path = '{}df_iemocap.csv'.format(data_dir)
    audio_vectors_path = '{}audio_vectors_'.format(data_dir)
    audio_vectors = pickle.load(open('{}{}.pkl'.format(audio_vectors_path, sess), 'rb'))
    # Extract the wav file names from CSV File that starts with Ses0{1-5}
    labels_subset = labels_df[labels_df['wav_file'].str.contains('Ses0{}'.format(sess))] 
     
    feature_list = []
    
    def extract_features(row):
        try:
            wav_file_name = row['wav_file']
            label = emotion_dict[row['emotion']]
            wav = audio_vectors[wav_file_name]

            feature_list = [wav_file_name, label]  # wav_file, label
            sig_mean = np.mean(abs(wav))
            feature_list.append(sig_mean)  # sig_mean
            feature_list.append(np.std(wav))  # sig_std
            rmse = librosa.feature.rms(y=wav + 0.0001)[0]
            feature_list.append(np.mean(rmse))  # rmse_mean
            feature_list.append(np.std(rmse))  # rmse_std

            silence = 0
            for e in rmse:
                if e <= 0.4 * np.mean(rmse):
                    silence += 1
            silence /= float(len(rmse))
            feature_list.append(silence)  # silence

            y_harmonic = librosa.effects.hpss(wav)[0]
            feature_list.append(np.mean(y_harmonic) * 1000)  # harmonic (scaled by 1000)

            cl = 0.45 * sig_mean
            center_clipped = []
            for s in wav:
                if s >= cl:
                    center_clipped.append(s - cl)
                elif s <= -cl:
                    center_clipped.append(s + cl)
                elif np.abs(s) < cl:
                    center_clipped.append(0)
            auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
            feature_list.append(1000 * np.max(auto_corrs)/len(auto_corrs))  # auto_corr_max (scaled by 1000)
            feature_list.append(np.std(auto_corrs))  # auto_corr_std

            return pd.Series(feature_list, index=columns)
        except:
            return pd.Series([None]*len(columns), index=columns)

    # Making a feature subset by applying the Extract Features function to each record
    features_subset = labels_subset.apply(extract_features, axis=1)
    features_subset = features_subset.dropna()
    df_features = pd.concat([df_features, features_subset], ignore_index=True)
    output_path=data_dir+'/audio_features.csv'
    #Store records in a csv file
    df_features.to_csv(output_path, mode='a', header=False)
    
#df_features.to_csv(data_dir+'/audio_features.csv', index=False)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = r

## Build Speech Data Files

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [5]:
df = pd.read_csv('Preprocessed/audio_features.csv')
df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])]
print(df.shape)
display(df.head())

# change 7 to 2
df['label'] = df['label'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5})
df.head()

(11701, 11)


Unnamed: 0.1,Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,0,Ses01F_script03_2_F000,1.0,0.008801,0.013566,0.010886,0.008046,0.245033,-0.011023,0.12753,0.470876
1,1,Ses01F_script03_2_F001,1.0,0.012073,0.022863,0.014738,0.017455,0.458689,-0.012141,0.4142,1.523709
2,2,Ses01F_script03_2_F002,1.0,0.017942,0.031342,0.022245,0.022012,0.40303,-0.006731,0.742588,2.397591
4,4,Ses01F_script03_2_F004,7.0,0.019325,0.034797,0.023926,0.025248,0.336756,-0.006738,0.933833,3.296841
5,5,Ses01F_script03_2_F005,7.0,0.007106,0.012647,0.008764,0.00907,0.511364,-0.008334,0.122458,0.347454


Unnamed: 0.1,Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,0,Ses01F_script03_2_F000,1,0.008801,0.013566,0.010886,0.008046,0.245033,-0.011023,0.12753,0.470876
1,1,Ses01F_script03_2_F001,1,0.012073,0.022863,0.014738,0.017455,0.458689,-0.012141,0.4142,1.523709
2,2,Ses01F_script03_2_F002,1,0.017942,0.031342,0.022245,0.022012,0.40303,-0.006731,0.742588,2.397591
4,4,Ses01F_script03_2_F004,5,0.019325,0.034797,0.023926,0.025248,0.336756,-0.006738,0.933833,3.296841
5,5,Ses01F_script03_2_F005,5,0.007106,0.012647,0.008764,0.00907,0.511364,-0.008334,0.122458,0.347454


In [6]:
scalar = MinMaxScaler()
df[df.columns[3:]] = scalar.fit_transform(df[df.columns[3:]])
df.head()

Unnamed: 0.1,Unnamed: 0,wav_file,label,sig_mean,sig_std,rmse_mean,rmse_std,silence,harmonic,auto_corr_max,auto_corr_std
0,0,Ses01F_script03_2_F000,1,0.028859,0.034118,0.029388,0.043543,0.318727,0.170701,0.001539,0.001662
1,1,Ses01F_script03_2_F001,1,0.040347,0.058862,0.040528,0.095167,0.596641,0.170483,0.005008,0.005383
2,2,Ses01F_script03_2_F002,1,0.060954,0.081425,0.062243,0.120174,0.524242,0.171538,0.008982,0.008472
4,4,Ses01F_script03_2_F004,5,0.06581,0.090619,0.067105,0.13793,0.438036,0.171537,0.011296,0.011651
5,5,Ses01F_script03_2_F005,5,0.022907,0.031672,0.023248,0.049163,0.665157,0.171225,0.001478,0.001226


In [7]:
x_train, x_test = train_test_split(df, test_size=0.20)

x_train.to_csv('Preprocessed/s2e/audio_train.csv', index=False)
x_test.to_csv('Preprocessed/s2e/audio_test.csv', index=False)

print(x_train.shape, x_test.shape)

(9360, 11) (2341, 11)


## Define preprocessing functions for text

In [8]:
import unicodedata

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

## Build Text Features

In [9]:

useful_regex = re.compile(r'^(\w+)', re.IGNORECASE) #RE for Alphanumeric 

file2transcriptions = {}

for sess in range(1, 6):
    transcripts_path = os.getcwd()+'/Session{}/dialog/transcriptions/'.format(sess)
    transcript_files = os.listdir(transcripts_path)
    for f in transcript_files:
        with open('{}{}'.format(transcripts_path, f), 'r') as f:
            all_lines = f.readlines()

        for l in all_lines:
            audio_code = useful_regex.match(l).group()
            print(audio_code)
            transcription = l.split(':')[-1].strip()
            # assuming that all the keys would be unique and hence no `try`
            file2transcriptions[audio_code] = transcription
# save dict
with open('Preprocessed/t2e/audiocode2text.pkl', 'wb') as file:
    pickle.dump(file2transcriptions, file)
len(file2transcriptions)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Ses03F_script03_1_M014
Ses03F_script03_1_F013
Ses03F_script03_1_M015
Ses03F_script03_1_F014
Ses03F_script03_1_M016
Ses03F_script03_1_F015
Ses03F_script03_1_M017
Ses03F_script03_1_F016
Ses03F_script03_1_M018
Ses03F_script03_1_F017
Ses03F_script03_1_M019
Ses03F_script03_1_M020
Ses03F_script03_1_F018
Ses03F_script03_1_M021
Ses03F_script03_1_F019
Ses03F_script03_1_M022
Ses03F_script03_1_F020
Ses03F_script03_1_M023
Ses03F_script03_1_F021
Ses03F_script03_1_M024
Ses03F_script03_1_F022
Ses03F_script03_1_M025
Ses03F_script03_1_F023
Ses03F_script03_1_M026
Ses03F_script03_1_F024
Ses03F_script03_1_F025
Ses03F_script03_1_M027
Ses03F_script03_1_F026
Ses03F_script03_1_M028
Ses03F_script03_1_F027
Ses03F_script03_1_M029
Ses03F_script03_1_M030
Ses03F_script03_1_F028
Ses03F_script03_1_M031
Ses03M_script02_1_M000
Ses03M_script02_1_M001
Ses03M_script02_1_M002
Ses03M_script02_1_F000
Ses03M_script02_1_M003
Ses03M_script02_1_M004
Ses03M_script02

10087

In [10]:
audiocode2text = pickle.load(open('Preprocessed/t2e/audiocode2text.pkl', 'rb'))
# Prepare text data
text_train = pd.DataFrame()
text_train['wav_file'] = x_train['wav_file']
text_train['label'] = x_train['label']
text_train['transcription'] = [normalizeString(audiocode2text[code]) for code in x_train['wav_file']]

text_test = pd.DataFrame()
text_test['wav_file'] = x_test['wav_file']
text_test['label'] = x_test['label']
text_test['transcription'] = [normalizeString(audiocode2text[code]) for code in x_test['wav_file']]

text_train.to_csv('Preprocessed/t2e/text_train.csv', index=False)
text_test.to_csv('Preprocessed/t2e/text_test.csv', index=False)

print(text_train.shape, text_test.shape)

(9360, 3) (2341, 3)
