In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display
import noisereduce
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the label table; the 'wav_filename' and 'emotion' columns are read below.
lab_df = pd.read_csv('Summary_Emo_Eval.csv')

# Label remapping: 'xxx' is filed under 'oth' (other) and 'exc' (excited) is
# merged into 'hap' (happy). Every other label is kept unchanged.
emotion_aliases = {'xxx': 'oth', 'exc': 'hap'}

# Map each wav file name to its (possibly remapped) emotion label.
lab_dic = {
    wav_name: emotion_aliases.get(emotion, emotion)
    for wav_name, emotion in zip(lab_df['wav_filename'], lab_df['emotion'])
}

In [None]:
dataset_dir = 'IEMOCAP/'

# Only utterances carrying one of these labels are kept.
target_emotions = ('ang', 'hap', 'sad', 'neu')

Y = []          # one single-element list [label] per kept utterance
mfcc_list = []  # one 40-value feature vector per kept utterance
unlabeled = []  # wav files that have no entry in lab_dic

for session in [1]:#range(1,6)
    wav_sess_dir = dataset_dir + f'Session{session}/sentences/wav/'
    # os.listdir returns entries in arbitrary order. Sorting makes the sample
    # order, and therefore any seeded split built from it, reproducible.
    dialog_names = sorted(os.listdir(wav_sess_dir))
    for dialog in dialog_names:
        if dialog.startswith('.'):
            continue
        wav_dialog_dir = wav_sess_dir+dialog+'/'
        sentences_names = sorted(os.listdir(wav_dialog_dir))
        for sentence in sentences_names:
            # Skip hidden files and anything that is not a wav file.
            if sentence.startswith('.') or not sentence.endswith('wav'):
                continue
            # Labels are keyed by the file name without its 4-character extension.
            emotion = lab_dic.get(sentence[:-4])
            if emotion is None:
                # A missing label should not abort the whole extraction pass;
                # remember the file so the omission is reported below.
                unlabeled.append(sentence)
                continue
            if emotion not in target_emotions:
                continue
            label = [emotion]
            wav_sentence_path = wav_dialog_dir+sentence
            # sr=None keeps the file's native sampling rate.
            waveform, sr = librosa.load(wav_sentence_path, sr=None)
            # Average each of the 40 MFCC coefficients over all frames, giving
            # one fixed-length vector per utterance.
            mfcc = np.mean(librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=40).T, axis=0)
            Y.append(label)
            mfcc_list.append(mfcc.tolist())

if unlabeled:
    print(f'Skipped {len(unlabeled)} wav file(s) with no entry in lab_dic')

In [None]:
# Stack the per-utterance feature vectors into a 2-D array. The features are
# collected in `mfcc_list`; no variable named `X` is defined in this notebook.
X_all = np.array(mfcc_list)

# One-hot encode the labels. The encoder is fitted on the four target classes
# explicitly, so the column layout does not depend on which labels occur in Y.
enc = OneHotEncoder()
enc.fit([['ang'],['hap'],['sad'],['neu']])
Y_all = enc.transform(Y).toarray()

# 80% train, then the remaining 20% is halved into validation and test (10% each).
Xtrain, Xvaltest, Ytrain, Yvaltest = train_test_split(X_all, Y_all, test_size=0.2, random_state=42, shuffle=True)
Xval, Xtest, Yval, Ytest = train_test_split(Xvaltest, Yvaltest, test_size=0.5, random_state=42, shuffle=True)

# Insert a singleton axis at position 1 (the channel axis expected by Conv1d).
Xtrain_c = np.expand_dims(Xtrain, axis=1)
Xval_c = np.expand_dims(Xval, axis=1)
Xtest_c = np.expand_dims(Xtest, axis=1)

In [None]:
class NNModel(nn.Module):
    """1-D convolutional classifier that produces 4 class scores per input.

    The network stacks six ``Conv1d`` layers (kernel 5, padding 2, so each
    convolution preserves the sequence length) with three ``MaxPool1d(2)``
    stages, followed by a single linear layer. The linear layer expects 640
    features, i.e. 128 channels x 5 positions, which is what a single-channel
    input of length 40 is reduced to after three halvings.

    ``forward`` returns raw, unnormalized logits. ``nn.CrossEntropyLoss``
    applies log-softmax internally, so feeding it softmax outputs would
    normalize twice and flatten the gradients. Call ``predict_proba`` when
    probabilities are needed.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 256, 5, padding=2)
        self.bn1 = nn.BatchNorm1d(256)
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(256, 128, 5, padding=2)
        self.bn2 = nn.BatchNorm1d(128)
        self.drop = nn.Dropout(0.1)
        self.pool2 = nn.MaxPool1d(2)
        self.conv3 = nn.Conv1d(128, 128, 5, padding=2)
        self.bn3 = nn.BatchNorm1d(128)
        self.pool3 = nn.MaxPool1d(2)
        self.conv4 = nn.Conv1d(128, 128, 5, padding=2)
        self.conv5 = nn.Conv1d(128, 128, 5, padding=2)
        self.bn4 = nn.BatchNorm1d(128)
        self.conv6 = nn.Conv1d(128, 128, 5, padding=2)
        self.fc = nn.Linear(640, 4)
        # Not used by forward(); kept as an attribute and used by predict_proba().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits of shape (batch, 4).

        Args:
            x: float tensor of shape (batch, 1, length); the length must pool
               down to 5 positions (e.g. 40) to match the linear layer.
        """
        # Block 1: conv -> batch norm -> ReLU -> pool.
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        # Block 2: same, with dropout before pooling.
        x = self.pool2(self.drop(F.relu(self.bn2(self.conv2(x)))))
        # Block 3.
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        # Three further convolutions without pooling; only conv5 is batch-normalized.
        x = F.relu(self.conv4(x))
        x = F.relu(self.bn4(self.conv5(x)))
        x = F.relu(self.conv6(x))
        # Flatten (channels, positions) into one feature vector per sample.
        x = torch.flatten(x, 1)
        return self.fc(x)

    def predict_proba(self, x):
        """Return softmax class probabilities of shape (batch, 4).

        The caller is responsible for ``eval()`` / ``torch.no_grad()`` when
        this is used for inference.
        """
        return self.softmax(self.forward(x))

net = NNModel()

In [None]:
def _make_dataset_and_loader(features, targets, batch_size, shuffle=False, generator=None):
    """Wrap two tensors in a TensorDataset and a DataLoader over it.

    Returns:
        (dataset, loader) tuple.
    """
    dataset = torch.utils.data.TensorDataset(features, targets)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2,
        generator=generator,
    )
    return dataset, loader


# Convert the NumPy splits to float32 tensors.
Xtrain_ts = torch.tensor(Xtrain_c, dtype=torch.float32)
Ytrain_ts = torch.tensor(Ytrain, dtype=torch.float32)

Xval_ts = torch.tensor(Xval_c, dtype=torch.float32)
Yval_ts = torch.tensor(Yval, dtype=torch.float32)

Xtest_ts = torch.tensor(Xtest_c, dtype=torch.float32)
Ytest_ts = torch.tensor(Ytest, dtype=torch.float32)

batch_size = 4

# Training batches are reshuffled every epoch so SGD does not see the same
# mini-batches in the same order each time. The seeded generator makes the
# batch order repeatable between runs.
shuffle_rng = torch.Generator().manual_seed(42)
train_set, train_loader = _make_dataset_and_loader(
    Xtrain_ts, Ytrain_ts, batch_size, shuffle=True, generator=shuffle_rng
)

# Evaluation loaders keep a fixed order.
val_set, val_loader = _make_dataset_and_loader(Xval_ts, Yval_ts, batch_size)
test_set, test_loader = _make_dataset_and_loader(Xtest_ts, Ytest_ts, batch_size)

In [None]:
def _evaluate(model, loader, loss_fn):
    """Run `model` over `loader` without gradients.

    The model is put in eval mode for the pass and switched back to train
    mode afterwards, even if the pass raises.

    Returns:
        (total_loss, n_correct): the loss summed over samples (each batch mean
        weighted by its batch size) and the number of samples whose arg-max
        prediction matches the arg-max of the one-hot target.
    """
    total_loss = 0.0
    n_correct = 0
    model.eval()
    try:
        with torch.no_grad():
            for inputs, targets in loader:
                outputs = model(inputs)
                total_loss += loss_fn(outputs, targets).item() * inputs.size(0)
                n_correct += (outputs.argmax(dim=1) == targets.argmax(dim=1)).sum().item()
    finally:
        model.train()
    return total_loss, n_correct


optimizer = torch.optim.SGD(net.parameters(), lr=0.0001, momentum=0.9, weight_decay=0.00001)
criterion = nn.CrossEntropyLoss()

epochs = 1000

for epoch in range(1, epochs+1):
    train_loss = 0.0
    train_acc = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # criterion returns the mean over the batch; weight it by the batch
        # size so the epoch total can be divided by the number of samples.
        train_loss += loss.item() * inputs.size(0)
        # Targets are one-hot, so their arg-max is the class index.
        train_acc += (outputs.argmax(dim=1) == targets.argmax(dim=1)).sum().item()
    # Evaluate and report on the first epoch and then every 10th epoch.
    if epoch % 10 == 0 or epoch == 1:
        val_loss, val_acc = _evaluate(net, val_loader, criterion)
        _, test_acc = _evaluate(net, test_loader, criterion)
        Train_loss = train_loss/(len(train_set))
        Train_acc = 100*train_acc/(len(train_set))
        Val_loss = val_loss/(len(val_set))
        Val_acc = 100*val_acc/(len(val_set))
        Test_acc = 100*test_acc/(len(test_set))
        print(f'Epoch {epoch:5d}:')
        print(f'Training Loss {Train_loss:.3f}; Training Acc {Train_acc:.3f}%; Validation Loss {Val_loss:.3f}; Validation Acc {Val_acc:.3f}%; Test Acc {Test_acc:.3f}%')
print('Finished training')

# After training, pick the epoch to keep based on validation accuracy
# (not test accuracy).
# TODO:
#   - plot the loss/accuracy curves
#   - compute the confusion matrix
