# Open Set Emotion Recognition

## Library Imports

In [15]:
import warnings
warnings.filterwarnings("ignore")
import sys
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
%matplotlib inline
from collections import defaultdict
import torch.nn.functional as F
import torch.nn as nn
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
tqdm.pandas()
import librosa
import re
from collections import Counter
import torch
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt
from torchvision import models, transforms
from PIL import Image

## Dataset Creation

### MELD

In [26]:
class MELDDataset(Dataset):
    def __init__(self, meld_dir, split, transform=None):
        self.meld_dir = meld_dir
        self.transform = transform
        self.split = split
        self.audio_dir = os.path.join(self.meld_dir, f'{self.split}_audio')
        self.img_path = os.path.join(self.meld_dir, 'mel_spectograms', f'{self.split}_img')
        self.img_path = os.path.join(self.meld_dir, 'log_spectrogram', f'{self.split}_audio')
        self.audio_files = self.load_audio_files()
        self.dialogues = self.load_dialogues()
        self.spectograms = self.load_spectograms()


    def load_audio_files(self):
        audio_files = os.listdir(self.audio_dir)
        return audio_files

    def load_dialogues(self):
        dialogue_file = os.path.join(self.meld_dir, f'{self.split}_sent_emo.csv')
        dialogues = pd.read_csv(dialogue_file)
        return dialogues

    def load_spectograms(self):
        images = os.listdir(self.img_path)
        return images

    def __len__(self):
        return len(self.dialogues)

    def preprocess_img(self, img):
        preprocesser = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor()
        ])
        img_t =  preprocesser(img)
        return img_t

    def __getitem__(self, idx):
        row = self.dialogues.iloc[idx]
        text = row['Utterance']
        audio_data = librosa.load(os.path.join(self.audio_dir, f'dia{row["Dialogue_ID"]}_utt{row["Utterance_ID"]}.wav'))
        spectogram_data = Image.open(os.path.join(self.img_path, f'dia{row["Dialogue_ID"]}_utt{row["Utterance_ID"]}.png'))
        spectogram_data = self.preprocess_img(spectogram_data)
        label = row['Emotion']
        if self.transform:
            audio_data[0] = self.transform(audio_data[0])
        return text, audio_data, spectogram_data, label

train_meld = MELDDataset("../MELD_Dataset", "train")
# test_meld = MELDDataset("../MELD_Dataset", "test")
# dev_meld = MELDDataset("../MELD_Dataset", "dev")

# concat all 3 datasets into 1 dataset
meld_dataset = train_meld # + test_meld + dev_meld

In [27]:
meld_dataset[0]

('also I was the point person on my company\x92s transition from the KL-5 to GR-6 system.',
 (array([-0.00198962, -0.02142129, -0.02587057, ..., -0.06124197,
         -0.06868309, -0.04373461], dtype=float32),
  22050),
 tensor([[[0.9529, 0.9333, 0.9529,  ..., 1.0000, 1.0000, 1.0000],
          [0.7490, 0.7020, 0.8235,  ..., 1.0000, 1.0000, 1.0000],
          [0.4000, 0.5490, 0.8549,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.8275, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [0.4000, 0.9961, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [0.3725, 0.8510, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.9529, 0.9333, 0.9529,  ..., 1.0000, 1.0000, 1.0000],
          [0.7490, 0.7020, 0.8235,  ..., 1.0000, 1.0000, 1.0000],
          [0.4000, 0.5490, 0.8549,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.8275, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [0.4000, 0.9961, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [0.3725, 0.

#### IEMOCAP

In [28]:
class IemocapDataset(Dataset):
    def __init__(self, iemocap_dataset_full_path, transform=None):
        self.IEMOCAP_MAIN_FOLDER = os.path.join(iemocap_dataset_full_path, "IEMOCAP_full_release")
        self.TRANSCRIPTION_FOLDER = os.path.join("dialog", "transcriptions")
        self.AUDIO_FOLDER = os.path.join("sentences", "wav")
        self.CATEGORICAL_LABELS_PATH = os.path.join("dialog", "EmoEvaluation", "Categorical")
        self.transform = transform

        self.errors = defaultdict(int)
        self.dataset = self.create_dataset()
        self.print_summary()

    def get_evaluator_filenames_with_video_file_prefix(self, input_list, prefix_value):
        regex_pattern = re.compile(f'^{re.escape(prefix_value)}.*\.txt$')
        matching_strings = [s for s in input_list if regex_pattern.match(s)]
        return matching_strings

    def get_utterance_to_evaluationCounter_mapping_from_evaluation_files(self, evaluation_files):
        utterance_to_all_evaluations = {}

        for evaluation_file in evaluation_files:
            utterance_to_evaluationList = {}
            with open(evaluation_file,'r') as f:
                contents = f.read()
                utterance_evaluations = contents.split("\n")
                for evaluation in utterance_evaluations:
                    evaluation = evaluation.strip()
                    if len(evaluation) == 0:
                        continue
                    matches = re.findall(r':[^;]+;', evaluation)
                    matches = [match[1:-1] for match in matches]
                    utterance_to_evaluationList[evaluation.split()[0]] = matches

            # Combine lists from dict1
            for key, value_list in utterance_to_evaluationList.items():
                utterance_to_all_evaluations[key] = utterance_to_all_evaluations.get(key, []) + value_list

        utterance_to_evaluationsCounter = {k:Counter(v).most_common(1)[0][0] for k,v in utterance_to_all_evaluations.items()}
        return utterance_to_evaluationsCounter

    def create_dataset(self):
        dataset = []
        for session_num in range(1,6):
            for transcription_filename in os.listdir(os.path.join(self.IEMOCAP_MAIN_FOLDER,f"Session{session_num}", self.TRANSCRIPTION_FOLDER)):
                if transcription_filename[0] != ".":
                    filename_without_extension = transcription_filename.split(".")[0]

                    categorical_labels_folder_full_path = os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}", self.CATEGORICAL_LABELS_PATH)
                    evaluation_filenames = self.get_evaluator_filenames_with_video_file_prefix(os.listdir(categorical_labels_folder_full_path), filename_without_extension)
                    evaluation_files_full_paths_for_this_file = [os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}", self.CATEGORICAL_LABELS_PATH, f) for f in evaluation_filenames]
                    evaluations_per_utterance = self.get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files_full_paths_for_this_file)

                    transcription_file_full_path = os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}", self.TRANSCRIPTION_FOLDER, transcription_filename)
                    with open(transcription_file_full_path,'r') as f:
                        contents = f.read()
                        lines = contents.split("\n")

                        # Iterate through utterances where every utterance looks like:
                        # Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.
                        for line in lines:

                            # Remove extra spaces and check if the line is not an empty link (usually at EOF)
                            line = line.strip()
                            if(len(line)==0):
                                break

                            # Remove idx of first space, ], -
                            try:
                                space_idx = line.index(" ")
                                timestampEndBracket_idx = line.index("]")
                                timestampHyphen_idx = line.index("-")
                            except:
                                self.errors["Problematic Transcription Line"]+=1
                                continue
                            else:
                                audio_filename = line[:space_idx]        # output audio file name = utterance name
                                text = line[timestampEndBracket_idx+3:]         # the transcription of the utterance
                                evaluation = evaluations_per_utterance.get(audio_filename,"KEY_ERROR")
                                if(evaluation=="KEY_ERROR"):
                                    self.errors["Unavailable Label for an utterance"]+=1

                                utterance_audios_per_video_folder = audio_filename[:line.rindex('_')]       # Only need Ses01F_impro01 from Ses01F_impro01_F000
                                audio_file_full_path = os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}", self.AUDIO_FOLDER, utterance_audios_per_video_folder, audio_filename+".wav")         # name of the video file

                                if evaluation!="KEY_ERROR" and os.path.isfile(audio_file_full_path)==True:
                                    dataset.append((text,audio_file_full_path,evaluation))
        return dataset

    def print_summary(self):
        print("SUMMARY:\n")
        for k,v in self.errors.items():
            print(f"{k}: {v}")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text, audio, label = self.dataset[idx]

        if self.transform:
            audio[0] = self.transform(audio[0])
        return text, librosa.load(audio), label

iemocap_dataset = IemocapDataset("../IEMOCAP_Dataset")

SUMMARY:

Problematic Transcription Line: 152
Unavailable Label for an utterance: 48


In [7]:
iemocap_dataset[0]

('Excuse me.',
 (array([-0.00476289, -0.0055054 , -0.00418305, ..., -0.00345229,
         -0.0044057 , -0.00205744], dtype=float32), 22050),
 'Neutral state')

## Preprocessing

In [29]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states = True,
                                  )
model.eval()

def encode_sentence(sentence):
    encoded_dict = tokenizer.encode_plus(
                        sentence,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,  # Construct attention masks.
                        return_tensors = 'pt',
                   )

    with torch.no_grad():
        outputs = model(encoded_dict['input_ids'], encoded_dict['attention_mask'])
        hidden_states = outputs[2]

    token_vecs_cat = torch.stack(hidden_states[-4:], dim=0)
    token_vecs_cat = torch.mean(token_vecs_cat, 0)
    sentence_embedding = torch.mean(token_vecs_cat, 1)

    return torch.from_numpy(sentence_embedding[0].numpy())

def preprocess_text(text):
    # apostrophe ' is not rendered properly so replacing special character with apostrophe
    text = text.replace("\x92", "'")
    return encode_sentence(text)

In [30]:
def extract_audio_features_from_spectogram(img):
    model = models.resnet50(pretrained=True)
    feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])
    feature_extractor.eval()

    # Pass the input through the model
    with torch.no_grad():
        output = feature_extractor(img)
    return output

## Model

In [31]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

class AudioTextEmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(AudioTextEmotionModel, self).__init__()
        ## sequential model with 2 layers, followed by dropout and relu layers and output layer
        self.fc = nn.Sequential(
            nn.Linear(2048 + 768, 1024),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(1024, num_classes)
        )


    def forward(self, audio, text):
        audio_out = extract_audio_features_from_spectogram(audio)
        text_out = preprocess_text(text)
        combined = torch.cat([audio_out, text_out], dim=1)
        return self.fc(combined)


model = AudioTextEmotionModel(7)
model.to(device)


AudioTextEmotionModel(
  (fc): Sequential(
    (0): Linear(in_features=2816, out_features=1024, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=1024, out_features=7, bias=True)
  )
)

## Loss Function and Optimizer
One final step before we can simply call `model.fit`

In [32]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

## Train!

In [33]:
train_df = pd.read_csv("../MELD_Dataset/train_sent_emo.csv")
labels = train_df['Emotion'].unique().tolist()
label_to_int = {label: i for i, label in enumerate(labels)}

In [35]:
# Create data loaders.
batch_size = 64
train_dataloader = DataLoader(meld_dataset, batch_size=batch_size)
test_dataloader = DataLoader(iemocap_dataset, batch_size=batch_size)


def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (text, audio_data, spectogram_data, label) in enumerate(dataloader):
        print (text, audio_data, spectogram_data, label)
        label = torch.tensor(label_to_int[label])
        text, spectogram_data, label = text.to(device), spectogram_data.to(device), label.to(device)

        # Compute prediction error
        pred = model(spectogram_data, text)
        loss = loss_fn(pred, label)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

train(train_dataloader, model, loss_fn, optimizer)


RuntimeError: stack expects each tensor to be equal size, but got [125465] at entry 0 and [33075] at entry 1

## Evaluation

In [48]:
def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for text, audio, label in dataloader:
            audio = audio.to(device)
            text = text.to(device)
            label = label.to(device)
            outputs = model(audio, text)
            _, predicted = torch.max(outputs.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
    return correct / total

In [None]:
# learn pytorch basic with some basic models and datasets
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/nnqs_tutorial.html