# Open Set Emotion Recognition

## Library Imports

In [15]:
import warnings
warnings.filterwarnings("ignore")
import sys
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
%matplotlib inline
from collections import defaultdict
import torch.nn as nn
import os
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
tqdm.pandas()
import librosa
import re
from collections import Counter
import torch
from torchvision import models, transforms
from PIL import Image
from sentence_transformers import SentenceTransformer

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

Using device: cuda


## Dataset Creation

### MELD

In [16]:
class MELDDataset(Dataset):
    def __init__(self, meld_dir, split, transform=None):
        train_df = pd.read_csv("../MELD_Dataset/train_sent_emo.csv")
        labels = train_df['Emotion'].unique().tolist()
        self.label_to_int = {label: i for i, label in enumerate(labels)}

        self.meld_dir = meld_dir
        self.transform = transform
        self.split = split
        self.img_path = os.path.join(self.meld_dir, 'mel_spectrograms', f'{self.split}_img')
        self.img_path = os.path.join(self.meld_dir, 'log_spectrogram', f'{self.split}_audio')

        # load and create sentence embeddings
        self.dialogues = self.load_dialogues()
        self.sbert = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
        sentences = self.dialogues['Utterance'].tolist()
        sentences = [text.replace("\x92", "'") for text in sentences]
        self.sentence_embeddings = self.sbert.encode(sentences, convert_to_tensor=True, show_progress_bar=True, batch_size=128, device=device)

        self.spectrograms = self.load_spectrograms()
        self.resnet_model = models.resnet50(pretrained=True)
        self.feature_extractor = torch.nn.Sequential(*list(self.resnet_model.children())[:-1]).to(device)
        self.feature_extractor.eval()

    def load_dialogues(self):
        dialogue_file = os.path.join(self.meld_dir, f'{self.split}_sent_emo.csv')
        dialogues = pd.read_csv(dialogue_file)
        return dialogues

    def load_spectrograms(self):
        images = os.listdir(self.img_path)
        return images

    def __len__(self):
        assert(len(self.sentence_embeddings) == len(self.spectrograms))
        return len(self.dialogues)

    def preprocess_img(self, img):
        preprocessor = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ])
        img_t =  preprocessor(img).to(device)
        return img_t

    def extract_audio_features_from_spectrogram(self, img):
        # Pass the input through the model
        with torch.no_grad():
            output = self.feature_extractor(img)
        return output

    def __getitem__(self, idx):
        row = self.dialogues.iloc[idx]
        text = self.sentence_embeddings[idx]
        spectrogram_data = Image.open(os.path.join(self.img_path, f'dia{row["Dialogue_ID"]}_utt{row["Utterance_ID"]}.png'))
        spectrogram_data = self.preprocess_img(spectrogram_data)
        spectrogram_data = spectrogram_data[0:3, :, :]
        spectrogram_data = spectrogram_data.unsqueeze(0)
        spectrogram_data = self.extract_audio_features_from_spectrogram(spectrogram_data)
        spectrogram_data = spectrogram_data.view(-1, 2048)[0]
        label = row['Emotion']
        label = torch.tensor(self.label_to_int[label])
        return text, spectrogram_data, label

train_meld = MELDDataset("../MELD_Dataset", "train")
# test_meld = MELDDataset("../MELD_Dataset", "test")
# dev_meld = MELDDataset("../MELD_Dataset", "dev")

# concat all 3 datasets into 1 dataset
meld_dataset = train_meld # + test_meld + dev_meld

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

In [18]:
len(meld_dataset)

AssertionError: 

#### IEMOCAP

In [None]:
class IemocapDataset(Dataset):
    def __init__(self, iemocap_dataset_full_path, transform=None):
        self.IEMOCAP_MAIN_FOLDER = os.path.join(iemocap_dataset_full_path, "IEMOCAP_full_release")
        self.TRANSCRIPTION_FOLDER = os.path.join("dialog", "transcriptions")
        self.AUDIO_FOLDER = os.path.join("sentences", "wav")
        self.CATEGORICAL_LABELS_PATH = os.path.join("dialog", "EmoEvaluation", "Categorical")
        self.transform = transform

        self.errors = defaultdict(int)
        self.dataset = self.create_dataset()
        self.print_summary()

    def get_evaluator_filenames_with_video_file_prefix(self, input_list, prefix_value):
        regex_pattern = re.compile(f'^{re.escape(prefix_value)}.*\.txt$')
        matching_strings = [s for s in input_list if regex_pattern.match(s)]
        return matching_strings

    def get_utterance_to_evaluationCounter_mapping_from_evaluation_files(self, evaluation_files):
        utterance_to_all_evaluations = {}

        for evaluation_file in evaluation_files:
            utterance_to_evaluationList = {}
            with open(evaluation_file,'r') as f:
                contents = f.read()
                utterance_evaluations = contents.split("\n")
                for evaluation in utterance_evaluations:
                    evaluation = evaluation.strip()
                    if len(evaluation) == 0:
                        continue
                    matches = re.findall(r':[^;]+;', evaluation)
                    matches = [match[1:-1] for match in matches]
                    utterance_to_evaluationList[evaluation.split()[0]] = matches

            # Combine lists from dict1
            for key, value_list in utterance_to_evaluationList.items():
                utterance_to_all_evaluations[key] = utterance_to_all_evaluations.get(key, []) + value_list

        utterance_to_evaluationsCounter = {k:Counter(v).most_common(1)[0][0] for k,v in utterance_to_all_evaluations.items()}
        return utterance_to_evaluationsCounter

    def create_dataset(self):
        dataset = []
        for session_num in range(1,6):
            for transcription_filename in os.listdir(os.path.join(self.IEMOCAP_MAIN_FOLDER,f"Session{session_num}", self.TRANSCRIPTION_FOLDER)):
                if transcription_filename[0] != ".":
                    filename_without_extension = transcription_filename.split(".")[0]

                    categorical_labels_folder_full_path = os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}", self.CATEGORICAL_LABELS_PATH)
                    evaluation_filenames = self.get_evaluator_filenames_with_video_file_prefix(os.listdir(categorical_labels_folder_full_path), filename_without_extension)
                    evaluation_files_full_paths_for_this_file = [os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}", self.CATEGORICAL_LABELS_PATH, f) for f in evaluation_filenames]
                    evaluations_per_utterance = self.get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files_full_paths_for_this_file)

                    transcription_file_full_path = os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}", self.TRANSCRIPTION_FOLDER, transcription_filename)
                    with open(transcription_file_full_path,'r') as f:
                        contents = f.read()
                        lines = contents.split("\n")

                        # Iterate through utterances where every utterance looks like:
                        # Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.
                        for line in lines:

                            # Remove extra spaces and check if the line is not an empty link (usually at EOF)
                            line = line.strip()
                            if(len(line)==0):
                                break

                            # Remove idx of first space, ], -
                            try:
                                space_idx = line.index(" ")
                                timestampEndBracket_idx = line.index("]")
                                timestampHyphen_idx = line.index("-")
                            except:
                                self.errors["Problematic Transcription Line"]+=1
                                continue
                            else:
                                audio_filename = line[:space_idx]        # output audio file name = utterance name
                                text = line[timestampEndBracket_idx+3:]         # the transcription of the utterance
                                evaluation = evaluations_per_utterance.get(audio_filename,"KEY_ERROR")
                                if(evaluation=="KEY_ERROR"):
                                    self.errors["Unavailable Label for an utterance"]+=1

                                utterance_audios_per_video_folder = audio_filename[:line.rindex('_')]       # Only need Ses01F_impro01 from Ses01F_impro01_F000
                                audio_file_full_path = os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}", self.AUDIO_FOLDER, utterance_audios_per_video_folder, audio_filename+".wav")         # name of the video file

                                if evaluation!="KEY_ERROR" and os.path.isfile(audio_file_full_path)==True:
                                    dataset.append((text,audio_file_full_path,evaluation))
        return dataset

    def print_summary(self):
        print("SUMMARY:\n")
        for k,v in self.errors.items():
            print(f"{k}: {v}")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text, audio, label = self.dataset[idx]

        if self.transform:
            audio[0] = self.transform(audio[0])
        return text, librosa.load(audio), label

iemocap_dataset = IemocapDataset("../IEMOCAP_Dataset")

In [None]:
iemocap_dataset[0]

## Model

In [10]:
class AudioTextEmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(AudioTextEmotionModel, self).__init__()
        ## sequential model with 2 layers, followed by dropout and relu layers and output layer
        self.fc = nn.Sequential(
            nn.Linear(2048 + 768, 1024),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(1024, num_classes)
        )


    def forward(self, audio, text):
        combined = torch.cat([audio, text], axis=1)
        return self.fc(combined)


model = AudioTextEmotionModel(7)
model.to(device)


AudioTextEmotionModel(
  (fc): Sequential(
    (0): Linear(in_features=2816, out_features=1024, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=1024, out_features=7, bias=True)
  )
)

## Loss Function and Optimizer
One final step before we can simply call `model.fit`

In [11]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

## Train!

In [12]:

# Create data loaders.
batch_size = 64
train_dataloader = DataLoader(meld_dataset, batch_size=batch_size)
# test_dataloader = DataLoader(iemocap_dataset, batch_size=batch_size)

In [13]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (text, spectrogram_data, label) in enumerate(dataloader):
        text, spectrogram_data, label = text.to(device), spectrogram_data.to(device), label.to(device)

        # Compute prediction error
        pred = model(spectrogram_data, text)
        loss = loss_fn(pred, label)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(text)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

train(train_dataloader, model, loss_fn, optimizer)


torch.Size([64, 2048]) torch.Size([64, 768])
loss: 1.986375  [   64/ 9989]
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])
torch.Size([64, 2048]) torch.Size([64, 768])


FileNotFoundError: [Errno 2] No such file or directory: '../MELD_Dataset\\log_spectrogram\\train_audio\\dia125_utt3.png'

## Evaluation

In [None]:
def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for text, audio, label in dataloader:
            audio = audio.to(device)
            text = text.to(device)
            label = label.to(device)
            outputs = model(audio, text)
            _, predicted = torch.max(outputs.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
    return correct / total

In [None]:
# learn pytorch basic with some basic models and datasets
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/nnqs_tutorial.html