# Open Set Emotion Recognition

## Library Imports

In [3]:
import warnings
warnings.filterwarnings("ignore")
import sys
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from collections import Counter, defaultdict
import torch.nn as nn
import os
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
tqdm.pandas()
from tqdm import tqdm
import librosa
import re
from collections import Counter
import torch
from torchvision import models, transforms
from PIL import Image
from scipy import signal
from scipy.io import wavfile
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchaudio
from transformers import HubertModel, HubertConfig
from sentence_transformers import SentenceTransformer
from functools import lru_cache

import utils

# Choose the compute backend: prefer CUDA, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Seed PyTorch's random number generator so runs are repeatable.
torch.manual_seed(42)
# Integer label given to every open-set sample (used in IemocapDataset.__getitem__).
OTHER_LABEL = 6

## Dataset Creation

### MELD

In [4]:
# class MELDDataset(Dataset):
#     def __init__(self, meld_dir, split, transform=None):
#         train_df = pd.read_csv("../MELD_Dataset/train_sent_emo.csv")
#         labels = train_df['Emotion'].unique().tolist()
#         self.label_to_int = {label: i for i, label in enumerate(labels)}

#         self.meld_dir = meld_dir
#         self.transform = transform
#         self.split = split
#         self.img_path = os.path.join(self.meld_dir, 'mel_spectrograms', f'{self.split}_img')
#         self.img_path = os.path.join(self.meld_dir, 'log_spectrogram', f'{self.split}_audio')

#         # load and create sentence embeddings
#         self.dialogues = self.load_dialogues()
#         self.sbert = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
#         sentences = self.dialogues['Utterance'].tolist()
#         sentences = [text.replace("\x92", "'") for text in sentences]
#         self.sentence_embeddings = self.sbert.encode(sentences, convert_to_tensor=True, show_progress_bar=True, batch_size=128, device=device)

#         self.spectrograms = self.load_spectrograms()
#         self.resnet_model = models.resnet50(pretrained=True)
#         self.feature_extractor = torch.nn.Sequential(*list(self.resnet_model.children())[:-1]).to(device)
#         self.feature_extractor.eval()

#     def load_dialogues(self):
#         dialogue_file = os.path.join(self.meld_dir, f'{self.split}_sent_emo.csv')
#         dialogues = pd.read_csv(dialogue_file)
#         return dialogues

#     def load_spectrograms(self):
#         images = os.listdir(self.img_path)
#         return images

#     def __len__(self):
#         assert(len(self.sentence_embeddings) == len(self.spectrograms))
#         return len(self.dialogues)

#     def preprocess_img(self, img):
#         preprocessor = transforms.Compose([
#             transforms.Resize(256),
#             transforms.CenterCrop(224),
#             transforms.ToTensor(),
#         ])
#         img_t =  preprocessor(img).to(device)
#         return img_t

#     def extract_audio_features_from_spectrogram(self, img):
#         # Pass the input through the model
#         with torch.no_grad():
#             output = self.feature_extractor(img)
#         return output

#     def __getitem__(self, idx):
#         row = self.dialogues.iloc[idx]
#         text = self.sentence_embeddings[idx]
#         spectrogram_data = Image.open(os.path.join(self.img_path, f'dia{row["Dialogue_ID"]}_utt{row["Utterance_ID"]}.png'))
#         spectrogram_data = self.preprocess_img(spectrogram_data)
#         spectrogram_data = spectrogram_data[0:3, :, :]
#         spectrogram_data = spectrogram_data.unsqueeze(0)
#         spectrogram_data = self.extract_audio_features_from_spectrogram(spectrogram_data)
#         spectrogram_data = spectrogram_data.view(-1, 2048)[0]
#         label = row['Emotion']
#         label = torch.tensor(self.label_to_int[label])
#         return text, spectrogram_data, label

# train_meld = MELDDataset("../MELD_Dataset", "train")
# # test_meld = MELDDataset("../MELD_Dataset", "test")
# # dev_meld = MELDDataset("../MELD_Dataset", "dev")

# # concat all 3 datasets into 1 dataset
# meld_dataset = train_meld # + test_meld + dev_meld

In [5]:
# len(meld_dataset)

### IEMOCAP

In [6]:
# The IEMOCAP release is looked up in the parent of the current working directory.
IEMOCAP_FULL_PATH = os.path.join(os.path.dirname(os.getcwd()),"IEMOCAP_full_release")
# The release folder contains a second folder of the same name holding the SessionN directories.
IEMOCAP_MAIN_FOLDER = os.path.join(IEMOCAP_FULL_PATH,"IEMOCAP_full_release")
# Paths below are relative to a single SessionN directory.
TRANSCRIPTION_FOLDER = os.path.join("dialog", "transcriptions")
AUDIO_FOLDER = os.path.join("sentences", "wav")
CATEGORICAL_LABELS_PATH = os.path.join("dialog", "EmoEvaluation", "Categorical")
# Folder (also in the parent of the working directory) for augmented audio clips.
AUGMENTED_AUDIO_FOLDER = os.path.join(os.path.dirname(os.getcwd()), "augmented")

In [7]:
def get_evaluator_filenames_with_video_file_prefix(input_list, prefix_value):
    """Return the names in ``input_list`` that start with ``prefix_value`` and end in ``.txt``.

    Args:
        input_list: Iterable of file names (strings) to filter.
        prefix_value: Literal prefix each returned name must start with; it is
            escaped, so regex metacharacters in it are matched verbatim.

    Returns:
        A list of the matching names, in their original order.
    """
    # Raw f-string: "\." must reach the regex engine as an escaped dot rather
    # than being treated as an (invalid) string escape sequence.
    regex_pattern = re.compile(rf'^{re.escape(prefix_value)}.*\.txt$')
    return [s for s in input_list if regex_pattern.match(s)]

def get_agreed_upon_evaluation(evaluations):
    """Pick the majority label from a list of evaluator labels.

    Returns the single most frequent label, or the string "AMBIGUOUS" when the
    two most frequent labels are tied. When only one distinct label is present
    it is returned directly.
    """
    ranked = Counter(evaluations).most_common(2)
    if len(ranked) == 1:
        return ranked[0][0]

    (winner, winner_votes), (_, runner_up_votes) = ranked
    return "AMBIGUOUS" if winner_votes == runner_up_votes else winner

def get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files):
    """Map each utterance id to the label agreed upon across all evaluation files.

    Every non-blank line of every file contributes the substrings found between
    a ':' and the following ';'; the first whitespace-separated token of the
    line is used as the utterance id. Labels for the same utterance are pooled
    across files and reduced with ``get_agreed_upon_evaluation``.
    """
    label_pattern = re.compile(r':[^;]+;')
    pooled_labels = {}

    for path in evaluation_files:
        # Labels from one file only; a repeated utterance id inside the same
        # file keeps just its last line.
        labels_in_this_file = {}
        with open(path,'r') as handle:
            for raw_row in handle.read().split("\n"):
                row = raw_row.strip()
                if not row:
                    continue
                utterance_id = row.split()[0]
                # Strip the leading ':' and trailing ';' from every hit.
                labels_in_this_file[utterance_id] = [hit[1:-1] for hit in label_pattern.findall(row)]

        # Append this file's labels to whatever earlier files contributed.
        for utterance_id, labels in labels_in_this_file.items():
            pooled_labels[utterance_id] = pooled_labels.get(utterance_id, []) + labels

    return {
        utterance_id: get_agreed_upon_evaluation(labels)
        for utterance_id, labels in pooled_labels.items()
    }

def is_label_a_closed_label(evaluation):
    """Return True when ``evaluation`` is one of the six closed-set emotion labels."""
    closed_labels = (
        "Frustration",
        "Excited",
        "Neutral state",
        "Anger",
        "Sadness",
        "Happiness",
    )
    return evaluation in closed_labels

def create_unprocessed_dataset(is_closed_label_set_flag):
    """Collect (text, audio path, label) triples from IEMOCAP sessions 1-5.

    An utterance is kept only when it has an agreed (non-ambiguous) categorical
    label, its wav file exists on disk, and the label's membership in the
    closed label set equals ``is_closed_label_set_flag``.

    Args:
        is_closed_label_set_flag: True to collect closed-set utterances,
            False to collect the remaining (open-set) ones.

    Returns:
        A tuple ``(dataset, sentences_list, audio_files)`` of three parallel
        lists: the triples, the transcripts, and the wav file paths.
    """
    dataset = []
    audio_files = []
    sentences_list = []
    for session_num in range(1, 6):
        session_folder = os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}")
        transcriptions_folder = os.path.join(session_folder, TRANSCRIPTION_FOLDER)
        categorical_labels_folder = os.path.join(session_folder, CATEGORICAL_LABELS_PATH)
        # The label folder is the same for every dialog of a session, so list it once.
        evaluation_folder_listing = os.listdir(categorical_labels_folder)

        for transcription_filename in os.listdir(transcriptions_folder):
            # Skip hidden files.
            if transcription_filename.startswith("."):
                continue

            filename_without_extension = transcription_filename.split(".")[0]
            evaluation_filenames = get_evaluator_filenames_with_video_file_prefix(evaluation_folder_listing, filename_without_extension)
            evaluation_files_full_paths = [os.path.join(categorical_labels_folder, f) for f in evaluation_filenames]
            evaluations_per_utterance = get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files_full_paths)

            transcription_file_full_path = os.path.join(transcriptions_folder, transcription_filename)
            with open(transcription_file_full_path, 'r') as f:
                lines = f.read().split("\n")

            # Iterate through utterances where every utterance looks like:
            # Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.
            for line in lines:
                # Stop at the first empty line (usually the one at EOF).
                line = line.strip()
                if not line:
                    break

                # A usable line must contain a space, a "]" and a "-";
                # str.index raises ValueError when one of them is missing.
                try:
                    space_idx = line.index(" ")
                    timestamp_end_bracket_idx = line.index("]")
                    line.index("-")
                except ValueError:
                    continue

                audio_filename = line[:space_idx]               # utterance name == wav file name
                text = line[timestamp_end_bracket_idx + 3:]     # transcript after "]: "
                evaluation = evaluations_per_utterance.get(audio_filename, "KEY_ERROR")
                if evaluation in ("KEY_ERROR", "AMBIGUOUS"):
                    continue

                # Only need Ses01F_impro01 from Ses01F_impro01_F000. Split the
                # utterance name itself, not the whole line, so an underscore
                # inside the transcript cannot corrupt the folder name.
                utterance_audios_per_video_folder = audio_filename.rpartition("_")[0]
                audio_file_full_path = os.path.join(session_folder, AUDIO_FOLDER, utterance_audios_per_video_folder, audio_filename + ".wav")

                if os.path.isfile(audio_file_full_path) and is_label_a_closed_label(evaluation) == is_closed_label_set_flag:
                    audio_files.append(audio_file_full_path)
                    sentences_list.append(text)
                    dataset.append((text, audio_file_full_path, evaluation))
    return dataset, sentences_list, audio_files

In [8]:
# Build the raw (text, audio path, label) lists twice: once for utterances whose
# label is outside the closed label set, once for those inside it.
openSetUnprocessedDataset, openSetSentencesList, openSetAudioFiles = create_unprocessed_dataset(is_closed_label_set_flag = False)
closedSetUnprocessedDataset, closedSetSentencesList, closedSetAudioFiles= create_unprocessed_dataset(is_closed_label_set_flag = True)

In [9]:
# Inspect one (text, audio path, label) sample from the closed-set data.
closedSetUnprocessedDataset[5]

('Who told you to get in this line?',
 'd:\\Projects\\open-set-emotion-recognition\\IEMOCAP_full_release\\IEMOCAP_full_release\\Session1\\sentences\\wav\\Ses01F_impro01\\Ses01F_impro01_M002.wav',
 'Frustration')

In [10]:
# Passed to IemocapDataset as save_augmented_files: when True the training
# dataset calls utils.augment_audio_and_save for every augmented clip; when
# False the augmented file paths are only computed, not written.
CREATE_AND_SAVE_AUGMENTED_FILES = False

In [11]:
class IemocapDataset(Dataset):
    """IEMOCAP utterances as (sentence embedding, audio embedding, label) triples.

    Sentence embeddings are computed up front with SBERT; audio embeddings are
    computed lazily with HuBERT (mean over frames) and memoised per file path.
    For ``split == "train"`` every utterance is replaced by
    ``NUM_AUGMENTED_VERSIONS`` augmented audio files that share its text.

    Args:
        unprocessed_dataset: List of ``(text, audio_path, label_name)`` tuples.
        sentences_list: Transcripts, in the same order as ``unprocessed_dataset``.
        audio_files: Audio paths, in the same order as ``unprocessed_dataset``.
        split: ``"train"`` enables augmentation; any other value disables it.
        is_closed_label_set_flag: True to map label names through
            ``labels_to_int``; False to label every sample ``OTHER_LABEL``.
        save_augmented_files: When True (train split only), augmented audio is
            written via ``utils.augment_audio_and_save``.
    """

    # Number of augmented audio versions used per training utterance.
    NUM_AUGMENTED_VERSIONS = 4

    def __init__(self, unprocessed_dataset, sentences_list, audio_files, split, is_closed_label_set_flag, save_augmented_files = False ) -> None:
        self.unprocessed_dataset = unprocessed_dataset
        # Must be stored before it is encoded below.
        self.sentences_list = sentences_list
        self.audio_files = audio_files
        self.split = split
        self.is_closed_label_set_flag = is_closed_label_set_flag
        self.save_augmented_files = save_augmented_files
        self.labels_to_int = {
            'Neutral state': 0,
            'Frustration': 1,
            'Anger': 2,
            'Sadness': 3,
            'Happiness': 4,
            'Excited': 5,
            'Surprise': 6,
            'Fear': 7,
            'Other': 8,
            'Disgust': 9
        }
        # Per-instance memo of audio path -> HuBERT embedding. A plain dict is
        # used instead of functools.lru_cache on the method, which would key on
        # ``self`` and keep every dataset instance alive.
        self._audio_feature_cache = {}

        config = HubertConfig.from_pretrained("facebook/hubert-large-ls960-ft")
        self.hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft", config=config)
        # Used for feature extraction only, so keep dropout disabled.
        self.hubert_model.eval()

        self.sbert = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
        self.sentence_embeddings = self.sbert.encode(self.sentences_list, convert_to_tensor=True, show_progress_bar=True, batch_size=128, device=device)

        if self.split == "train":
            # Repeat each embedding consecutively so row i*N+k lines up with
            # augmented version k of utterance i (the order produced by
            # create_augmented_dataset). torch.repeat_interleave works on the
            # tensor wherever it lives, unlike a NumPy round trip.
            self.sentence_embeddings = torch.repeat_interleave(self.sentence_embeddings, self.NUM_AUGMENTED_VERSIONS, dim=0)
            self.AUGMENTED_AUDIO_FOLDER = os.path.join(os.path.dirname(os.getcwd()), "augmented")
            self.unprocessed_dataset = self.create_augmented_dataset()

    def create_augmented_dataset(self):
        """Return the dataset with each utterance expanded into its augmented versions.

        The augmented files are named ``<name>_version_<k>.<ext>`` inside
        ``self.AUGMENTED_AUDIO_FOLDER``; they are only written when
        ``self.save_augmented_files`` is True.
        """
        augmented_dataset = []
        os.makedirs(self.AUGMENTED_AUDIO_FOLDER, exist_ok=True)
        for text, audio_file_full_path, evaluation in self.unprocessed_dataset:
            # splitext keeps the leading dot in ``extension`` and tolerates
            # additional dots inside the file name.
            audio_filename, extension = os.path.splitext(os.path.basename(audio_file_full_path))
            for augmented_version_num in range(self.NUM_AUGMENTED_VERSIONS):
                augmented_audio_path = os.path.join(self.AUGMENTED_AUDIO_FOLDER, f"{audio_filename}_version_{augmented_version_num}{extension}")
                if self.save_augmented_files:
                    utils.augment_audio_and_save(input_audio_path = audio_file_full_path, augmented_audio_path = augmented_audio_path)
                augmented_dataset.append((text, augmented_audio_path, evaluation))
        return augmented_dataset

    def cached_audio_features(self, audio_path):
        """Return the HuBERT embedding for ``audio_path``, computing it at most once."""
        features = self._audio_feature_cache.get(audio_path)
        if features is None:
            features = self._compute_audio_features(audio_path)
            self._audio_feature_cache[audio_path] = features
        return features

    def _compute_audio_features(self, audio_path):
        """Load a clip, convert it to 16 kHz mono and mean-pool HuBERT's last hidden state."""
        waveform, sample_rate = torchaudio.load(audio_path)

        # Resample if necessary (HuBERT uses 16 kHz sample rate)
        if sample_rate != 16000:
            waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

        # Ensure single channel audio (mono)
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Peak-normalise; skip all-zero clips, which would otherwise divide by
        # zero and fill the waveform with NaNs.
        peak = torch.max(torch.abs(waveform))
        if peak > 0:
            waveform = waveform / peak

        # Encode the waveform with HuBERT; no gradients are needed.
        with torch.no_grad():
            outputs = self.hubert_model(waveform)

        # Average the hidden states over the frame dimension and return a
        # single 1024-dimensional vector.
        hidden_states = outputs.last_hidden_state
        frame_averaged_hidden_states = torch.mean(hidden_states, dim=1)
        return frame_averaged_hidden_states.view(-1, 1024)[0]

    def __len__(self):
        return len(self.unprocessed_dataset)

    def __getitem__(self, idx):
        """Return ``(sentence_embedding, audio_embedding, integer_label)`` for sample ``idx``."""
        _, audio, label = self.unprocessed_dataset[idx]

        text = self.sentence_embeddings[idx]
        audio_features = self.cached_audio_features(audio)

        if self.is_closed_label_set_flag:
            label = self.labels_to_int[label]
        else:
            # Every open-set sample collapses to the single "other" class.
            label = OTHER_LABEL

        return text, audio_features, label

In [12]:
def get_stratified_split(unprocessed_dataset, sentences_list, audio_files, test_size):
    """Split the three parallel lists into two parts, stratified by label.

    The label of each instance is its last tuple element. Returns a 6-tuple:
    (dataset_a, dataset_b, sentences_a, sentences_b, audio_a, audio_b), where
    the "b" parts hold the ``test_size`` share. The split uses a fixed
    random_state of 42.
    """
    stratify_labels = [sample[-1] for sample in unprocessed_dataset]
    split_parts = train_test_split(
        unprocessed_dataset,
        sentences_list,
        audio_files,
        test_size=test_size,
        stratify=stratify_labels,
        random_state=42,
    )
    dataset_a, dataset_b, sentences_a, sentences_b, audio_a, audio_b = split_parts
    return dataset_a, dataset_b, sentences_a, sentences_b, audio_a, audio_b

In [13]:
# Closed set: hold out 20% of the data, then halve the held-out part into
# validation and test, giving an 80/10/10 train/val/test split.
closed_set_unprocessed_train_dataset, temp_unprocessed_dataset, closed_set_train_sentences_list, temp_sentences_list, closed_set_train_audio_files, temp_audio_files = get_stratified_split(closedSetUnprocessedDataset, closedSetSentencesList, closedSetAudioFiles, 0.2)
closed_set_unprocessed_val_dataset, closed_set_unprocessed_test_dataset, closed_set_val_sentences_list, closed_set_test_sentences_list, closed_set_val_audio_files, closed_set_test_audio_files = get_stratified_split(temp_unprocessed_dataset, temp_sentences_list, temp_audio_files, 0.5)
# Open set: split 50/50 into validation and test only; no open-set training part is created.
open_set_unprocessed_val_dataset, open_set_unprocessed_test_dataset, open_set_val_sentences_list, open_set_test_sentences_list, open_set_val_audio_files, open_set_test_audio_files = get_stratified_split(openSetUnprocessedDataset, openSetSentencesList, openSetAudioFiles, 0.5)
# entire_val_dataset = closed_set_unprocessed_val_dataset + open_set_unprocessed_val_dataset
# entire_test_dataset = closed_set_unprocessed_test_dataset + open_set_unprocessed_test_dataset

In [14]:
# Wrap each raw split in an IemocapDataset. Only the closed-set training split
# uses split="train", which is what turns on audio augmentation in the class.
closed_set_train_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_train_dataset,
                                          sentences_list = closed_set_train_sentences_list,
                                          audio_files = closed_set_train_audio_files,
                                          split = "train",
                                          is_closed_label_set_flag = True,
                                          save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

closed_set_val_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_val_dataset,
                                        sentences_list = closed_set_val_sentences_list,
                                        audio_files = closed_set_val_audio_files,
                                        split="val",
                                        is_closed_label_set_flag=True,
                                        save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

closed_set_test_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_test_dataset,
                                        sentences_list = closed_set_test_sentences_list,
                                        audio_files = closed_set_test_audio_files,
                                        split="test",
                                        is_closed_label_set_flag=True,
                                        save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

# Open-set splits: is_closed_label_set_flag=False makes every sample carry OTHER_LABEL.
open_set_val_dataset = IemocapDataset(unprocessed_dataset = open_set_unprocessed_val_dataset,
                                      sentences_list = open_set_val_sentences_list,
                                      audio_files = open_set_val_audio_files,
                                      split="val",
                                      is_closed_label_set_flag=False,
                                      save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

open_set_test_dataset = IemocapDataset(unprocessed_dataset = open_set_unprocessed_test_dataset,
                                      sentences_list = open_set_test_sentences_list,
                                      audio_files = open_set_test_audio_files,
                                      split="test",
                                      is_closed_label_set_flag=False,
                                      save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

Batches: 100%|██████████| 185/185 [01:32<00:00,  2.00it/s]
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Batches: 100%|██████████| 6/6 [00:05<00:00,  1.09it/s]
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Batches: 100%|██████████| 6/6 [00:05<00:00,  1.12it/s]
Some weights of HubertModel were not initialize

In [15]:
# Report the number of samples in each split, one per line, in this order:
# closed train, closed val, closed test, open val, open test.
for split_dataset in (
    closed_set_train_dataset,
    closed_set_val_dataset,
    closed_set_test_dataset,
    open_set_val_dataset,
    open_set_test_dataset,
):
    print(len(split_dataset))


23616
738
738
76
76


In [16]:
# Combine closed-set and open-set samples into one validation and one test
# dataset; `+` concatenates the two datasets (738 + 76 = 814 samples each in
# the recorded output).
entire_val_dataset = closed_set_val_dataset + open_set_val_dataset
entire_test_dataset = closed_set_test_dataset + open_set_test_dataset

In [17]:
# Sanity check: sizes of the combined validation and test datasets.
len(entire_val_dataset), len(entire_test_dataset)

(814, 814)

In [18]:
# Create data loaders.
# Number of samples per batch for every loader below.
batch_size = 64

# AUDIO + TEXT DATALOADERS (CONSISTS OF BOTH OPEN AND CLOSED LABELS)
# Training uses closed-set data only; the val/test loaders here mix closed- and open-set samples.
train_dataloader = DataLoader(closed_set_train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(entire_val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(entire_test_dataset, batch_size=batch_size, shuffle=True)

# AUDIO + TEXT DATALOADERS (ONLY CLOSED LABELS - USED FOR CHECKING MODEL PERFORMANCE AFTER KEEPING ASIDE THE OPEN SET CHALLENGE)
val_closed_set_dataloader = DataLoader(closed_set_val_dataset, batch_size=batch_size, shuffle=True)
test_closed_set_dataloader = DataLoader(closed_set_test_dataset, batch_size=batch_size, shuffle=True)



## Model Architectures Definition

In [19]:
# MODELS PATHS
# Checkpoints are stored in a "MODELS" folder in the parent of the working directory.
MODELS_DIR = os.path.join(os.path.dirname(os.getcwd()), "MODELS")
# exist_ok=True creates the folder if needed without a separate existence
# check, which could race with another process creating it in between.
os.makedirs(MODELS_DIR, exist_ok=True)

# One checkpoint file per model.
TEXT_MODEL_PATH = os.path.join(MODELS_DIR, "text_model_augmented.pt")
AUDIO_MODEL_PATH = os.path.join(MODELS_DIR, "audio_model_augmented.pt")
MULTIMODAL_MODEL_PATH = os.path.join(MODELS_DIR, "multimodal_model_augmented.pt")

In [20]:
class EarlyStopping:
    """Stop training when validation accuracy stops improving, checkpointing the best model.

    Call the instance once per epoch with the current validation accuracy and
    the model. A score counts as an improvement when it is at least
    ``best_score + delta``; each improvement saves the model's ``state_dict``
    to ``path`` and resets the counter. After ``patience`` consecutive calls
    without improvement, ``early_stop`` becomes True.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        delta: Minimum increase over the best score that counts as improvement.
        path: File the best model's ``state_dict`` is saved to.
    """

    def __init__(self, patience=5, delta=0, path='model_checkpoint.pt'):
        self.patience = patience
        self.delta = delta
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # Highest checkpointed accuracy so far; starts below any real score.
        # (np.inf rather than np.Inf: the capitalised alias is gone in NumPy 2.)
        self.val_acc_max = -np.inf

    def __call__(self, val_acc, model):
        if self.best_score is None:
            # First call: whatever we have is the best so far.
            self.best_score = val_acc
            self.save_checkpoint(val_acc, model)
        elif val_acc < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_acc
            self.save_checkpoint(val_acc, model)
            self.counter = 0

    def save_checkpoint(self, val_acc, model):
        """Save ``model.state_dict()`` to ``self.path`` and record ``val_acc`` as the best accuracy."""
        torch.save(model.state_dict(), self.path)
        self.val_acc_max = val_acc
        # Kept under its historical (misleading) name for any existing reader.
        self.val_loss_min = val_acc

### Text Unimodal

In [21]:
class TextEmotionModel(nn.Module):
    """MLP classifier over 768-dimensional text embeddings.

    Two hidden blocks (Linear -> BatchNorm1d -> Dropout(0.2) -> ReLU) of width
    1024 and 512, followed by a Linear layer producing ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        stack = []
        width_in = 768
        # Hidden blocks are created in order so the Sequential indices (and
        # therefore the state_dict keys) stay fc.0 ... fc.8.
        for width_out in (1024, 512):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.BatchNorm1d(width_out))
            stack.append(nn.Dropout(0.2))
            stack.append(nn.ReLU())
            width_in = width_out
        stack.append(nn.Linear(width_in, num_classes))
        self.fc = nn.Sequential(*stack)

    def forward(self, text):
        logits = self.fc(text)
        return logits


# Text-only classifier with 6 output classes, moved to the selected device.
text_model = TextEmotionModel(6)
text_model.to(device)

TextEmotionModel(
  (fc): Sequential(
    (0): Linear(in_features=768, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.2, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Dropout(p=0.2, inplace=False)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=6, bias=True)
  )
)

### Audio Unimodal

In [22]:
class AudioEmotionModel(nn.Module):
    """MLP classifier over 1024-dimensional audio embeddings.

    Two hidden blocks (Linear -> BatchNorm1d -> Dropout(0.2) -> ReLU) of width
    1024 and 512, followed by a Linear layer producing ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        stack = []
        width_in = 1024
        # Hidden blocks are created in order so the Sequential indices (and
        # therefore the state_dict keys) stay fc.0 ... fc.8.
        for width_out in (1024, 512):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.BatchNorm1d(width_out))
            stack.append(nn.Dropout(0.2))
            stack.append(nn.ReLU())
            width_in = width_out
        stack.append(nn.Linear(width_in, num_classes))
        self.fc = nn.Sequential(*stack)

    def forward(self, audio):
        logits = self.fc(audio)
        return logits


# Audio-only classifier with 6 output classes, moved to the selected device.
audio_model = AudioEmotionModel(6)
audio_model.to(device)

AudioEmotionModel(
  (fc): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.2, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Dropout(p=0.2, inplace=False)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=6, bias=True)
  )
)

### Multimodal

In [23]:
class AudioTextEmotionModel(nn.Module):
    """MLP classifier over concatenated audio (1024-d) and text (768-d) embeddings.

    Two hidden blocks (Linear -> BatchNorm1d -> Dropout(0.2) -> ReLU) of width
    1024 and 512, followed by a Linear layer producing ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        stack = []
        width_in = 1024 + 768
        # Hidden blocks are created in order so the Sequential indices (and
        # therefore the state_dict keys) stay fc.0 ... fc.8.
        for width_out in (1024, 512):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.BatchNorm1d(width_out))
            stack.append(nn.Dropout(0.2))
            stack.append(nn.ReLU())
            width_in = width_out
        stack.append(nn.Linear(width_in, num_classes))
        self.fc = nn.Sequential(*stack)

    def forward(self, text, audio):
        # Audio features come first in the fused vector, then text features.
        fused = torch.cat((audio, text), dim=1)
        logits = self.fc(fused)
        return logits


# Multimodal (audio + text) classifier with 6 output classes, moved to the selected device.
model = AudioTextEmotionModel(6)
model.to(device)


AudioTextEmotionModel(
  (fc): Sequential(
    (0): Linear(in_features=1792, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.2, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Dropout(p=0.2, inplace=False)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=6, bias=True)
  )
)

### Model Try 2

In [24]:
# class AudioTextEmotionModel(nn.Module):
#     def __init__(self, text_input_size, audio_input_size, hidden_size, output_size):
#         super(AudioTextEmotionModel, self).__init__()
#         self.text_input_size = text_input_size
#         self.audio_input_size = audio_input_size
#         self.hidden_size = hidden_size
#         self.output_size = output_size
        
#         # Attention mechanism parameters
#         self.attention_weights = nn.Parameter(torch.randn(text_input_size, audio_input_size))
        
#         # Feedforward network
#         self.fc = nn.Sequential(
#             nn.Linear(text_input_size + audio_input_size, hidden_size),
#             nn.ReLU(),
#             nn.Linear(hidden_size, output_size)
#         )
        
#     def forward(self, text_embeddings, audio_embeddings):
#         # Compute attention scores
#         attention_scores = torch.matmul(text_embeddings, self.attention_weights)
#         attention_weights = F.softmax(attention_scores, dim=1)
        
#         # Compute attended representation
#         attended_representation = audio_embeddings * attention_weights
        
#         # Concatenate text embeddings and attended representation
#         concatenated_input = torch.cat((text_embeddings, attended_representation), dim=1)
        
#         # Feedforward network for classification
#         output = self.fc(concatenated_input)
#         return output

# # Example usage:
# text_input_size = 768  # Size of text embeddings
# audio_input_size = 2048  # Size of audio embeddings
# hidden_size = 512  # Size of the hidden layer in the feedforward network
# output_size = 6  # Number of classes for classification

# model = AudioTextEmotionModel(text_input_size, audio_input_size, hidden_size, output_size)
# model.to(device)


## Loss Function and Optimizer (of the 2 unimodal models and the multimodal model)

In [25]:
# TEXT ONLY 
# Each model gets its own loss, RMSprop optimizer and plateau scheduler with
# identical hyper-parameters. mode='max' means the scheduler lowers the
# learning rate (by factor 0.1) when the monitored metric stops increasing —
# presumably validation accuracy; confirm against the training loops.
# TEXT ONLY 
loss_fn_text = nn.CrossEntropyLoss()
optimizer_text = torch.optim.RMSprop(text_model.parameters(), lr=1e-3, momentum=0.9)
scheduler_text = ReduceLROnPlateau(optimizer_text, mode='max', factor=0.1, patience=3, verbose=True)

# AUDIO ONLY
loss_fn_audio = nn.CrossEntropyLoss()
optimizer_audio = torch.optim.RMSprop(audio_model.parameters(), lr=1e-3, momentum=0.9)
scheduler_audio = ReduceLROnPlateau(optimizer_audio, mode='max', factor=0.1, patience=3, verbose=True)

# TEXT + AUDIO
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3, verbose=True)

# Upper bound on the number of training epochs.
num_epochs = 100

## Train (the 2 unimodal models and the multimodal model on only closed set labels to test the performance. We have kept aside the open set challenge for now)

In [26]:
def accuracy(dataloader, model):
    """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

    Each batch is a sequence of tensors whose last element is the label; all
    preceding elements are passed to the model as positional inputs. The model
    is switched to eval mode and left in it.

    Args:
        dataloader: DataLoader yielding ``(*inputs, label)`` batches.
        model: Module returning per-class scores of shape (batch, classes).

    Returns:
        Correct predictions divided by ``len(dataloader.dataset)``; 0.0 when
        the dataset is empty.
    """
    size = len(dataloader.dataset)
    if size == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    total_correct = 0
    model.eval()
    # Evaluation needs no gradients; skipping autograd saves memory and time.
    with torch.no_grad():
        for x_and_y in dataloader:
            x_and_y_device = [tensor.to(device) for tensor in x_and_y]
            pred = model(*x_and_y_device[:-1])
            label = x_and_y_device[-1]
            predicted = torch.argmax(pred, dim=1)
            total_correct += (predicted == label).sum().item()
    return total_correct / size

def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Every batch is a sequence of tensors: the last one is the label tensor,
    the others are forwarded positionally to ``model``. The model is put in
    train mode, and the running loss is printed every 100 batches.
    """
    dataset_size = len(dataloader.dataset)
    model.train()
    for step, batch_tensors in enumerate(dataloader):
        on_device = [tensor.to(device) for tensor in batch_tensors]
        features, target = on_device[:-1], on_device[-1]

        # Forward pass and loss
        batch_loss = loss_fn(model(*features), target)

        # Backward pass, parameter update, then clear gradients for the next step
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 100 != 0:
            continue

        # Progress report: samples seen so far is estimated from the current batch size.
        loss_value = batch_loss.item()
        seen = (step + 1) * len(on_device[0])
        print(f"loss: {loss_value:>7f}  [{seen:>5d}/{dataset_size:>5d}]")

### Text Unimodal Training

In [None]:
# early_stopping_text = EarlyStopping(patience=9, delta=0, path=TEXT_MODEL_PATH)
# for epoch in range(num_epochs):
#     print(f"Epoch {epoch+1}\n-------------------------------")
#     train(train_dataloader_text_only, text_model, loss_fn_text, optimizer_text)
#     train_accuracy = accuracy(train_dataloader_text_only,text_model)
#     val_accuracy = accuracy(val_dataloader_text_only,text_model)
#     print(f"Accuracy on Train Set => {train_accuracy} | Accuracy on Closed Validation Set => {val_accuracy}")
#     scheduler_text.step(val_accuracy)
#     early_stopping_text(val_accuracy,text_model)
#     if early_stopping_text.early_stop:
#         print("Early stopping")
#         break
    
# print(f"MODEL TRAINED...RE-LOADING BEST MODEL")
# text_model = TextEmotionModel(6)
# text_model.to(device)
# text_model.load_state_dict(torch.load(TEXT_MODEL_PATH))
# best_text_model_val_accuracy = accuracy(val_dataloader_text_only,text_model)
# print(f"Best Text Model's accuracy on Closed Validation Set => {best_text_model_val_accuracy}")

Epoch 1
-------------------------------
loss: 1.870664  [   64/ 5904]
Accuracy on Train Set => 0.5343834688346883 | Accuracy on Closed Validation Set => 0.46070460704607047
Epoch 2
-------------------------------
loss: 1.017479  [   64/ 5904]
Accuracy on Train Set => 0.6268631436314364 | Accuracy on Closed Validation Set => 0.4905149051490515
Epoch 3
-------------------------------
loss: 0.926881  [   64/ 5904]
Accuracy on Train Set => 0.6834349593495935 | Accuracy on Closed Validation Set => 0.4932249322493225
Epoch 4
-------------------------------
loss: 0.875097  [   64/ 5904]
Accuracy on Train Set => 0.7251016260162602 | Accuracy on Closed Validation Set => 0.5060975609756098
Epoch 5
-------------------------------
loss: 0.737137  [   64/ 5904]
Accuracy on Train Set => 0.7577913279132791 | Accuracy on Closed Validation Set => 0.5182926829268293
Epoch 6
-------------------------------
loss: 0.649189  [   64/ 5904]
Accuracy on Train Set => 0.7823509485094851 | Accuracy on Closed Vali

### Audio Unimodal Training

In [None]:
# early_stopping_audio = EarlyStopping(patience=9, delta=0, path=AUDIO_MODEL_PATH)
# for epoch in range(num_epochs):
#     print(f"Epoch {epoch+1}\n-------------------------------")
#     train(train_dataloader_audio_only, audio_model, loss_fn_audio, optimizer_audio)
#     train_accuracy = accuracy(train_dataloader_audio_only,audio_model)
#     val_accuracy = accuracy(val_dataloader_audio_only,audio_model)
#     print(f"Accuracy on Train Set => {train_accuracy} | Accuracy on Closed Validation Set => {val_accuracy}")
#     scheduler_audio.step(val_accuracy)
#     early_stopping_audio(val_accuracy,audio_model)
#     if early_stopping_audio.early_stop:
#         print("Early stopping")
#         break

# print(f"MODEL TRAINED...RE-LOADING BEST MODEL")
# audio_model = AudioEmotionModel(6)
# audio_model.to(device)
# audio_model.load_state_dict(torch.load(AUDIO_MODEL_PATH))
# best_audio_model_val_accuracy = accuracy(val_dataloader_audio_only,audio_model)
# print(f"Best Audio Model's accuracy on Closed Validation Set => {best_audio_model_val_accuracy}")

Epoch 1
-------------------------------
loss: 1.835142  [   64/ 5904]


Accuracy on Train Set => 0.3680555555555556 | Accuracy on Closed Validation Set => 0.35365853658536583
Epoch 2
-------------------------------
loss: 1.523795  [   64/ 5904]
Accuracy on Train Set => 0.49983062330623307 | Accuracy on Closed Validation Set => 0.4573170731707317
Epoch 3
-------------------------------
loss: 1.288141  [   64/ 5904]
Accuracy on Train Set => 0.35958672086720866 | Accuracy on Closed Validation Set => 0.33062330623306235
EarlyStopping counter: 1 out of 9
Epoch 4
-------------------------------
loss: 1.146449  [   64/ 5904]
Accuracy on Train Set => 0.4261517615176152 | Accuracy on Closed Validation Set => 0.38685636856368566
EarlyStopping counter: 2 out of 9
Epoch 5
-------------------------------
loss: 1.085060  [   64/ 5904]
Accuracy on Train Set => 0.4969512195121951 | Accuracy on Closed Validation Set => 0.4349593495934959
EarlyStopping counter: 3 out of 9
Epoch 6
-------------------------------
loss: 0.969926  [   64/ 5904]
Accuracy on Train Set => 0.462567

### Multimodal Training

In [27]:
# Train the multimodal model with LR-on-plateau scheduling and early stopping,
# both driven by accuracy on the closed-set validation loader.
early_stopping = EarlyStopping(patience=9, delta=0, path=MULTIMODAL_MODEL_PATH)
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    # accuracy() is a full pass over the loader, so evaluate each split once
    # per epoch and reuse the value for logging, the scheduler and early
    # stopping (the same pattern as the unimodal training loops).
    train_accuracy = accuracy(train_dataloader, model)
    val_accuracy = accuracy(val_closed_set_dataloader, model)
    print(f"Accuracy on Train Set => {train_accuracy} | Accuracy on Closed Validation Set => {val_accuracy}")
    scheduler.step(val_accuracy)
    early_stopping(val_accuracy, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

print(f"MODEL TRAINED...RE-LOADING BEST MODEL")
# Rebuild the network and restore the weights stored at MULTIMODAL_MODEL_PATH
# (presumably the best checkpoint written by EarlyStopping -- confirm there).
model = AudioTextEmotionModel(6)
model.to(device)
# map_location lets a checkpoint saved on another device type load here.
model.load_state_dict(torch.load(MULTIMODAL_MODEL_PATH, map_location=device))
best_multimodal_model_val_accuracy = accuracy(val_closed_set_dataloader, model)
print(f"Best Multimodal Model's accuracy on Closed Validation Set => {best_multimodal_model_val_accuracy}")

Epoch 1
-------------------------------


loss: 1.812544  [   64/23616]
loss: 1.355613  [ 6464/23616]
loss: 1.016742  [12864/23616]
loss: 0.800030  [19264/23616]


## Evaluation (of the multimodal model on both closed-set and open-set labels)

In [23]:
def set_dropout_to_train(eval_model):
    """Switch every ``nn.Dropout`` submodule of ``eval_model`` to train mode.

    All other submodules keep their current mode, so calling this after
    ``eval_model.eval()`` leaves only the dropout layers stochastic.
    """
    dropout_layers = (
        layer for layer in eval_model.modules() if isinstance(layer, nn.Dropout)
    )
    for layer in dropout_layers:
        layer.train()

def predict(label, model, text, spectrogram_data, n_simulations=100, threshold=1, other_label=OTHER_LABEL):
    """Predict classes from repeated forward passes, flagging uncertain samples.

    The model is run ``n_simulations`` times on the same inputs. The softmax
    probabilities are averaged over the runs to pick a class, and the
    per-class standard deviation over the runs (averaged across classes) is
    used as an uncertainty score. Samples whose score exceeds ``threshold``
    are relabelled as ``other_label``.

    If the model's forward pass is deterministic, all runs agree, the score
    is zero and no sample is relabelled for a non-negative threshold.

    Args:
        label: Unused; kept so existing callers keep working.
        model: Callable taking ``(text, spectrogram_data)`` and returning logits.
        text: First model input.
        spectrogram_data: Second model input.
        n_simulations: Number of forward passes. With a single pass the
            default (unbiased) standard deviation is NaN, so nothing is relabelled.
        threshold: Uncertainty score above which a sample becomes ``other_label``.
        other_label: Class index assigned to high-uncertainty samples.

    Returns:
        1-D CPU tensor of predicted class indices, one per sample.
    """
    # Inference only: without no_grad every one of the n_simulations passes
    # would build an autograd graph that is thrown away immediately.
    with torch.no_grad():
        predictions = torch.stack(
            [model(text, spectrogram_data).cpu() for _ in range(n_simulations)]
        )
    # Shape after stacking: (n_simulations, batch, classes); softmax over classes.
    predictions = F.softmax(predictions, dim=2)

    mean_predictions = torch.mean(predictions, dim=0)
    # Std across simulations per class, then averaged over classes -> one score per sample.
    std_predictions = torch.mean(torch.std(predictions, dim=0), dim=1)
    _, predicted_class = torch.max(mean_predictions, 1)
    high_uncertainty = std_predictions > threshold
    predicted_class[high_uncertainty] = other_label
    return predicted_class

def evaluate(model, dataloader, device, threshold=0.6, num_classes=7):
    """Evaluate ``model`` with uncertainty-thresholded prediction.

    The model is put in eval mode and its ``nn.Dropout`` layers are then
    switched back to train mode, so the repeated forward passes inside
    ``predict`` are stochastic. The accumulated confusion matrix is printed.

    Note on orientation: the predictions are passed as the first argument of
    ``confusion_matrix``, so rows index the *predicted* class and columns the
    *actual* class (the transpose of scikit-learn's usual layout). This is
    kept so new output stays comparable with previously printed matrices.

    Args:
        model: Network taking ``(text, spectrogram_data)``.
        dataloader: Yields ``(text, spectrogram_data, label)`` batches and
            exposes ``dataset`` with a length.
        device: Device the batch tensors are moved to.
        threshold: Uncertainty threshold forwarded to ``predict``.
        num_classes: Size of the confusion matrix; class ids are assumed to
            be ``0 .. num_classes - 1``.

    Returns:
        Fraction of samples whose prediction equals the label.
    """
    model.eval()
    set_dropout_to_train(model)

    size = len(dataloader.dataset)
    total_correct = 0
    # A fixed label set keeps every per-batch matrix at
    # (num_classes, num_classes) with class i always in row/column i. Without
    # it, a class missing from a batch shrinks the matrix, and zero-padding at
    # the end shifts later classes into the wrong cells.
    class_ids = list(range(num_classes))
    total_confusion_matrix = torch.zeros((num_classes, num_classes), dtype=torch.float64)
    with torch.no_grad():
        for text, spectrogram_data, label in dataloader:
            text, spectrogram_data, label = text.to(device), spectrogram_data.to(device), label.to(device)

            predicted = predict(label, model, text, spectrogram_data, threshold=threshold).cpu()
            actual = label.cpu()
            total_correct += (predicted == actual).sum().item()

            cm = confusion_matrix(predicted.numpy(), actual.numpy(), labels=class_ids)
            total_confusion_matrix += torch.from_numpy(cm)

    print(total_confusion_matrix)
    return total_correct / size

In [24]:
# Run the uncertainty-thresholded evaluation on val_dataloader; the printed
# tensor is the accumulated confusion matrix and the cell value is the accuracy.
evaluate(model,val_dataloader,device,threshold=0.18)

tensor([[170.,  18.,   7.,  20.,   9.,  11.,  42.],
        [ 20., 110.,  11.,   8.,   3.,   6.,   6.],
        [  7.,  11.,  80.,   0.,   2.,   4.,  13.],
        [  8.,   6.,   0.,  72.,   2.,   1.,   4.],
        [  6.,   3.,   1.,   0.,  41.,   5.,   6.],
        [  4.,   9.,   4.,   3.,   7.,  67.,   5.],
        [  0.,   2.,   0.,   0.,   0.,   0.,   0.]], dtype=torch.float64)


0.6633906633906634

In [25]:
# Same evaluation on test_dataloader, using the same uncertainty threshold
# (0.18) as the validation run.
evaluate(model,test_dataloader,device,threshold=0.18)

tensor([[163.,  16.,   6.,  16.,  10.,   9.,  39.],
        [ 21., 115.,  17.,   2.,   3.,   9.,   9.],
        [  4.,  15.,  73.,   2.,   1.,   3.,  14.],
        [ 12.,   3.,   0.,  81.,   2.,   0.,   3.],
        [  3.,   5.,   3.,   1.,  41.,   7.,   4.],
        [ 12.,   5.,   4.,   0.,   7.,  65.,   7.],
        [  1.,   0.,   0.,   0.,   1.,   0.,   0.]], dtype=torch.float64)


0.6609336609336609

In [34]:
# Learn the PyTorch basics with some simple models and datasets:
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/nnqs_tutorial.html