# Open Set Emotion Recognition

## Library Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")
import sys
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from collections import Counter
import torch.nn as nn
import os
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
tqdm.pandas()
from tqdm import tqdm
import re
from collections import Counter
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchaudio
from transformers import HubertModel, HubertConfig
from sentence_transformers import SentenceTransformer
from functools import lru_cache
import utils

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
torch.manual_seed(42)
print(f"Using device: {device}")

Using device: cuda


In [2]:
ALL_LABELS = ["Neutral state","Frustration","Anger","Sadness","Happiness","Excited","Surprise","Fear","Other","Disgust"]
LABELS_TO_INT = {label: i for i, label in enumerate(ALL_LABELS)}
CLOSED_LABELS = ALL_LABELS[:6]
OTHER_LABEL = len(CLOSED_LABELS)
CREATE_AND_SAVE_AUGMENTED_FILES = False

#### Audio Features Using Hubert

In [3]:
SAVED_DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), "SAVED_DATA")
if not os.path.exists(SAVED_DATA_DIR):
    os.makedirs(SAVED_DATA_DIR)

SAVED_HUBERT_EMBEDDINGS = os.path.join(SAVED_DATA_DIR, "hubert_embeddings.pickle")

if os.path.exists(SAVED_HUBERT_EMBEDDINGS):
    hubert_embeddings = utils.load_data_using_pickle(SAVED_HUBERT_EMBEDDINGS)
else:
    hubert_embeddings = {} # audio path to embedding vector dict

HUBERT_MODEL_NAME = "facebook/hubert-large-ls960-ft"
# HUBERT_MODEL_NAME = "facebook/hubert-base-ls960"
HUBERT_MODEL_DIMENSION = 1024 # or 768
config = HubertConfig.from_pretrained(HUBERT_MODEL_NAME)
hubert_model = HubertModel.from_pretrained(HUBERT_MODEL_NAME, config=config)
sbert = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)

def save_hubert_embeddings():
    utils.save_data_using_pickle(hubert_embeddings, SAVED_HUBERT_EMBEDDINGS)

def get_hidden_states_from_audio_and_save(audio_path):
    if os.path.basename(audio_path) in hubert_embeddings:
        return hubert_embeddings[os.path.basename(audio_path)]
    
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample if necessary (HuBERT uses 16 kHz sample rate)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
        sample_rate = 16000

    # Ensure single channel audio (mono)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Normalize audio
    waveform = waveform / torch.max(torch.abs(waveform))

    # Step 3: Encode audio waveforms using HuBERT model
    # Pass the waveform through the model
    with torch.no_grad():
        outputs = hubert_model(waveform)

    # Get the hidden states
    hidden_states = outputs.last_hidden_state
    hubert_embeddings[os.path.basename(audio_path)] = hidden_states
    return hidden_states

def get_audio_features(audio_paths):
    audio_embeddings_list = []
    for audio_path in tqdm(audio_paths):
        hidden_states = get_hidden_states_from_audio_and_save(audio_path)
        frame_averaged_hidden_states = torch.mean(hidden_states, dim=1)
        frame_averaged_hidden_states = frame_averaged_hidden_states.view(-1,HUBERT_MODEL_DIMENSION)[0]
        audio_embeddings_list.append(frame_averaged_hidden_states)
    # save_hubert_embeddings()
    return audio_embeddings_list

Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Dataset Creation

### MELD

In [4]:
# class MELDDataset(Dataset):
#     def __init__(self, meld_dir, split, transform=None):
#         train_df = pd.read_csv("../MELD_Dataset/train_sent_emo.csv")
#         labels = train_df['Emotion'].unique().tolist()
#         self.label_to_int = {label: i for i, label in enumerate(labels)}

#         self.meld_dir = meld_dir
#         self.transform = transform
#         self.split = split
#         self.img_path = os.path.join(self.meld_dir, 'mel_spectrograms', f'{self.split}_img')
#         self.img_path = os.path.join(self.meld_dir, 'log_spectrogram', f'{self.split}_audio')

#         # load and create sentence embeddings
#         self.dialogues = self.load_dialogues()
#         sentences = self.dialogues['Utterance'].tolist()
#         sentences = [text.replace("\x92", "'") for text in sentences]
#         self.sentence_embeddings = sbert.encode(sentences, convert_to_tensor=True, show_progress_bar=True, batch_size=128, device=device)

#         self.spectrograms = self.load_spectrograms()
#         self.resnet_model = models.resnet50(pretrained=True)
#         self.feature_extractor = torch.nn.Sequential(*list(self.resnet_model.children())[:-1]).to(device)
#         self.feature_extractor.eval()

#     def load_dialogues(self):
#         dialogue_file = os.path.join(self.meld_dir, f'{self.split}_sent_emo.csv')
#         dialogues = pd.read_csv(dialogue_file)
#         return dialogues

#     def load_spectrograms(self):
#         images = os.listdir(self.img_path)
#         return images

#     def __len__(self):
#         assert(len(self.sentence_embeddings) == len(self.spectrograms))
#         return len(self.dialogues)

#     def preprocess_img(self, img):
#         preprocessor = transforms.Compose([
#             transforms.Resize(256),
#             transforms.CenterCrop(224),
#             transforms.ToTensor(),
#         ])
#         img_t =  preprocessor(img).to(device)
#         return img_t

#     def extract_audio_features_from_spectrogram(self, img):
#         # Pass the input through the model
#         with torch.no_grad():
#             output = self.feature_extractor(img)
#         return output

#     def __getitem__(self, idx):
#         row = self.dialogues.iloc[idx]
#         text = self.sentence_embeddings[idx]
#         spectrogram_data = Image.open(os.path.join(self.img_path, f'dia{row["Dialogue_ID"]}_utt{row["Utterance_ID"]}.png'))
#         spectrogram_data = self.preprocess_img(spectrogram_data)
#         spectrogram_data = spectrogram_data[0:3, :, :]
#         spectrogram_data = spectrogram_data.unsqueeze(0)
#         spectrogram_data = self.extract_audio_features_from_spectrogram(spectrogram_data)
#         spectrogram_data = spectrogram_data.view(-1, 2048)[0]
#         label = row['Emotion']
#         label = torch.tensor(self.label_to_int[label])
#         return text, spectrogram_data, label

# train_meld = MELDDataset("../MELD_Dataset", "train")
# # test_meld = MELDDataset("../MELD_Dataset", "test")
# # dev_meld = MELDDataset("../MELD_Dataset", "dev")

# # concat all 3 datasets into 1 dataset
# meld_dataset = train_meld # + test_meld + dev_meld

In [5]:
# len(meld_dataset)

#### IEMOCAP

In [4]:
IEMOCAP_FULL_PATH = os.path.join(os.path.dirname(os.getcwd()),"IEMOCAP_full_release")
IEMOCAP_MAIN_FOLDER = os.path.join(IEMOCAP_FULL_PATH,"IEMOCAP_full_release")
TRANSCRIPTION_FOLDER = os.path.join("dialog", "transcriptions")
AUDIO_FOLDER = os.path.join("sentences", "wav")
CATEGORICAL_LABELS_PATH = os.path.join("dialog", "EmoEvaluation", "Categorical")
AUGMENTED_AUDIO_FOLDER = os.path.join(os.path.dirname(os.getcwd()), "augmented")

In [5]:
def get_evaluator_filenames_with_video_file_prefix(input_list, prefix_value):
    regex_pattern = re.compile(f'^{re.escape(prefix_value)}.*\.txt$')
    matching_strings = [s for s in input_list if regex_pattern.match(s)]
    return matching_strings

def get_agreed_upon_evaluation(evaluations):
    top_two_frequent_evaluations = Counter(evaluations).most_common(2)
    if(len(top_two_frequent_evaluations)==1):
        return top_two_frequent_evaluations[0][0]
    most_frequent_evaluation_and_its_frequency, second_most_frequent_evaluation_and_its_frequency = top_two_frequent_evaluations
    highest_frequency = most_frequent_evaluation_and_its_frequency[1]
    second_highest_frequency = second_most_frequent_evaluation_and_its_frequency[1]
    if(highest_frequency==second_highest_frequency):
        return "AMBIGUOUS"
    else:
        return most_frequent_evaluation_and_its_frequency[0]

def get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files):
    utterance_to_all_evaluations = {}

    for evaluation_file in evaluation_files:
        utterance_to_evaluationList = {}
        with open(evaluation_file,'r') as f:
            contents = f.read()
            utterance_evaluations = contents.split("\n")
            for evaluation in utterance_evaluations:
                evaluation = evaluation.strip()
                if(len(evaluation)==0):
                    continue
                matches = re.findall(r':[^;]+;', evaluation)
                matches = [match[1:-1] for match in matches]
                utterance_to_evaluationList[evaluation.split()[0]] = matches
        
        # Combine lists from dict1
        for key, value_list in utterance_to_evaluationList.items():
            utterance_to_all_evaluations[key] = utterance_to_all_evaluations.get(key, []) + value_list

    utterance_to_evaluationsCounter = {k: get_agreed_upon_evaluation(v) for k,v in utterance_to_all_evaluations.items()}
    return utterance_to_evaluationsCounter

def is_label_a_closed_label(evaluation):
    return evaluation in CLOSED_LABELS

def create_unprocessed_dataset(is_closed_label_set_flag):
    dataset = []
    audio_files = []
    sentences_list = []
    for session_num in range(1,6):
        for transcription_filename in os.listdir(os.path.join(IEMOCAP_MAIN_FOLDER,f"Session{session_num}", TRANSCRIPTION_FOLDER)):
            if(transcription_filename[0]!="."): 

                filename_without_extension = transcription_filename.split(".")[0]
                
                categorical_labels_folder_full_path = os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}", CATEGORICAL_LABELS_PATH)
                evaluation_filenames = get_evaluator_filenames_with_video_file_prefix(os.listdir(categorical_labels_folder_full_path), filename_without_extension)
                evaluation_files_full_paths_for_this_file = [os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}", CATEGORICAL_LABELS_PATH, f) for f in evaluation_filenames]
                evaluations_per_utterance = get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files_full_paths_for_this_file)
                
                transcription_file_full_path = os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}", TRANSCRIPTION_FOLDER, transcription_filename) 
                with open(transcription_file_full_path,'r') as f:
                    contents = f.read()
                    lines = contents.split("\n")

                    # Iterate through utterances where every utterance looks like:
                    # Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.
                    for line in lines:

                        # Remove extra spaces and check if the line is not an empty link (usually at EOF)
                        line = line.strip()
                        if(len(line)==0):
                            break

                        # Remove idx of first space, ], -
                        try:
                            space_idx = line.index(" ")
                            timestampEndBracket_idx = line.index("]")
                            timestampHyphen_idx = line.index("-")
                        except:
                            continue
                        else:
                            audio_filename = line[:space_idx]        # output audio file name = utterance name
                            text = line[timestampEndBracket_idx+3:]         # the transcription of the utterance
                            evaluation = evaluations_per_utterance.get(audio_filename,"KEY_ERROR")

                            utterance_audios_per_video_folder = audio_filename[:line.rindex('_')]       # Only need Ses01F_impro01 from Ses01F_impro01_F000
                            audio_file_full_path = os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}", AUDIO_FOLDER, utterance_audios_per_video_folder, audio_filename+".wav")         # name of the video file

                            if(evaluation!="KEY_ERROR" and evaluation!= "AMBIGUOUS" and os.path.isfile(audio_file_full_path)==True and is_label_a_closed_label(evaluation)==is_closed_label_set_flag):
                            # if(evaluation!="KEY_ERROR" and os.path.isfile(audio_file_full_path)==True):    
                                audio_files.append(audio_file_full_path)
                                sentences_list.append(text)
                                dataset.append((text,audio_file_full_path,evaluation))

    dataset = dataset[:1000]
    audio_files = audio_files[:1000]
    sentences_list = sentences_list[:1000]
    return dataset, sentences_list, audio_files

In [6]:
openSetUnprocessedDataset, openSetSentencesList, openSetAudioFiles = create_unprocessed_dataset(is_closed_label_set_flag = False)
closedSetUnprocessedDataset, closedSetSentencesList, closedSetAudioFiles= create_unprocessed_dataset(is_closed_label_set_flag = True)

In [7]:
closedSetUnprocessedDataset[5]

('Who told you to get in this line?',
 'd:\\Projects\\open-set-emotion-recognition\\IEMOCAP_full_release\\IEMOCAP_full_release\\Session1\\sentences\\wav\\Ses01F_impro01\\Ses01F_impro01_M002.wav',
 'Frustration')

In [8]:
class IemocapDataset(Dataset):
    def __init__(self, unprocessed_dataset, sentences_list, audio_files, split, is_closed_label_set_flag, save_augmented_files ) -> None:
        self.unprocessed_dataset = unprocessed_dataset
        self.is_closed_label_set_flag = is_closed_label_set_flag
        self.split = split
        self.sentence_embeddings = sbert.encode(sentences_list, convert_to_tensor=True, show_progress_bar=True, batch_size=128, device=device)

        self.save_augmented_files = save_augmented_files
        self.augment_times = 4
        # we augment just for train, so we are repeating the sentences as only audios are augmented
        # if self.split == "train":
        #     self.sentence_embeddings = np.repeat(self.sentence_embeddings, repeats=self.augment_times, axis=0)
        #     self.unprocessed_dataset = self.create_augmented_dataset()

        all_audio_files = [instance[1] for instance in self.unprocessed_dataset]
        assert(all([os.path.exists(audio_file) for audio_file in all_audio_files]), "Some audio files do not exist")
        self.audio_embeddings = get_audio_features(all_audio_files)


    def create_augmented_dataset(self):
        augmented_dataset = []
        os.makedirs(AUGMENTED_AUDIO_FOLDER, exist_ok=True)
        for text, audio_file_full_path, evaluation in self.unprocessed_dataset:
            _, audio_filename_and_extension = os.path.split(audio_file_full_path)
            audio_filename, extension = audio_filename_and_extension.split(".")
            for augmented_version_num in range(self.augment_times):
                augmented_audio_path = os.path.join(AUGMENTED_AUDIO_FOLDER,f"{audio_filename}_version_{augmented_version_num}.{extension}")
                if self.save_augmented_files:
                    utils.augment_audio_and_save(input_audio_path = audio_file_full_path, augmented_audio_path = augmented_audio_path)
                else:
                    assert(os.path.exists(augmented_audio_path), f"Augmented Audio Path {augmented_audio_path} not exist")
                augmented_dataset.append((text, augmented_audio_path, evaluation))
        return augmented_dataset
    
    def __len__(self):
        return len(self.unprocessed_dataset)
    
    def __getitem__(self, idx):
        _, audio, label = self.unprocessed_dataset[idx]
        # device = "cpu"
        text = self.sentence_embeddings[idx].to(device)
        audio = self.audio_embeddings[idx].to(device)
        combined = torch.cat([audio, text])
        if self.is_closed_label_set_flag==False:
            label = OTHER_LABEL
        else:
            label = LABELS_TO_INT[label]

        label_vec = np.zeros(len(CLOSED_LABELS)+1)
        label_vec[label] = 1
        label_vec = torch.tensor(label_vec).to(device)
        
        return combined, label_vec

In [9]:
def get_stratified_split(unprocessed_dataset, sentences_list, audio_files, test_size):
    # instance[-1] is label
    labels = [instance[-1] for instance in unprocessed_dataset]
    unprocessed_train_dataset, unprocessed_test_dataset, train_sentences_list, test_sentences_list, train_audio_files, test_audio_files = train_test_split(unprocessed_dataset, sentences_list, audio_files, test_size=test_size, stratify=labels, random_state=42)
    return unprocessed_train_dataset, unprocessed_test_dataset, train_sentences_list, test_sentences_list, train_audio_files, test_audio_files

In [10]:
closed_set_unprocessed_train_dataset, temp_unprocessed_dataset, closed_set_train_sentences_list, temp_sentences_list, closed_set_train_audio_files, temp_audio_files = get_stratified_split(closedSetUnprocessedDataset, closedSetSentencesList, closedSetAudioFiles, 0.2)
closed_set_unprocessed_val_dataset, closed_set_unprocessed_test_dataset, closed_set_val_sentences_list, closed_set_test_sentences_list, closed_set_val_audio_files, closed_set_test_audio_files = get_stratified_split(temp_unprocessed_dataset, temp_sentences_list, temp_audio_files, 0.5)
open_set_unprocessed_val_dataset, open_set_unprocessed_test_dataset, open_set_val_sentences_list, open_set_test_sentences_list, open_set_val_audio_files, open_set_test_audio_files = get_stratified_split(openSetUnprocessedDataset, openSetSentencesList, openSetAudioFiles, 0.5)
# entire_val_dataset = closed_set_unprocessed_val_dataset + open_set_unprocessed_val_dataset
# entire_test_dataset = closed_set_unprocessed_test_dataset + open_set_unprocessed_test_dataset

In [11]:
closed_set_train_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_train_dataset,
                                          sentences_list = closed_set_train_sentences_list,
                                          audio_files = closed_set_train_audio_files,
                                          split = "train",
                                          is_closed_label_set_flag = True,
                                          save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

closed_set_val_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_val_dataset,
                                        sentences_list = closed_set_val_sentences_list,
                                        audio_files = closed_set_val_audio_files,
                                        split="val",
                                        is_closed_label_set_flag=True,
                                        save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

closed_set_test_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_test_dataset,
                                        sentences_list = closed_set_test_sentences_list,
                                        audio_files = closed_set_test_audio_files,
                                        split="test",
                                        is_closed_label_set_flag=True,
                                        save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

open_set_val_dataset = IemocapDataset(unprocessed_dataset = open_set_unprocessed_val_dataset,
                                      sentences_list = open_set_val_sentences_list,
                                      audio_files = open_set_val_audio_files,
                                      split="val",
                                      is_closed_label_set_flag=False,
                                      save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

open_set_test_dataset = IemocapDataset(unprocessed_dataset = open_set_unprocessed_test_dataset,
                                      sentences_list = open_set_test_sentences_list,
                                      audio_files = open_set_test_audio_files,
                                      split="test",
                                      is_closed_label_set_flag=False,
                                      save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

Batches: 100%|██████████| 7/7 [00:16<00:00,  2.33s/it]
100%|██████████| 800/800 [01:25<00:00,  9.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.10it/s]
100%|██████████| 100/100 [00:14<00:00,  6.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.85it/s]
100%|██████████| 100/100 [00:17<00:00,  5.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.87it/s]
100%|██████████| 76/76 [00:57<00:00,  1.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.75it/s]
100%|██████████| 76/76 [00:56<00:00,  1.35it/s]


In [12]:
print(len(closed_set_train_dataset))
print(len(closed_set_val_dataset))
print(len(closed_set_test_dataset))
print(len(open_set_val_dataset))
print(len(open_set_test_dataset))


800
100
100
76
76


In [13]:
entire_val_dataset = closed_set_val_dataset + open_set_val_dataset
entire_test_dataset = closed_set_test_dataset + open_set_test_dataset

In [14]:
len(entire_val_dataset), len(entire_test_dataset)

(176, 176)

In [15]:
# Create data loaders.
batch_size = 64

# AUDIO + TEXT DATALOADERS (CONSISTS OF BOTH OPEN AND CLOSED LABELS)
train_dataloader = DataLoader(closed_set_train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(entire_val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(entire_test_dataset, batch_size=batch_size, shuffle=True)

# AUDIO + TEXT DATALOADERS (ONLY CLOSED LABELS - USED FOR CHECKING MODEL PERFORMANCE AFTER KEEPING ASIDE THE OPEN SET CHALLENGE)
val_closed_set_dataloader = DataLoader(closed_set_val_dataset, batch_size=batch_size, shuffle=True)
test_closed_set_dataloader = DataLoader(closed_set_test_dataset, batch_size=batch_size, shuffle=True)



## Model Architectures Definition

In [16]:
# MODELS PATHS
MODELS_DIR = os.path.join(os.path.dirname(os.getcwd()), "MODELS")
if not os.path.exists(MODELS_DIR):
    os.makedirs(MODELS_DIR)

TEXT_MODEL_PATH = os.path.join(MODELS_DIR, "text_model_augmented.pt")
AUDIO_MODEL_PATH = os.path.join(MODELS_DIR, "audio_model_augmented.pt")
MULTIMODAL_MODEL_PATH = os.path.join(MODELS_DIR, "multimodal_model_augmented.pt")

In [17]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0, path='model_checkpoint.pt'):
        self.patience = patience
        self.delta = delta
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_acc_max = np.Inf

    def __call__(self, val_acc, model):

        if self.best_score is None:
            self.best_score = val_acc
            self.save_checkpoint(val_acc, model)
        elif val_acc < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_acc
            self.save_checkpoint(val_acc, model)
            self.counter = 0

    def save_checkpoint(self, val_acc, model):
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_acc

### Text Unimodal

In [18]:
class TextEmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(TextEmotionModel, self).__init__()
        # sequential model with 2 layers, followed by dropout and relu layers and output layer
        self.fc = nn.Sequential(
            nn.Linear(768, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )


    def forward(self, text):
        return self.fc(text)


text_model = TextEmotionModel(6)
text_model.to(device)

TextEmotionModel(
  (fc): Sequential(
    (0): Linear(in_features=768, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.2, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Dropout(p=0.2, inplace=False)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=6, bias=True)
  )
)

### Audio Unimodal

In [19]:
class AudioEmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(AudioEmotionModel, self).__init__()
        # sequential model with 2 layers, followed by dropout and relu layers and output layer
        self.fc = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )


    def forward(self, audio):
        return self.fc(audio)


audio_model = AudioEmotionModel(6)
audio_model.to(device)

AudioEmotionModel(
  (fc): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.2, inplace=False)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Dropout(p=0.2, inplace=False)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=6, bias=True)
  )
)

### Multimodal

In [20]:
class AudioTextEmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(AudioTextEmotionModel, self).__init__()
        # sequential model with 2 layers, followed by dropout and relu layers and output layer
        self.fc = nn.Sequential(
            nn.Linear(HUBERT_MODEL_DIMENSION + 768, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )


    def forward(self, text_audio_combined):
        return self.fc(text_audio_combined)

## Loss Function and Optimizer (of the 2 unimodal models and the multimodal model)

In [21]:
# TEXT ONLY 
loss_fn_text = nn.CrossEntropyLoss()
optimizer_text = torch.optim.RMSprop(text_model.parameters(), lr=1e-3, momentum=0.9)
scheduler_text = ReduceLROnPlateau(optimizer_text, mode='max', factor=0.1, patience=3, verbose=True)

# AUDIO ONLY
loss_fn_audio = nn.CrossEntropyLoss()
optimizer_audio = torch.optim.RMSprop(audio_model.parameters(), lr=1e-3, momentum=0.9)
scheduler_audio = ReduceLROnPlateau(optimizer_audio, mode='max', factor=0.1, patience=3, verbose=True)

num_epochs = 100

## Train (the 2 unimodal models and the multimodal model on only closed set labels to test the performance. We have kept aside the open set challenge for now)

In [22]:
def accuracy(dataloader, model):
    size = len(dataloader.dataset)
    total_correct = 0
    model.eval()
    for batch, x_and_y in enumerate(dataloader):
        x_and_y_device = [x_and_y[i].to(device) for i in range(len(x_and_y))]

        # Compute prediction error
        pred = model(*x_and_y_device[:-1])
        predicted = torch.argmax(pred,dim=1).cpu()
        label = x_and_y_device[-1]
        actual = label.cpu()
        correct = predicted == actual
        total_correct += correct.sum().item()
    return total_correct/size

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, x_and_y in enumerate(dataloader):
        x_and_y_device = [x_and_y[i].to(device) for i in range(len(x_and_y))]

        # Compute prediction error
        pred = model(*x_and_y_device[:-1])
        label = x_and_y_device[-1]
        loss = loss_fn(pred, label)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 10 == 0:
            loss, current = loss.item(), (batch + 1) * len(x_and_y_device[0])
            
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

### Text Unimodal Training

In [23]:
# early_stopping_text = EarlyStopping(patience=9, delta=0, path=TEXT_MODEL_PATH)
# for epoch in range(num_epochs):
#     print(f"Epoch {epoch+1}\n-------------------------------")
#     train(train_dataloader_text_only, text_model, loss_fn_text, optimizer_text)
#     train_accuracy = accuracy(train_dataloader_text_only,text_model)
#     val_accuracy = accuracy(val_dataloader_text_only,text_model)
#     print(f"Accuracy on Train Set => {train_accuracy} | Accuracy on Closed Validation Set => {val_accuracy}")
#     scheduler_text.step(val_accuracy)
#     early_stopping_text(val_accuracy,text_model)
#     if early_stopping_text.early_stop:
#         print("Early stopping")
#         break
    
# print(f"MODEL TRAINED...RE-LOADING BEST MODEL")
# text_model = TextEmotionModel(6)
# text_model.to(device)
# text_model.load_state_dict(torch.load(TEXT_MODEL_PATH))
# best_text_model_val_accuracy = accuracy(val_dataloader_text_only,text_model)
# print(f"Best Text Model's accuracy on Closed Validation Set => {best_text_model_val_accuracy}")

### Audio Unimodal Training

In [24]:
# early_stopping_audio = EarlyStopping(patience=9, delta=0, path=AUDIO_MODEL_PATH)
# for epoch in range(num_epochs):
#     print(f"Epoch {epoch+1}\n-------------------------------")
#     train(train_dataloader_audio_only, audio_model, loss_fn_audio, optimizer_audio)
#     train_accuracy = accuracy(train_dataloader_audio_only,audio_model)
#     val_accuracy = accuracy(val_dataloader_audio_only,audio_model)
#     print(f"Accuracy on Train Set => {train_accuracy} | Accuracy on Closed Validation Set => {val_accuracy}")
#     scheduler_audio.step(val_accuracy)
#     early_stopping_audio(val_accuracy,audio_model)
#     if early_stopping_audio.early_stop:
#         print("Early stopping")
#         break

# print(f"MODEL TRAINED...RE-LOADING BEST MODEL")
# audio_model = AudioEmotionModel(6)
# audio_model.to(device)
# audio_model.load_state_dict(torch.load(AUDIO_MODEL_PATH))
# best_audio_model_val_accuracy = accuracy(val_dataloader_audio_only,audio_model)
# print(f"Best Audio Model's accuracy on Closed Validation Set => {best_audio_model_val_accuracy}")

In [25]:
import torchosr
from torchosr.models import Openmax, TSoftmax
import torch

lower_stack=nn.Sequential(
            nn.Linear(1792, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, 64)
        )
# Initialize method
epsilon = torchosr.utils.base.get_softmax_epsilon(6)
print (epsilon)
model = TSoftmax(lower_stack=lower_stack, n_known=6, epsilon=epsilon)
epsilon = torchosr.utils.base.get_openmax_epsilon(6)
print (epsilon)
model = Openmax(lower_stack=lower_stack, n_known=6, epsilon=epsilon)
model.to(device)

# TEXT + AUDIO
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, momentum=0.9)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3, verbose=True)

early_stopping = EarlyStopping(patience=9, delta=0, path=MULTIMODAL_MODEL_PATH)
for epoch in range(40):
    print(f"Epoch {epoch+1}\n-------------------------------")
    model.train(train_dataloader, loss_fn, optimizer)

    # Test
    print( model.test(train_dataloader, loss_fn))
    print( model.test(test_dataloader, loss_fn))

    # print(f"Accuracy on Train Set => {accuracy(train_dataloader,model)} | Accuracy on Closed Validation Set => {accuracy(val_closed_set_dataloader,model)}")
    # scheduler.step(accuracy(val_closed_set_dataloader,model))
    # early_stopping(accuracy(val_closed_set_dataloader,model),model)
    # if early_stopping.early_stop:
    #     print("Early stopping")
    #     break

# best_multimodal_model_val_accuracy = accuracy(val_closed_set_dataloader,model)
# print(f"Best Multimodal Model's accuracy on Closed Validation Set => {best_multimodal_model_val_accuracy}")

0.25
0.1
Epoch 1
-------------------------------


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [88]:
for batch, (X, y) in enumerate(test_dataloader):
    pred = model.predict(X)
    y = torch.argmax(y,dim=1)
    print ((pred==y).sum().item()/len(y))

0.3125
0.390625
0.3958333333333333


In [135]:
# Import transforms for pre-processing
from torchvision import transforms

# Load MNIST dataset
data = torchosr.data.base_datasets.MNIST_base(root = 'data', download = True, transform = transforms.Compose([transforms.Resize(28),transforms.ToTensor()]))

from torch.utils.data import DataLoader

# Select KKC and UUC from configuration
kkc, uuc = config[0]

# Get training and testing data for first out of 5 folds
train_data, test_data = torchosr.data.get_train_test(data, kkc, uuc, root = 'data', tunning = False, fold = 0, n_folds = 5, seed = 1234)

# Create DataLoaders
train_data_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_data_loader = DataLoader(test_data, batch_size=64, shuffle=True)

In [144]:
next(iter(train_data_loader))[1][:,-1]

config, openness = torchosr.data.configure_division(data, n_openness = 3, repeats = 3, seed = 1234)
kkc, uuc = config[0]
print (len(kkc))

3


In [134]:
# get first batch of dataloader
pred = model(next(iter(train_dataloader))[0]).shape
next(iter(train_dataloader))[1].shape

torch.Size([64])

### Multimodal Training

In [157]:
model = AudioTextEmotionModel(6)
model.to(device)

# TEXT + AUDIO
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3, verbose=True)

early_stopping = EarlyStopping(patience=9, delta=0, path=MULTIMODAL_MODEL_PATH)
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    print(f"Accuracy on Train Set => {accuracy(train_dataloader,model)} | Accuracy on Closed Validation Set => {accuracy(val_closed_set_dataloader,model)}")
    scheduler.step(accuracy(val_closed_set_dataloader,model))
    early_stopping(accuracy(val_closed_set_dataloader,model),model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

print(f"MODEL TRAINED...RE-LOADING BEST MODEL")
model = AudioTextEmotionModel(6)
model.to(device)
model.load_state_dict(torch.load(MULTIMODAL_MODEL_PATH))
best_multimodal_model_val_accuracy = accuracy(val_closed_set_dataloader,model)
print(f"Best Multimodal Model's accuracy on Closed Validation Set => {best_multimodal_model_val_accuracy}")

Epoch 1
-------------------------------
loss: 0.011866  [   64/  800]
Accuracy on Train Set => 1.0 | Accuracy on Closed Validation Set => 0.58
Epoch 2
-------------------------------
loss: 0.009267  [   64/  800]
Accuracy on Train Set => 1.0 | Accuracy on Closed Validation Set => 0.57
EarlyStopping counter: 1 out of 9
Epoch 3
-------------------------------
loss: 0.043240  [   64/  800]
Accuracy on Train Set => 1.0 | Accuracy on Closed Validation Set => 0.59
Epoch 4
-------------------------------
loss: 0.052966  [   64/  800]
Accuracy on Train Set => 1.0 | Accuracy on Closed Validation Set => 0.58
EarlyStopping counter: 1 out of 9
Epoch 5
-------------------------------
loss: 0.019091  [   64/  800]
Accuracy on Train Set => 1.0 | Accuracy on Closed Validation Set => 0.58
EarlyStopping counter: 2 out of 9
Epoch 6
-------------------------------
loss: 0.010346  [   64/  800]
Accuracy on Train Set => 1.0 | Accuracy on Closed Validation Set => 0.58
EarlyStopping counter: 3 out of 9
Epoch 

## Evaluation (of the multimodal model on closed and open set labels both)

In [158]:
def set_dropout_to_train(eval_model):
    for module in eval_model.modules():
        if isinstance(module, nn.Dropout):
            module.train()

def predict(label, model, text, spectrogram_data, n_simulations=100, threshold=1, other_label=OTHER_LABEL):
    predictions = [model(text, spectrogram_data).detach().cpu() for _ in range(n_simulations)]
    predictions = torch.stack(predictions)
    predictions = F.softmax(predictions, dim=2)

    mean_predictions = torch.mean(predictions,dim=0)
    std_predictions = torch.mean(torch.std(predictions,dim=0),dim=1)
    _,predicted_class = torch.max(mean_predictions,1)
    high_uncertainty = std_predictions>threshold
    predicted_class[high_uncertainty]=other_label
    return predicted_class

def evaluate(model, dataloader, device, threshold=0.6):
    # After setting the model to evaluation mode, call this function
    model.eval()
    set_dropout_to_train(model)

    size = len(dataloader.dataset)
    total_correct = 0
    total_confusion_matrix = torch.zeros((7,7))
    # total_correct_pred_of_other_label, total_actual_other_label = 0,0
    for batch, (text, spectrogram_data, label) in enumerate(dataloader):
        text, spectrogram_data, label = text.to(device), spectrogram_data.to(device), label.to(device)

        predicted = predict(label, model, text, spectrogram_data, threshold=threshold)
        predicted = predicted.cpu()
        actual = label.cpu()
        correct = predicted == actual
        total_correct += correct.sum().item()
        cm = confusion_matrix(predicted,actual)
        if(cm.shape[0]!=7 and cm.shape[1]!=7):
            row_of_zeros = np.zeros((7-cm.shape[0],cm.shape[1]))
            array_with_row = np.concatenate((cm, row_of_zeros), axis=0)

            # Add a column of zeros at the end
            column_of_zeros = np.zeros((7, 7-cm.shape[1]))
            array_with_row_and_column = np.concatenate((array_with_row, column_of_zeros), axis=1)
            cm = array_with_row_and_column

        total_confusion_matrix+= cm

    print(total_confusion_matrix)
    return total_correct/size

In [177]:
evaluate(model,val_dataloader,device,threshold=0.18)

tensor([[23.,  3.,  1.,  5.,  3.,  2., 30.],
        [ 5., 17.,  4.,  3.,  0.,  2., 24.],
        [ 1.,  0.,  6.,  0.,  1.,  0.,  8.],
        [ 1.,  0.,  0.,  6.,  0.,  0.,  3.],
        [ 1.,  1.,  1.,  0.,  3.,  3.,  3.],
        [ 2.,  0.,  1.,  0.,  1.,  4.,  8.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=torch.float64)


0.3352272727272727

In [178]:
evaluate(model,test_dataloader,device,threshold=0.18)

tensor([[21.,  6.,  1.,  4.,  2.,  2., 32.],
        [ 5., 12.,  3.,  0.,  1.,  0., 19.],
        [ 1.,  1.,  9.,  0.,  1.,  0.,  8.],
        [ 4.,  0.,  0.,  8.,  0.,  0.,  3.],
        [ 1.,  1.,  0.,  1.,  4.,  1.,  8.],
        [ 1.,  0.,  0.,  0.,  1.,  9.,  6.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=torch.float64)


0.35795454545454547

In [34]:
# learn pytorch basic with some basic models and datasets
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/nnqs_tutorial.html

In [180]:
import numpy as np
import scipy.spatial.distance as spd
import torch

import libmr


def calc_distance(query_score, mcv, eu_weight, distance_type='eucos'):
    if distance_type == 'eucos':
        query_distance = spd.euclidean(mcv, query_score) * eu_weight + \
            spd.cosine(mcv, query_score)
    elif distance_type == 'euclidean':
        query_distance = spd.euclidean(mcv, query_score)
    elif distance_type == 'cosine':
        query_distance = spd.cosine(mcv, query_score)
    else:
        print("distance type not known: enter either of eucos, euclidean or cosine")
    return query_distance


def fit_weibull(means, dists, categories, tailsize=20, distance_type='eucos'):
    """
    Input:
        means (C, channel, C)
        dists (N_c, channel, C) * C
    Output:
        weibull_model : Perform EVT based analysis using tails of distances and save
                        weibull model parameters for re-adjusting softmax scores
    """
    weibull_model = {}
    for mean, dist, category_name in zip(means, dists, categories):
        weibull_model[category_name] = {}
        weibull_model[category_name]['distances_{}'.format(distance_type)] = dist[distance_type]
        weibull_model[category_name]['mean_vec'] = mean
        weibull_model[category_name]['weibull_model'] = []
        for channel in range(mean.shape[0]):
            mr = libmr.MR()
            tailtofit = np.sort(dist[distance_type][channel, :])[-tailsize:]
            mr.fit_high(tailtofit, len(tailtofit))
            weibull_model[category_name]['weibull_model'].append(mr)

    return weibull_model


def query_weibull(category_name, weibull_model, distance_type='eucos'):
    return [weibull_model[category_name]['mean_vec'],
            weibull_model[category_name]['distances_{}'.format(distance_type)],
            weibull_model[category_name]['weibull_model']]


def compute_openmax_prob(scores, scores_u):
    prob_scores, prob_unknowns = [], []
    for s, su in zip(scores, scores_u):
        channel_scores = np.exp(s)
        channel_unknown = np.exp(np.sum(su))

        total_denom = np.sum(channel_scores) + channel_unknown
        prob_scores.append(channel_scores / total_denom)
        prob_unknowns.append(channel_unknown / total_denom)

    # Take channel mean
    scores = np.mean(prob_scores, axis=0)
    unknowns = np.mean(prob_unknowns, axis=0)
    modified_scores = scores.tolist() + [unknowns]
    return modified_scores


def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


def openmax(weibull_model, categories, input_score, eu_weight, alpha=10, distance_type='eucos'):
    """Re-calibrate scores via OpenMax layer
    Output:
        openmax probability and softmax probability
    """
    nb_classes = len(categories)

    ranked_list = input_score.argsort().ravel()[::-1][:alpha]
    alpha_weights = [((alpha + 1) - i) / float(alpha) for i in range(1, alpha + 1)]
    omega = np.zeros(nb_classes)
    omega[ranked_list] = alpha_weights

    scores, scores_u = [], []
    for channel, input_score_channel in enumerate(input_score):
        score_channel, score_channel_u = [], []
        for c, category_name in enumerate(categories):
            mav, dist, model = query_weibull(category_name, weibull_model, distance_type)
            channel_dist = calc_distance(input_score_channel, mav[channel], eu_weight, distance_type)
            wscore = model[channel].w_score(channel_dist)
            modified_score = input_score_channel[c] * (1 - wscore * omega[c])
            score_channel.append(modified_score)
            score_channel_u.append(input_score_channel[c] - modified_score)

        scores.append(score_channel)
        scores_u.append(score_channel_u)

    scores = np.asarray(scores)
    scores_u = np.asarray(scores_u)

    openmax_prob = np.array(compute_openmax_prob(scores, scores_u))
    softmax_prob = softmax(np.array(input_score.ravel()))
    return openmax_prob, softmax_prob


def compute_channel_distances(mavs, features, eu_weight=0.5):
    """
    Input:
        mavs (channel, C)
        features: (N, channel, C)
    Output:
        channel_distances: dict of distance distribution from MAV for each channel.
    """
    eucos_dists, eu_dists, cos_dists = [], [], []
    for channel, mcv in enumerate(mavs):  # Compute channel specific distances
        eu_dists.append([spd.euclidean(mcv, feat[channel]) for feat in features])
        cos_dists.append([spd.cosine(mcv, feat[channel]) for feat in features])
        eucos_dists.append([spd.euclidean(mcv, feat[channel]) * eu_weight +
                            spd.cosine(mcv, feat[channel]) for feat in features])

    return {'eucos': np.array(eucos_dists), 'cosine': np.array(cos_dists), 'euclidean': np.array(eu_dists)}


def compute_train_score_and_mavs_and_dists(train_class_num,trainloader,device,net):
    scores = [[] for _ in range(train_class_num)]
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            inputs, targets = inputs.to(device), targets.to(device)

            # this must cause error for cifar
            _, outputs = net(inputs)
            for score, t in zip(outputs, targets):
                # print(f"torch.argmax(score) is {torch.argmax(score)}, t is {t}")
                if torch.argmax(score) == t:
                    scores[t].append(score.unsqueeze(dim=0).unsqueeze(dim=0))
    scores = [torch.cat(x).cpu().numpy() for x in scores]  # (N_c, 1, C) * C
    mavs = np.array([np.mean(x, axis=0) for x in scores])  # (C, 1, C)
    dists = [compute_channel_distances(mcv, score) for mcv, score in zip(mavs, scores)]
    return scores, mavs, dists

def predict_with_openmax(model, input, device, weibull_models, mean_activations):
    model.eval()
    with torch.no_grad():
        input = input.to(device)
        activations = model.fc1(input)
        activations = model.relu(activations)

    scores = openmax_score(weibull_models, mean_activations, activations.numpy())
    return np.argmax(scores), scores
