# Open Set Emotion Recognition

## Library Imports

In [30]:
import warnings
warnings.filterwarnings("ignore")
import sys
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from collections import Counter
import torch.nn as nn
import os
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
tqdm.pandas()
from tqdm import tqdm
import re
from collections import Counter
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchaudio
from transformers import HubertModel, HubertConfig
from sentence_transformers import SentenceTransformer
from functools import lru_cache
import utils
import torchosr
from torchosr.models import Openmax, TSoftmax
from torch.utils.tensorboard import SummaryWriter

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
torch.manual_seed(42)
print(f"Using device: {device}")

Using device: cuda


In [2]:
ALL_LABELS = ["Neutral state","Frustration","Anger","Sadness","Happiness","Excited","Surprise","Fear","Other","Disgust"]
LABELS_TO_INT = {label: i for i, label in enumerate(ALL_LABELS)}
CLOSED_LABELS = ALL_LABELS[:5]
OTHER_LABEL = len(CLOSED_LABELS)
CREATE_AND_SAVE_AUGMENTED_FILES = False

#### Audio Features Using Hubert

In [3]:
SAVED_DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), "SAVED_DATA")
if not os.path.exists(SAVED_DATA_DIR):
    os.makedirs(SAVED_DATA_DIR)

SAVED_HUBERT_EMBEDDINGS = os.path.join(SAVED_DATA_DIR, "hubert_embeddings.pickle")

if os.path.exists(SAVED_HUBERT_EMBEDDINGS):
    hubert_embeddings = utils.load_data_using_pickle(SAVED_HUBERT_EMBEDDINGS)
else:
    hubert_embeddings = {} # audio path to embedding vector dict

HUBERT_MODEL_NAME = "facebook/hubert-large-ls960-ft"
# HUBERT_MODEL_NAME = "facebook/hubert-base-ls960"
HUBERT_MODEL_DIMENSION = 1024 # or 768
config = HubertConfig.from_pretrained(HUBERT_MODEL_NAME)
hubert_model = HubertModel.from_pretrained(HUBERT_MODEL_NAME, config=config)
sbert = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)

def save_hubert_embeddings():
    utils.save_data_using_pickle(hubert_embeddings, SAVED_HUBERT_EMBEDDINGS)

def get_hidden_states_from_audio_and_save(audio_path):
    if os.path.basename(audio_path) in hubert_embeddings:
        return hubert_embeddings[os.path.basename(audio_path)]
    
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample if necessary (HuBERT uses 16 kHz sample rate)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
        sample_rate = 16000

    # Ensure single channel audio (mono)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Normalize audio
    waveform = waveform / torch.max(torch.abs(waveform))

    # Step 3: Encode audio waveforms using HuBERT model
    # Pass the waveform through the model
    with torch.no_grad():
        outputs = hubert_model(waveform)

    # Get the hidden states
    hidden_states = outputs.last_hidden_state
    hubert_embeddings[os.path.basename(audio_path)] = hidden_states
    return hidden_states

def get_audio_features(audio_paths):
    audio_embeddings_list = []
    for audio_path in tqdm(audio_paths):
        hidden_states = get_hidden_states_from_audio_and_save(audio_path)
        frame_averaged_hidden_states = torch.mean(hidden_states, dim=1)
        frame_averaged_hidden_states = frame_averaged_hidden_states.view(-1,HUBERT_MODEL_DIMENSION)[0]
        audio_embeddings_list.append(frame_averaged_hidden_states)
    # save_hubert_embeddings()
    return audio_embeddings_list

Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Dataset Creation

#### IEMOCAP

In [6]:
IEMOCAP_FULL_PATH = os.path.join(os.path.dirname(os.getcwd()),"IEMOCAP_full_release")
IEMOCAP_MAIN_FOLDER = os.path.join(IEMOCAP_FULL_PATH,"IEMOCAP_full_release")
TRANSCRIPTION_FOLDER = os.path.join("dialog", "transcriptions")
AUDIO_FOLDER = os.path.join("sentences", "wav")
CATEGORICAL_LABELS_PATH = os.path.join("dialog", "EmoEvaluation", "Categorical")
AUGMENTED_AUDIO_FOLDER = os.path.join(os.path.dirname(os.getcwd()), "augmented")

In [7]:
def get_evaluator_filenames_with_video_file_prefix(input_list, prefix_value):
    regex_pattern = re.compile(f'^{re.escape(prefix_value)}.*\.txt$')
    matching_strings = [s for s in input_list if regex_pattern.match(s)]
    return matching_strings

def get_agreed_upon_evaluation(evaluations):
    top_two_frequent_evaluations = Counter(evaluations).most_common(2)
    if(len(top_two_frequent_evaluations)==1):
        return top_two_frequent_evaluations[0][0]
    most_frequent_evaluation_and_its_frequency, second_most_frequent_evaluation_and_its_frequency = top_two_frequent_evaluations
    highest_frequency = most_frequent_evaluation_and_its_frequency[1]
    second_highest_frequency = second_most_frequent_evaluation_and_its_frequency[1]
    if(highest_frequency==second_highest_frequency):
        return "AMBIGUOUS"
    else:
        return most_frequent_evaluation_and_its_frequency[0]

def get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files):
    utterance_to_all_evaluations = {}

    for evaluation_file in evaluation_files:
        utterance_to_evaluationList = {}
        with open(evaluation_file,'r') as f:
            contents = f.read()
            utterance_evaluations = contents.split("\n")
            for evaluation in utterance_evaluations:
                evaluation = evaluation.strip()
                if(len(evaluation)==0):
                    continue
                matches = re.findall(r':[^;]+;', evaluation)
                matches = [match[1:-1] for match in matches]
                utterance_to_evaluationList[evaluation.split()[0]] = matches
        
        # Combine lists from dict1
        for key, value_list in utterance_to_evaluationList.items():
            utterance_to_all_evaluations[key] = utterance_to_all_evaluations.get(key, []) + value_list

    utterance_to_evaluationsCounter = {k: get_agreed_upon_evaluation(v) for k,v in utterance_to_all_evaluations.items()}
    return utterance_to_evaluationsCounter

def is_label_a_closed_label(evaluation):
    return evaluation in CLOSED_LABELS

def create_unprocessed_dataset(is_closed_label_set_flag):
    dataset = []
    audio_files = []
    sentences_list = []
    for session_num in range(1,6):
        for transcription_filename in os.listdir(os.path.join(IEMOCAP_MAIN_FOLDER,f"Session{session_num}", TRANSCRIPTION_FOLDER)):
            if(transcription_filename[0]!="."): 

                filename_without_extension = transcription_filename.split(".")[0]
                
                categorical_labels_folder_full_path = os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}", CATEGORICAL_LABELS_PATH)
                evaluation_filenames = get_evaluator_filenames_with_video_file_prefix(os.listdir(categorical_labels_folder_full_path), filename_without_extension)
                evaluation_files_full_paths_for_this_file = [os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}", CATEGORICAL_LABELS_PATH, f) for f in evaluation_filenames]
                evaluations_per_utterance = get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files_full_paths_for_this_file)
                
                transcription_file_full_path = os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}", TRANSCRIPTION_FOLDER, transcription_filename) 
                with open(transcription_file_full_path,'r') as f:
                    contents = f.read()
                    lines = contents.split("\n")

                    # Iterate through utterances where every utterance looks like:
                    # Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.
                    for line in lines:

                        # Remove extra spaces and check if the line is not an empty link (usually at EOF)
                        line = line.strip()
                        if(len(line)==0):
                            break

                        # Remove idx of first space, ], -
                        try:
                            space_idx = line.index(" ")
                            timestampEndBracket_idx = line.index("]")
                            timestampHyphen_idx = line.index("-")
                        except:
                            continue
                        else:
                            audio_filename = line[:space_idx]        # output audio file name = utterance name
                            text = line[timestampEndBracket_idx+3:]         # the transcription of the utterance
                            evaluation = evaluations_per_utterance.get(audio_filename,"KEY_ERROR")

                            utterance_audios_per_video_folder = audio_filename[:line.rindex('_')]       # Only need Ses01F_impro01 from Ses01F_impro01_F000
                            audio_file_full_path = os.path.join(IEMOCAP_MAIN_FOLDER, f"Session{session_num}", AUDIO_FOLDER, utterance_audios_per_video_folder, audio_filename+".wav")         # name of the video file

                            if(evaluation!="KEY_ERROR" and evaluation!= "AMBIGUOUS" and os.path.isfile(audio_file_full_path)==True and is_label_a_closed_label(evaluation)==is_closed_label_set_flag):
                            # if(evaluation!="KEY_ERROR" and os.path.isfile(audio_file_full_path)==True):    
                                audio_files.append(audio_file_full_path)
                                sentences_list.append(text)
                                dataset.append((text,audio_file_full_path,evaluation))
    return dataset, sentences_list, audio_files

In [8]:
openSetUnprocessedDataset, openSetSentencesList, openSetAudioFiles = create_unprocessed_dataset(is_closed_label_set_flag = False)
closedSetUnprocessedDataset, closedSetSentencesList, closedSetAudioFiles= create_unprocessed_dataset(is_closed_label_set_flag = True)

In [9]:
closedSetUnprocessedDataset[5]

('Who told you to get in this line?',
 'd:\\Projects\\open-set-emotion-recognition\\IEMOCAP_full_release\\IEMOCAP_full_release\\Session1\\sentences\\wav\\Ses01F_impro01\\Ses01F_impro01_M002.wav',
 'Frustration')

In [10]:
class IemocapDataset(Dataset):
    def __init__(self, unprocessed_dataset, sentences_list, audio_files, split, is_closed_label_set_flag, save_augmented_files ) -> None:
        self.unprocessed_dataset = unprocessed_dataset
        self.is_closed_label_set_flag = is_closed_label_set_flag
        self.split = split
        self.sentence_embeddings = sbert.encode(sentences_list, convert_to_tensor=True, show_progress_bar=True, batch_size=128, device=device)

        self.save_augmented_files = save_augmented_files
        self.augment_times = 4
        # we augment just for train, so we are repeating the sentences as only audios are augmented
        # if self.split == "train":
        #     self.sentence_embeddings = np.repeat(self.sentence_embeddings, repeats=self.augment_times, axis=0)
        #     self.unprocessed_dataset = self.create_augmented_dataset()

        all_audio_files = [instance[1] for instance in self.unprocessed_dataset]
        assert(all([os.path.exists(audio_file) for audio_file in all_audio_files]), "Some audio files do not exist")
        self.audio_embeddings = get_audio_features(all_audio_files)


    def create_augmented_dataset(self):
        augmented_dataset = []
        os.makedirs(AUGMENTED_AUDIO_FOLDER, exist_ok=True)
        for text, audio_file_full_path, evaluation in self.unprocessed_dataset:
            _, audio_filename_and_extension = os.path.split(audio_file_full_path)
            audio_filename, extension = audio_filename_and_extension.split(".")
            for augmented_version_num in range(self.augment_times):
                augmented_audio_path = os.path.join(AUGMENTED_AUDIO_FOLDER,f"{audio_filename}_version_{augmented_version_num}.{extension}")
                if self.save_augmented_files:
                    utils.augment_audio_and_save(input_audio_path = audio_file_full_path, augmented_audio_path = augmented_audio_path)
                else:
                    assert(os.path.exists(augmented_audio_path), f"Augmented Audio Path {augmented_audio_path} not exist")
                augmented_dataset.append((text, augmented_audio_path, evaluation))
        return augmented_dataset
    
    def __len__(self):
        return len(self.unprocessed_dataset)
    
    def __getitem__(self, idx):
        _, audio, label = self.unprocessed_dataset[idx]
        local_device = "cuda"
        if(self.split in ["val","test"]):
            local_device = "cpu"
        text = self.sentence_embeddings[idx].to(local_device)
        audio = self.audio_embeddings[idx].to(local_device)
        combined = torch.cat([audio, text])
        if self.is_closed_label_set_flag==False:
            label = OTHER_LABEL
        else:
            label = LABELS_TO_INT[label]

        label_vec = np.zeros(len(CLOSED_LABELS)+1)
        label_vec[label] = 1
        label_vec = torch.tensor(label_vec).to(local_device)
        
        return combined, label_vec

In [11]:
def get_stratified_split(unprocessed_dataset, sentences_list, audio_files, test_size):
    # instance[-1] is label
    labels = [instance[-1] for instance in unprocessed_dataset]
    unprocessed_train_dataset, unprocessed_test_dataset, train_sentences_list, test_sentences_list, train_audio_files, test_audio_files = train_test_split(unprocessed_dataset, sentences_list, audio_files, test_size=test_size, stratify=labels, random_state=42)
    return unprocessed_train_dataset, unprocessed_test_dataset, train_sentences_list, test_sentences_list, train_audio_files, test_audio_files

In [12]:
closed_set_unprocessed_train_dataset, temp_unprocessed_dataset, closed_set_train_sentences_list, temp_sentences_list, closed_set_train_audio_files, temp_audio_files = get_stratified_split(closedSetUnprocessedDataset, closedSetSentencesList, closedSetAudioFiles, 0.2)
closed_set_unprocessed_val_dataset, closed_set_unprocessed_test_dataset, closed_set_val_sentences_list, closed_set_test_sentences_list, closed_set_val_audio_files, closed_set_test_audio_files = get_stratified_split(temp_unprocessed_dataset, temp_sentences_list, temp_audio_files, 0.5)
open_set_unprocessed_val_dataset, open_set_unprocessed_test_dataset, open_set_val_sentences_list, open_set_test_sentences_list, open_set_val_audio_files, open_set_test_audio_files = get_stratified_split(openSetUnprocessedDataset, openSetSentencesList, openSetAudioFiles, 0.5)

In [13]:
closed_set_train_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_train_dataset,
                                          sentences_list = closed_set_train_sentences_list,
                                          audio_files = closed_set_train_audio_files,
                                          split = "train",
                                          is_closed_label_set_flag = True,
                                          save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

closed_set_val_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_val_dataset,
                                        sentences_list = closed_set_val_sentences_list,
                                        audio_files = closed_set_val_audio_files,
                                        split="val",
                                        is_closed_label_set_flag=True,
                                        save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

closed_set_test_dataset = IemocapDataset(unprocessed_dataset = closed_set_unprocessed_test_dataset,
                                        sentences_list = closed_set_test_sentences_list,
                                        audio_files = closed_set_test_audio_files,
                                        split="test",
                                        is_closed_label_set_flag=True,
                                        save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

open_set_val_dataset = IemocapDataset(unprocessed_dataset = open_set_unprocessed_val_dataset,
                                      sentences_list = open_set_val_sentences_list,
                                      audio_files = open_set_val_audio_files,
                                      split="val",
                                      is_closed_label_set_flag=False,
                                      save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

open_set_test_dataset = IemocapDataset(unprocessed_dataset = open_set_unprocessed_test_dataset,
                                      sentences_list = open_set_test_sentences_list,
                                      audio_files = open_set_test_audio_files,
                                      split="test",
                                      is_closed_label_set_flag=False,
                                      save_augmented_files=CREATE_AND_SAVE_AUGMENTED_FILES)

Batches: 100%|██████████| 40/40 [00:31<00:00,  1.27it/s]
100%|██████████| 5071/5071 [12:49<00:00,  6.59it/s]  
Batches: 100%|██████████| 5/5 [00:04<00:00,  1.22it/s]
100%|██████████| 634/634 [02:54<00:00,  3.64it/s]
Batches: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
100%|██████████| 634/634 [02:54<00:00,  3.63it/s]
Batches: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]
100%|██████████| 596/596 [02:29<00:00,  4.00it/s]
Batches: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
100%|██████████| 597/597 [03:14<00:00,  3.08it/s]


In [14]:
print(len(closed_set_train_dataset))
print(len(closed_set_val_dataset))
print(len(closed_set_test_dataset))
print(len(open_set_val_dataset))
print(len(open_set_test_dataset))


5071
634
634
596
597


In [15]:
entire_val_dataset = closed_set_val_dataset + open_set_val_dataset
entire_test_dataset = closed_set_test_dataset + open_set_test_dataset

In [16]:
len(entire_val_dataset), len(entire_test_dataset)

(1230, 1231)

In [17]:
# Create data loaders.
batch_size = 64

# AUDIO + TEXT DATALOADERS (CONSISTS OF BOTH OPEN AND CLOSED LABELS)
train_dataloader = DataLoader(closed_set_train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(entire_val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(entire_test_dataset, batch_size=batch_size, shuffle=True)

# AUDIO + TEXT DATALOADERS (ONLY CLOSED LABELS - USED FOR CHECKING MODEL PERFORMANCE AFTER KEEPING ASIDE THE OPEN SET CHALLENGE)
val_closed_set_dataloader = DataLoader(closed_set_val_dataset, batch_size=batch_size, shuffle=True)
test_closed_set_dataloader = DataLoader(closed_set_test_dataset, batch_size=batch_size, shuffle=True)



## Model Architectures Definition

In [18]:
# MODELS PATHS
MODELS_DIR = os.path.join(os.path.dirname(os.getcwd()), "MODELS")
if not os.path.exists(MODELS_DIR):
    os.makedirs(MODELS_DIR)

MULTIMODAL_MODEL_PATH = os.path.join(MODELS_DIR, "multimodal_model_augmented.pt")
NEW_MULTIMODAL_MODEL_PATH = os.path.join(MODELS_DIR, "multimodal_model_without_happiness.pt")
MULTIMODAL_MODEL_OPENMAX_PATH = os.path.join(MODELS_DIR, "multimodal_model_openmax.pt")
MULTIMODAL_MODEL_TSOFTMAX_PATH = os.path.join(MODELS_DIR, "multimodal_model_tsoftmax.pt")
MULTIMODAL_MODEL_OVERLAYSOFTMAX_PATH = os.path.join(MODELS_DIR, "multimodal_model_overlaysoftmax.pt")

In [19]:
class AudioTextEmotionModel(nn.Module):
    def __init__(self, num_classes):
        super(AudioTextEmotionModel, self).__init__()
        # sequential model with 2 layers, followed by dropout and relu layers and output layer
        self.fc = nn.Sequential(
            nn.Linear(HUBERT_MODEL_DIMENSION + 768, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )


    def forward(self, text_audio_combined):
        return self.fc(text_audio_combined)

## Model Training

In [20]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0, path='model_checkpoint.pt'):
        self.patience = patience
        self.delta = delta
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_acc_max = np.Inf

    def __call__(self, val_acc, model):

        if self.best_score is None:
            self.best_score = val_acc
            self.save_checkpoint(val_acc, model)
        elif val_acc < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_acc
            self.save_checkpoint(val_acc, model)
            self.counter = 0

    def save_checkpoint(self, val_acc, model):
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_acc

In [31]:
def accuracy(dataloader, model):
    size = len(dataloader.dataset)
    total_correct = 0
    model.eval()
    for batch, x_and_y in enumerate(dataloader):
        x_and_y_device = [x_and_y[i].to(device) for i in range(len(x_and_y))]

        # Compute prediction error
        pred = model(*x_and_y_device[:-1])
        predicted = torch.argmax(pred,dim=1).cpu()
        _,label = torch.max(x_and_y_device[-1],1)
        actual = label.cpu()
        correct = predicted == actual
        total_correct += correct.sum().item()
    return total_correct/size

def get_classification_report(dataloader, model):
    predicted_list = []
    actual_list = []
    model.eval()
    for batch, x_and_y in enumerate(dataloader):
        x_and_y_device = [x_and_y[i].to(device) for i in range(len(x_and_y))]

        # Compute prediction error
        pred = model(*x_and_y_device[:-1])
        predicted = torch.argmax(pred,dim=1).cpu()
        _,label = torch.max(x_and_y_device[-1],1)
        actual = label.cpu()
        predicted_list.extend(predicted.tolist())
        actual_list.extend(actual.tolist())
    return classification_report(actual_list,predicted_list)

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, x_and_y in enumerate(dataloader):
        x_and_y_device = [x_and_y[i].to(device) for i in range(len(x_and_y))]

        # Compute prediction error
        pred = model(*x_and_y_device[:-1])
        label = x_and_y_device[-1]
        _, label = torch.max(label,1)
        loss = loss_fn(pred, label)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 10 == 0:
            loss, current = loss.item(), (batch + 1) * len(x_and_y_device[0])
            
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [22]:
model = AudioTextEmotionModel(OTHER_LABEL)
model.to(device)

# TEXT + AUDIO
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3, verbose=True)
num_epochs = 100

writer = SummaryWriter()

early_stopping = EarlyStopping(patience=9, delta=0, path=NEW_MULTIMODAL_MODEL_PATH)
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    train_closed_set_accuracy = accuracy(train_dataloader,model)
    val_closed_set_accuracy = accuracy(val_closed_set_dataloader,model)
    print(f"Accuracy on Train Set => {train_closed_set_accuracy} | Accuracy on Closed Validation Set => {val_closed_set_accuracy}")
    
    writer.add_scalar('Accuracy/train', train_closed_set_accuracy, epoch)
    writer.add_scalar('Accuracy/val', val_closed_set_accuracy, epoch)

    scheduler.step(val_closed_set_accuracy)
    early_stopping(val_closed_set_accuracy,model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

print(f"MODEL TRAINED...RE-LOADING BEST MODEL")
model = AudioTextEmotionModel(OTHER_LABEL)
model.to(device)
model.load_state_dict(torch.load(NEW_MULTIMODAL_MODEL_PATH))
best_multimodal_model_val_accuracy = accuracy(val_closed_set_dataloader,model)
print(f"Best Multimodal Model's accuracy on Closed Validation Set => {best_multimodal_model_val_accuracy}")

Epoch 1
-------------------------------
loss: 1.703944  [   64/ 5071]
loss: 2.608342  [  704/ 5071]
loss: 1.816437  [ 1344/ 5071]
loss: 1.484013  [ 1984/ 5071]
loss: 1.335314  [ 2624/ 5071]
loss: 1.160007  [ 3264/ 5071]
loss: 1.309810  [ 3904/ 5071]
loss: 1.327434  [ 4544/ 5071]
Accuracy on Train Set => 0.5493985407217511 | Accuracy on Closed Validation Set => 0.49369085173501576
Epoch 2
-------------------------------
loss: 1.092633  [   64/ 5071]
loss: 1.023347  [  704/ 5071]
loss: 1.096565  [ 1344/ 5071]
loss: 1.038752  [ 1984/ 5071]
loss: 1.004268  [ 2624/ 5071]
loss: 1.490084  [ 3264/ 5071]
loss: 0.768193  [ 3904/ 5071]
loss: 0.912989  [ 4544/ 5071]
Accuracy on Train Set => 0.6880299743640308 | Accuracy on Closed Validation Set => 0.5946372239747634
Epoch 3
-------------------------------
loss: 0.890696  [   64/ 5071]
loss: 1.106116  [  704/ 5071]
loss: 0.836708  [ 1344/ 5071]
loss: 1.050000  [ 1984/ 5071]
loss: 0.843960  [ 2624/ 5071]
loss: 1.058308  [ 3264/ 5071]
loss: 0.869038 

In [152]:
model = AudioTextEmotionModel(OTHER_LABEL)
model.to(device)
model.load_state_dict(torch.load(NEW_MULTIMODAL_MODEL_PATH))
best_multimodal_model_val_clf_report = get_classification_report(val_closed_set_dataloader,model)
print(f"Best Multimodal Model's classification report on Closed Validation Set =>\n {best_multimodal_model_val_clf_report}")

Best Multimodal Model's classification report on Closed Validation Set =>
               precision    recall  f1-score   support

           0       0.57      0.68      0.62       171
           1       0.59      0.61      0.60       185
           2       0.65      0.50      0.56       110
           3       0.69      0.63      0.66       109
           4       0.62      0.59      0.61        59

    accuracy                           0.61       634
   macro avg       0.63      0.60      0.61       634
weighted avg       0.62      0.61      0.61       634



## MONTE CARLO DROPOUT

In [153]:
def set_dropout_to_train(eval_model):
    for module in eval_model.modules():
        if isinstance(module, nn.Dropout):
            module.train()

def predict(label, model, combined_data, n_simulations=100, threshold=1, other_label=OTHER_LABEL):
    predictions = [model(combined_data).detach().cpu() for _ in range(n_simulations)]
    predictions = torch.stack(predictions)
    predictions = F.softmax(predictions, dim=2)

    mean_predictions = torch.mean(predictions,dim=0)
    std_predictions = torch.mean(torch.std(predictions,dim=0),dim=1)
    _,predicted_class = torch.max(mean_predictions,1)
    high_uncertainty = std_predictions>threshold
    predicted_class[high_uncertainty]=other_label
    return predicted_class

def evaluate(model, dataloader, device, threshold=0.6):
    # After setting the model to evaluation mode, call this function
    model.eval()
    set_dropout_to_train(model)

    size = len(dataloader.dataset)
    total_correct = 0
    predicted_list = []
    actual_list = []
    total_confusion_matrix = torch.zeros((OTHER_LABEL+1,OTHER_LABEL+1))
    # total_correct_pred_of_other_label, total_actual_other_label = 0,0
    for batch, (combined_data, label) in enumerate(dataloader):
        combined_data, label = combined_data.to(device), label.to(device)

        predicted = predict(label, model, combined_data, threshold=threshold)
        predicted = predicted.cpu()
        _, actual = torch.max(label.cpu(),1)
        correct = predicted == actual
        total_correct += correct.sum().item()
        predicted_list.extend(predicted.tolist())
        actual_list.extend(actual.tolist())
        cm = confusion_matrix(predicted,actual)
        if(cm.shape[0]!=(OTHER_LABEL+1) and cm.shape[1]!=(OTHER_LABEL+1)):
            row_of_zeros = np.zeros((OTHER_LABEL+1-cm.shape[0],cm.shape[1]))
            array_with_row = np.concatenate((cm, row_of_zeros), axis=0)

            # Add a column of zeros at the end
            column_of_zeros = np.zeros((OTHER_LABEL+1, OTHER_LABEL+1-cm.shape[1]))
            array_with_row_and_column = np.concatenate((array_with_row, column_of_zeros), axis=1)
            cm = array_with_row_and_column

        total_confusion_matrix+= cm
    print(f"Confusion Matrix:")
    print(total_confusion_matrix)
    print(f"classification Report:")
    print(classification_report(actual_list,predicted_list))
    return total_correct/size

In [159]:
mc_dropout_threshold = 0.1
print("Test Set:")
print(f"Accuracy: {evaluate(model,test_dataloader,device,threshold=mc_dropout_threshold)}")

Test Set:
Confusion Matrix:
tensor([[ 93.,  23.,   5.,   9.,   9., 127.],
        [ 21.,  79.,  23.,   6.,   3.,  71.],
        [  6.,  20.,  56.,   0.,   1.,  29.],
        [  8.,   4.,   0.,  76.,   2.,   7.],
        [ 10.,   1.,   1.,   0.,  31., 164.],
        [ 33.,  58.,  25.,  17.,  14., 199.]], dtype=torch.float64)
classification Report:
              precision    recall  f1-score   support

           0       0.35      0.54      0.43       171
           1       0.39      0.43      0.41       185
           2       0.50      0.51      0.50       110
           3       0.78      0.70      0.74       108
           4       0.15      0.52      0.23        60
           5       0.58      0.33      0.42       597

    accuracy                           0.43      1231
   macro avg       0.46      0.51      0.46      1231
weighted avg       0.51      0.43      0.45      1231

Accuracy: 0.4337936636880585


## TEMPERATURE SCALING

In [40]:
def predict_temp(model, combined_data,threshold=1, temperature=1.0):
    with torch.no_grad():
        predictions = model(combined_data)
        
        # Apply temperature scaling
        predictions = predictions/temperature
        
        probabilities = F.softmax(predictions, dim=1)
        max_val, predicted_class = torch.max(probabilities, 1)

        predicted_class[max_val<threshold] = OTHER_LABEL

        return predicted_class

def evaluate_temp(model, dataloader, device, threshold, temperature):
    model.eval()
    size = len(dataloader.dataset)
    total_confusion_matrix = torch.zeros((OTHER_LABEL+1,OTHER_LABEL+1))
    total_correct = 0
    predicted_list = []
    actual_list = []
    for combined_data, label in dataloader:
        combined_data, label = combined_data.to(device), label.to(device)

        predicted = predict_temp(model, combined_data, threshold, temperature)
        predicted = predicted.cpu()
        _, actual = torch.max(label.cpu(),1)
        correct = (predicted == actual)
        total_correct += correct.sum().item()
        predicted_list.extend(predicted.tolist())
        actual_list.extend(actual.tolist())
        cm = confusion_matrix(predicted,actual)
        if(cm.shape[0]!=(OTHER_LABEL+1) and cm.shape[1]!=(OTHER_LABEL+1)):
            row_of_zeros = np.zeros((OTHER_LABEL+1-cm.shape[0],cm.shape[1]))
            array_with_row = np.concatenate((cm, row_of_zeros), axis=0)
            column_of_zeros = np.zeros((OTHER_LABEL+1, OTHER_LABEL+1-cm.shape[1]))
            array_with_row_and_column = np.concatenate((array_with_row, column_of_zeros), axis=1)
            cm = array_with_row_and_column

        total_confusion_matrix+= cm
    return total_correct/size, total_confusion_matrix, classification_report(actual_list,predicted_list)

In [162]:
def select_best(model, dataloader, device, thresholds, temperatures):
    best_accuracy = 0
    best_params = {'threshold': 0, 'temperature': 0}
    results = []

    for threshold in thresholds:
        for temperature in temperatures:
            accuracy, _, _ = evaluate_temp(model, dataloader, device, threshold, temperature)
            results.append((threshold, temperature, accuracy))
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params['threshold'] = threshold
                best_params['temperature'] = temperature
    
    return best_params, best_accuracy, results

thresholds = np.linspace(0.1, 0.75, 20)  
temperatures = np.linspace(0.5, 2, 16)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

best_params, best_accuracy, results = select_best(model, val_dataloader, device, thresholds, temperatures)
print(f'Best Parameters: {best_params} with an accuracy of {best_accuracy}')

Best Parameters: {'threshold': 0.75, 'temperature': 2.0} with an accuracy of 0.4869918699186992


In [164]:
test_accuracy_temp_scaling, test_cm_temp_scaling, test_clf_report_temp_scaling = evaluate_temp(model, test_dataloader, device, best_params["threshold"], best_params["temperature"])
print(f"Accuracy: {test_accuracy_temp_scaling}\n")
print(f"Confusion matrix:\n{test_cm_temp_scaling}\n")
print(f"Classification Report:\n{test_clf_report_temp_scaling}\n")

Accuracy: 0.4703493095044679

Confusion matrix:
tensor([[ 62.,  16.,   3.,   4.,   7.,  74.],
        [  8.,  57.,  17.,   4.,   0.,  35.],
        [  2.,  12.,  50.,   0.,   1.,  18.],
        [  5.,   0.,   0.,  65.,   0.,   5.],
        [  5.,   1.,   1.,   0.,  28., 148.],
        [ 89.,  99.,  39.,  35.,  24., 317.]], dtype=torch.float64)

Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.36      0.37       171
           1       0.47      0.31      0.37       185
           2       0.60      0.45      0.52       110
           3       0.87      0.60      0.71       108
           4       0.15      0.47      0.23        60
           5       0.53      0.53      0.53       597

    accuracy                           0.47      1231
   macro avg       0.50      0.45      0.45      1231
weighted avg       0.51      0.47      0.48      1231




## OPENMAX

In [151]:
lower_stack=nn.Sequential(
            nn.Linear(1792, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, 64)
        )
# Initialize method
epsilon = torchosr.utils.base.get_openmax_epsilon(OTHER_LABEL)
epsilon=0.3
model = Openmax(lower_stack=lower_stack, n_known=OTHER_LABEL, epsilon=epsilon)
model.to(device)

writer = SummaryWriter()

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

results = []
best_result = 0
for epoch in tqdm(range(50)):
    model.train(train_dataloader, loss_fn, optimizer)

    # Test
    model.to("cpu")
    val_result = model.test(val_dataloader, loss_fn)
    result = val_result[-1].item()
    model.to(device)

    results.append(result)
    writer.add_scalar("OSR/OpenMax", result, epoch)

100%|██████████| 50/50 [01:42<00:00,  2.05s/it]


In [124]:
def predict_and_report_torchosr(model, dataloader):
    preds = []
    labels = []
    model.to("cpu")
    for data, label in dataloader:
        pred = model.predict(data)
        preds.extend(pred.tolist())
        labels.extend(label.tolist())

    true_labels = np.argmax(np.array((labels)), axis=1)
    print(f"Accuracy: {accuracy_score(true_labels,preds)}\n")
    print(f"Confusion matrix:\n{confusion_matrix(true_labels,preds)}\n")
    print(f"Classification Report:\n{classification_report(true_labels, preds)}\n")

    model.to(device)

In [137]:
predict_and_report_torchosr(model, test_dataloader)

Accuracy: 0.48172217709179527

Confusion matrix:
[[ 54  25   4   3   1  84]
 [ 14  76   9   2   0  84]
 [  4  32  30   2   0  42]
 [  6   4   0  60   0  38]
 [ 14   2   0   5   9  30]
 [ 89  57  24  20  43 364]]

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.32      0.31       171
           1       0.39      0.41      0.40       185
           2       0.45      0.27      0.34       110
           3       0.65      0.56      0.60       108
           4       0.17      0.15      0.16        60
           5       0.57      0.61      0.59       597

    accuracy                           0.48      1231
   macro avg       0.42      0.39      0.40      1231
weighted avg       0.48      0.48      0.48      1231




## T-SOFTMAX

In [138]:
lower_stack=nn.Sequential(
            nn.Linear(1792, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, 64)
        )
# Initialize method
epsilon = torchosr.utils.base.get_softmax_epsilon(OTHER_LABEL)
model = TSoftmax(lower_stack=lower_stack, n_known=OTHER_LABEL, epsilon=epsilon)
model.to(device)

writer = SummaryWriter()

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

results = []
best_result = 0
for epoch in tqdm(range(50)):
    model.train(train_dataloader, loss_fn, optimizer)
    
    # Test
    model.to("cpu")
    result  = model.test(val_dataloader, loss_fn)[-1].item()
    model.to(device)

    results.append(result)
    writer.add_scalar("OSR/T-softmax", result, epoch)


100%|██████████| 50/50 [01:09<00:00,  1.38s/it]


In [150]:
predict_and_report_torchosr(model, test_dataloader)

Accuracy: 0.4281072298943948

Confusion matrix:
[[ 75  18   5   7   2  64]
 [ 20  76  14   4   0  71]
 [  3  24  51   1   0  31]
 [  6   5   0  66   0  31]
 [ 11   6   2   3  17  21]
 [127  86  38  24  80 242]]

Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.44      0.36       171
           1       0.35      0.41      0.38       185
           2       0.46      0.46      0.46       110
           3       0.63      0.61      0.62       108
           4       0.17      0.28      0.21        60
           5       0.53      0.41      0.46       597

    accuracy                           0.43      1231
   macro avg       0.41      0.44      0.42      1231
weighted avg       0.46      0.43      0.44      1231




In [149]:
# learn pytorch basic with some basic models and datasets
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html
# https://pytorch.org/tutorials/beginner/basics/nnqs_tutorial.html