In [4]:
# Install pinned PyTorch / torchvision / torchaudio wheels from the CUDA 11.8 (cu118) package index.
# Uses the %pip magic so the packages go into the environment of the running kernel;
# the kernel may need a restart before the updated packages can be imported.
%pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.2.0
  Downloading https://download.pytorch.org/whl/cu118/torch-2.2.0%2Bcu118-cp38-cp38-win_amd64.whl (2704.3 MB)
     ---------------------------------------- 2.7/2.7 GB 524.4 kB/s eta 0:00:00
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\aryan\\AppData\\Local\\Temp\\pip-req-tracker-m952_l8b\\a83478dd290d62d39538920c63624021a7eac9b6fabc8ef5c6de234f'

You should consider upgrading via the 'c:\Users\aryan\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [1]:
# Sanity check: report whether the installed PyTorch build can see a CUDA device.
import torch
print(torch.cuda.is_available())

True


In [4]:
# IMPORTS

import os
# %pip install moviepy
from moviepy.video.io.VideoFileClip import VideoFileClip
import librosa
import librosa.display
import re
from collections import Counter, defaultdict
import torch
from torch.utils.data import Dataset
from torchvision import models, transforms
# Choose the compute device once for the whole notebook:
# CUDA when available, otherwise Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
import os
import time
import numpy as np
from tqdm import tqdm
from PIL import Image
# %pip install -U sentence-transformers==2.5.1
from sentence_transformers import SentenceTransformer

Using device: cuda


In [5]:
class IemocapDataset(Dataset):
    """IEMOCAP utterances as (sentence embedding, spectrogram feature vector, label id).

    Construction does all the heavy lifting up front:

    1. walks ``Session1`` .. ``Session5`` and pairs every transcribed utterance
       with its majority categorical label and its ``.wav`` file,
    2. encodes all utterance texts with the ``multi-qa-mpnet-base-dot-v1``
       SentenceTransformer,
    3. renders a mel spectrogram PNG and a log spectrogram PNG for every audio
       file that has not been rendered yet (resumable through log files),
    4. builds a ResNet-50 feature extractor (final layer removed) that
       ``__getitem__`` applies to the mel spectrogram image.

    Relies on the notebook-level ``device`` variable for model/tensor placement.

    Args:
        iemocap_dataset_full_path: Folder that contains ``IEMOCAP_full_release``.
        iemocap_spectrogram_dir: Mel spectrogram output folder, relative to the
            parent of the current working directory.
        iemocap_log_spectrogram_dir: Log spectrogram output folder, relative to
            the parent of the current working directory.
        is_closed_label_set_flag: Stored on the instance; not consulted by any
            code in this class at present.
        split: Stored on the instance; not consulted by any code in this class.
        transform: Optional callable; see the note in ``__getitem__``.
    """

    # Labels treated as the "closed" set by is_label_a_closed_label().
    CLOSED_LABELS = ("Frustration", "Excited", "Neutral state", "Anger", "Sadness", "Happiness")

    # Pause (seconds) after every file the batch spectrogram jobs attempt to render.
    THROTTLE_DELAY_SECONDS = 1

    def __init__(self, iemocap_dataset_full_path, iemocap_spectrogram_dir, iemocap_log_spectrogram_dir, is_closed_label_set_flag, split, transform=None):
        self.IEMOCAP_MAIN_FOLDER = os.path.join(iemocap_dataset_full_path, "IEMOCAP_full_release")
        self.TRANSCRIPTION_FOLDER = os.path.join("dialog", "transcriptions")
        self.AUDIO_FOLDER = os.path.join("sentences", "wav")
        self.CATEGORICAL_LABELS_PATH = os.path.join("dialog", "EmoEvaluation", "Categorical")
        self.split = split
        self.transform = transform
        self.is_closed_label_set_flag = is_closed_label_set_flag
        self.iemocap_spectrogram_dir = iemocap_spectrogram_dir
        self.iemocap_log_spectrogram_dir = iemocap_log_spectrogram_dir

        # These four are filled in as a side effect of create_dataset().
        self.errors = defaultdict(int)
        self.unique_labels = []
        self.audio_files = []
        self.sentences_list = []
        self.dataset = self.create_dataset()
        # Integer class id == position of the label in unique_labels.
        self.labels_to_int = {label: i for i, label in enumerate(self.unique_labels)}

        self.sbert = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device=device)
        self.sentence_embeddings = self.sbert.encode(self.sentences_list, convert_to_tensor=True, show_progress_bar=True, batch_size=128, device=device)

        self.create_spectrograms(self.iemocap_spectrogram_dir)
        self.create_log_spectrograms(self.iemocap_log_spectrogram_dir)
        self.print_summary()

        # `pretrained=True` is deprecated in torchvision >= 0.13; IMAGENET1K_V1 is
        # the weight set that flag used to select, so the features are unchanged.
        self.resnet_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        # Drop the last child module so the extractor outputs pooled features
        # instead of class scores.
        self.feature_extractor = torch.nn.Sequential(*list(self.resnet_model.children())[:-1]).to(device)
        self.feature_extractor.eval()

    def get_evaluator_filenames_with_video_file_prefix(self, input_list, prefix_value):
        """Return the names in ``input_list`` that start with ``prefix_value`` and end in ``.txt``.

        The prefix is matched literally (it is regex-escaped); input order is kept.
        """
        # Raw f-string: `\.` must reach the regex engine as an escaped dot rather
        # than being treated as an (invalid) Python string escape.
        regex_pattern = re.compile(rf'^{re.escape(prefix_value)}.*\.txt$')
        return [s for s in input_list if regex_pattern.match(s)]

    def get_utterance_to_evaluationCounter_mapping_from_evaluation_files(self, evaluation_files):
        """Map each utterance id to its most frequent label across ``evaluation_files``.

        Each non-empty line is expected to start with the utterance id followed by
        labels written as ``:Label;``. Labels from all files are pooled per
        utterance and the most common one wins (ties go to the label seen first).
        Utterances for which no label was found in any file are left out of the
        result instead of raising.

        Args:
            evaluation_files: Paths of the evaluator text files to read.

        Returns:
            dict mapping utterance id -> majority label string.
        """
        utterance_to_all_evaluations = defaultdict(list)

        for evaluation_file in evaluation_files:
            # Within one file a repeated utterance id keeps only its last line.
            utterance_to_evaluationList = {}
            with open(evaluation_file, 'r') as f:
                for evaluation in f.read().split("\n"):
                    evaluation = evaluation.strip()
                    if len(evaluation) == 0:
                        continue
                    # Capture every ":Label;" chunk, then strip the ':' and ';'.
                    labels = [match[1:-1] for match in re.findall(r':[^;]+;', evaluation)]
                    utterance_to_evaluationList[evaluation.split()[0]] = labels

            for key, value_list in utterance_to_evaluationList.items():
                utterance_to_all_evaluations[key].extend(value_list)

        # `if v` guards most_common(1)[0] against utterances without any label.
        return {k: Counter(v).most_common(1)[0][0] for k, v in utterance_to_all_evaluations.items() if v}

    def is_label_a_closed_label(self, evaluation):
        """Return True when ``evaluation`` is one of the six closed-set labels."""
        return evaluation in self.CLOSED_LABELS

    def create_dataset(self):
        """Collect ``(text, audio_path, label)`` for every usable utterance.

        An utterance is kept when its transcription line parses, a majority label
        exists for it and its ``.wav`` file is present on disk. Directory listings
        are sorted so that the dataset order, and therefore the label -> int
        mapping derived from ``unique_labels``, does not depend on filesystem
        enumeration order.

        Side effects: appends to ``self.audio_files``, ``self.sentences_list`` and
        ``self.unique_labels``; increments counters in ``self.errors``.

        Returns:
            list of ``(text, audio_file_full_path, evaluation)`` tuples.
        """
        dataset = []
        for session_num in range(1, 6):
            session_dir = os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}")
            transcription_dir = os.path.join(session_dir, self.TRANSCRIPTION_FOLDER)
            categorical_labels_folder_full_path = os.path.join(session_dir, self.CATEGORICAL_LABELS_PATH)
            # Listed lazily and only once per session instead of once per transcript.
            categorical_label_filenames = None

            for transcription_filename in sorted(os.listdir(transcription_dir)):
                # Skip hidden files.
                if transcription_filename[0] == ".":
                    continue

                if categorical_label_filenames is None:
                    categorical_label_filenames = sorted(os.listdir(categorical_labels_folder_full_path))

                filename_without_extension = transcription_filename.split(".")[0]
                evaluation_filenames = self.get_evaluator_filenames_with_video_file_prefix(categorical_label_filenames, filename_without_extension)
                evaluation_files_full_paths_for_this_file = [os.path.join(categorical_labels_folder_full_path, f) for f in evaluation_filenames]
                evaluations_per_utterance = self.get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files_full_paths_for_this_file)

                transcription_file_full_path = os.path.join(transcription_dir, transcription_filename)
                with open(transcription_file_full_path, 'r') as f:
                    lines = f.read().split("\n")

                # Every utterance line looks like:
                # Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.
                for line in lines:
                    line = line.strip()
                    if len(line) == 0:
                        # A blank line must not end parsing of the whole file.
                        continue

                    # A well-formed line contains a space, a ']' and a '-'.
                    try:
                        space_idx = line.index(" ")
                        timestampEndBracket_idx = line.index("]")
                        line.index("-")
                    except ValueError:
                        self.errors["Problematic Transcription Line"] += 1
                        continue

                    audio_filename = line[:space_idx]            # utterance id == audio file stem
                    text = line[timestampEndBracket_idx + 3:]    # skip the "]: " separator
                    evaluation = evaluations_per_utterance.get(audio_filename, "KEY_ERROR")
                    if evaluation == "KEY_ERROR":
                        self.errors["Unavailable Label for an utterance"] += 1
                        continue

                    # Folder is the utterance id without its last "_xxx" part
                    # (Ses01F_impro01 from Ses01F_impro01_F000). Split the id itself,
                    # not the whole line, so underscores in the spoken text cannot
                    # corrupt the folder name; rpartition never raises.
                    utterance_audios_per_video_folder = audio_filename.rpartition("_")[0]
                    audio_file_full_path = os.path.join(session_dir, self.AUDIO_FOLDER, utterance_audios_per_video_folder, audio_filename + ".wav")

                    if os.path.isfile(audio_file_full_path):
                        self.audio_files.append(audio_file_full_path)
                        self.sentences_list.append(text)
                        dataset.append((text, audio_file_full_path, evaluation))
                        if evaluation not in self.unique_labels:
                            self.unique_labels.append(evaluation)
        return dataset

    def print_summary(self):
        """Print every error counter collected while building the dataset."""
        print("SUMMARY:\n")
        for k, v in self.errors.items():
            print(f"{k}: {v}")

    def _convert_pending_audio_files(self, render_fn, relative_output_dir, processed_log_name, error_log_name, description):
        """Run ``render_fn(audio_path, output_dir)`` for every not-yet-processed audio file.

        Successfully rendered paths are appended to a "processed" log under
        ``<parent of cwd>/iemocap/log_dir`` so a later run skips them; failures are
        printed and appended to an error log, and the loop continues.

        Args:
            render_fn: Callable that renders one audio file into ``output_dir``.
            relative_output_dir: Output folder relative to the parent of the cwd.
            processed_log_name: File name of the processed-files log.
            error_log_name: File name of the error log.
            description: Text inserted into the completion message.
        """
        project_root = os.path.dirname(os.getcwd())
        log_dir = os.path.join(project_root, 'iemocap', 'log_dir')
        output_dir = os.path.join(project_root, relative_output_dir)
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        log_file_path = os.path.join(log_dir, processed_log_name)
        error_log_path = os.path.join(log_dir, error_log_name)

        processed_files = set()
        if os.path.exists(log_file_path):
            with open(log_file_path, 'r') as file:
                processed_files = set(file.read().splitlines())

        processed_files_count = 0
        for filepath in tqdm(self.audio_files):
            if not filepath.endswith(".wav") or filepath in processed_files:
                continue
            try:
                render_fn(filepath, output_dir)
                processed_files.add(filepath)
                processed_files_count += 1
                with open(log_file_path, 'a') as log_file:
                    log_file.write(f"{filepath}\n")
            except Exception as e:
                # Best effort: record the failure and move on to the next file.
                print(f"Error processing {filepath}: {e}")
                with open(error_log_path, 'a') as error_log:
                    error_log.write(f"{filepath}: {e}\n")
            finally:
                # NOTE(review): a fixed pause follows every attempted file; its
                # purpose is not evident from this file - confirm it is still needed.
                time.sleep(self.THROTTLE_DELAY_SECONDS)

        print(f"Batch conversion completed for {description}. Processed {processed_files_count} files.")

    def create_spectrograms(self, iemocap_spectrogram_dir):
        """Render a mel spectrogram PNG for every audio file not rendered before.

        Args:
            iemocap_spectrogram_dir: Output folder relative to the parent of the cwd.
        """
        def create_spectrogram(audio_file_path, output_dir):
            # Explicit ".png" so the file name matches what __getitem__ opens,
            # independent of matplotlib's default save format.
            output_file_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_file_path))[0] + ".png")
            y, sr = librosa.load(audio_file_path)
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
            S_dB = librosa.power_to_db(S, ref=np.max)
            plt.figure(figsize=(10, 4))
            try:
                librosa.display.specshow(S_dB, sr=sr, fmax=8000)
                plt.tight_layout()
                plt.savefig(output_file_path)
            finally:
                # Close the figure even when rendering fails so figures do not pile up.
                plt.close()

        self._convert_pending_audio_files(
            create_spectrogram,
            iemocap_spectrogram_dir,
            'processed_files_spectrogram.log',
            'error_files_spectrogram.log',
            "spectrograms",
        )

    def create_log_spectrograms(self, iemocap_log_spectrogram_dir):
        """Render a log spectrogram PNG for every audio file not rendered before.

        Args:
            iemocap_log_spectrogram_dir: Output folder relative to the parent of the cwd.
        """
        def log_specgram(audio, sample_rate, window_size=20,
                        step_size=10, eps=1e-10):
            # window_size and step_size are in milliseconds; note that step_size
            # is passed to scipy as the overlap length (noverlap).
            nperseg = int(round(window_size * sample_rate / 1e3))
            noverlap = int(round(step_size * sample_rate / 1e3))
            freqs, times, spec = signal.spectrogram(audio, fs=sample_rate, window='hann',
                                                    nperseg=nperseg, noverlap=noverlap, detrend=False)
            # eps keeps log() finite where the spectrogram is exactly zero.
            return freqs, np.log(spec.T.astype(np.float32) + eps)

        def process_audio_file(filepath, output_dir):
            sample_rate, audio = wavfile.read(filepath)
            if audio.ndim > 1:
                # Average the channels of multi-channel audio.
                audio = audio.mean(axis=1)
            _, spectrogram = log_specgram(audio, sample_rate)
            plt.figure(figsize=(10, 4))
            try:
                plt.xticks([])
                plt.yticks([])
                plt.imshow(spectrogram.T, aspect='auto', origin='lower')
                plt.tight_layout()
                plt.savefig(os.path.join(output_dir, os.path.splitext(os.path.basename(filepath))[0] + ".png"))
            finally:
                # Close the figure even when rendering fails so figures do not pile up.
                plt.close()

        self._convert_pending_audio_files(
            process_audio_file,
            iemocap_log_spectrogram_dir,
            'processed_files_log_spectrogram.log',
            'error_files_log_spectrogram.log',
            "log spectrograms",
        )

    def preprocess_img(self, img):
        """Resize to 256, center-crop to 224, convert to a tensor and move it to ``device``."""
        # NOTE(review): no mean/std normalisation is applied before the pretrained
        # ResNet - confirm that this is intended.
        preprocessor = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ])
        img_t = preprocessor(img).to(device)
        return img_t

    def extract_audio_features_from_spectrogram(self, img):
        """Run ``img`` through the ResNet feature extractor without tracking gradients."""
        with torch.no_grad():
            output = self.feature_extractor(img)
        return output

    def __len__(self):
        """Number of utterances in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(text_embedding, spectrogram_features, label_id)`` for item ``idx``.

        The mel spectrogram PNG of the utterance is loaded, preprocessed and passed
        through the ResNet feature extractor on every call; the result is
        flattened to a 2048-element vector. Only the mel spectrogram image is read
        here, not the log spectrogram.
        """
        _, audio, label = self.dataset[idx]

        text = self.sentence_embeddings[idx]

        spectrogram_path = os.path.join(os.path.dirname(os.getcwd()), self.iemocap_spectrogram_dir, os.path.splitext(os.path.basename(audio))[0] + ".png")
        # Context manager closes the image file as soon as the tensor is built.
        with Image.open(spectrogram_path) as spectrogram_img:
            spectrogram_data = self.preprocess_img(spectrogram_img)
        spectrogram_data = spectrogram_data[0:3, :, :]      # keep the first three channels only
        spectrogram_data = spectrogram_data.unsqueeze(0)    # add a batch dimension
        spectrogram_data = self.extract_audio_features_from_spectrogram(spectrogram_data)
        spectrogram_data = spectrogram_data.view(-1, 2048)[0]

        label = self.labels_to_int[label]

        if self.transform:
            # NOTE(review): the transform receives the audio file path and its
            # result is not part of the returned tuple - confirm the intent.
            audio = self.transform(audio)
        return text, spectrogram_data, label

In [6]:
# Read all files from D:\Projects\open-set-emotion-recognition\IEMOCAP_full_release\IEMOCAP_full_release\SessionX\dialog\transcriptions (Note the SessionX)

# In each of these files, you will get => Video file name, Timestamp, Text of each utterance

# Get the audio clip based on the video file name and timestamp. Save the audio file in a folder and return librosa audio, text

# To get the emotion label: there are multiple evaluator files in D:\Projects\open-set-emotion-recognition\IEMOCAP_full_release\IEMOCAP_full_release\Session1\dialog\EmoEvaluation\Categorical
# for one video file, so read all the ".txt" files corresponding to that particular video file and take the majority label.

In [7]:
# The dataset release is expected in "IEMOCAP_full_release", next to (not inside)
# the current working directory.
IEMOCAP_FULL_PATH = os.path.join(os.path.dirname(os.getcwd()),"IEMOCAP_full_release")

# Building the dataset is the expensive step: the constructor parses all sessions,
# encodes every utterance text with SBERT and renders any spectrogram images that
# are not yet listed in the processed-files logs.
# The two spectrogram directories are resolved relative to the parent of the cwd.
iemocapDataset = IemocapDataset(iemocap_dataset_full_path=IEMOCAP_FULL_PATH,
                                iemocap_spectrogram_dir=os.path.join("iemocap","spectrogram"),
                                iemocap_log_spectrogram_dir=os.path.join("iemocap","log_spectrogram"),
                                is_closed_label_set_flag=True,
                                split=None,
                                transform=None)

Batches: 100%|██████████| 79/79 [00:15<00:00,  5.15it/s]
100%|██████████| 10039/10039 [00:00<00:00, 2515479.89it/s]


Batch conversion completed for spectrograms. Processed 0 files.


100%|██████████| 10039/10039 [00:00<00:00, 3356446.22it/s]


Batch conversion completed for log spectrograms. Processed 0 files.
SUMMARY:

Problematic Transcription Line: 152
Unavailable Label for an utterance: 48


In [8]:
# Number of utterances that have both a majority label and an audio file on disk.
print(len(iemocapDataset))

10039


In [9]:
# Fetch sample 0 once: every __getitem__ call loads the spectrogram image and runs
# the ResNet feature extractor, so indexing three times would triple that work.
text_embedding, spectrogram_features, label_id = iemocapDataset[0]
print(text_embedding.shape, spectrogram_features.shape, label_id)

torch.Size([768]) torch.Size([2048]) 0


In [10]:
# Label names in first-seen order; a label's position in this list is its integer class id.
print(iemocapDataset.unique_labels)

['Neutral state', 'Frustration', 'Anger', 'Sadness', 'Happiness', 'Surprise', 'Excited', 'Fear', 'Other', 'Disgust']


In [None]:
# closed_iemo = ClosedIEMO()
# open_iemo = OpenIEMO()

# train_dataset, test_dataset = torch.utils.data.random_split(closed_iemo, [0.8, 0.2])

# val_dataset, test_dataset = torch.utils.data.random_split(test_dataset, [0.5, 0.5])

# val_dataset2, test_dataset2 = torch.utils.data.random_split(open_iemo, [0.5, 0.5])

# val = val_dataset + val_dataset2
# test = test_dataset + test_dataset2


