In [7]:
# IMPORTS

import os
%pip install moviepy
from moviepy.video.io.VideoFileClip import VideoFileClip
import librosa
import re
from collections import Counter, defaultdict
import torch
from torch.utils.data import Dataset

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\aryan\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [29]:
class IemocapDataset(Dataset):
    """In-memory IEMOCAP dataset yielding ``(text, audio, label)`` samples.

    On construction the class walks ``Session1`` .. ``Session5`` of the
    IEMOCAP release, parses every transcription file, extracts the audio of
    each utterance from the dialog video into ``iemocap_audio_clips_folder_full_path``
    (skipping clips that already exist there), resolves the majority categorical
    label across all evaluator files and loads the clip with ``librosa.load``.

    Args:
        iemocap_dataset_full_path: Folder that contains ``IEMOCAP_full_release``.
        iemocap_audio_clips_folder_full_path: Folder used as a cache for the
            extracted per-utterance ``.wav`` files. Created if missing.
        split: Stored on the instance as ``self.split``; not used by this class.
        transform: Optional callable applied to the audio in ``__getitem__``.

    Attributes:
        errors: ``defaultdict(int)`` counting skipped/problematic items by reason.
        newly_extracted_audio_files_count: Number of clips extracted by this run.
        dataset: List of ``(text, librosa.load(...) result, label)`` tuples.
    """

    # Seconds of audio kept before the start timestamp of every utterance.
    START_PADDING_SECONDS = 0.1

    # One utterance line of a transcription file, e.g.
    #   Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.
    # Groups: utterance name, start seconds, end seconds, transcription text.
    _TRANSCRIPTION_LINE = re.compile(
        r'^(\S+)\s+\[(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)\]:?\s*(.*)$'
    )

    def __init__(self, iemocap_dataset_full_path, iemocap_audio_clips_folder_full_path, split, transform=None):
        self.IEMOCAP_MAIN_FOLDER = os.path.join(iemocap_dataset_full_path, "IEMOCAP_full_release")
        self.IEMOCAP_AUDIO_CLIPS = iemocap_audio_clips_folder_full_path
        self.TRANSCRIPTION_FOLDER = os.path.join("dialog", "transcriptions")
        self.VIDEO_FOLDER = os.path.join("dialog", "avi", "DivX")
        self.CATEGORICAL_LABELS_PATH = os.path.join("dialog", "EmoEvaluation", "Categorical")
        self.split = split
        self.transform = transform

        self.errors = defaultdict(int)
        self.newly_extracted_audio_files_count = 0
        self.dataset = self.create_dataset()
        self.print_summary()

    def extract_audio_slice(self, video_path, start_time, end_time, output_audio_path):
        """Write the audio between ``start_time`` and ``end_time`` of a video to a file.

        Args:
            video_path: Path of the source video.
            start_time: Slice start in seconds.
            end_time: Slice end in seconds.
            output_audio_path: Destination audio file.

        Returns:
            True if the audio file was written, False on any failure.
        """
        output_existed_before = os.path.isfile(output_audio_path)
        video_clip = None
        try:
            # Open the video once; the handle is released in ``finally``.
            video_clip = VideoFileClip(video_path)
            if video_clip.audio is None:
                # Video without an audio track: nothing to extract.
                return False

            audio_slice = video_clip.audio.subclip(start_time, end_time).set_end(end_time - start_time)
            audio_slice.write_audiofile(output_audio_path, verbose=False)
            return True

        except Exception:
            # Extraction is best-effort: any failure is reported through the
            # return value so the caller can count it and move on.
            if not output_existed_before and os.path.isfile(output_audio_path):
                # Do not leave a partially written clip behind; it would be
                # mistaken for a valid cached file on the next run.
                try:
                    os.remove(output_audio_path)
                except OSError:
                    pass
            return False

        finally:
            if video_clip is not None:
                video_clip.close()

    def get_evaluator_filenames_with_video_file_prefix(self, input_list, prefix_value):
        """Return the names in ``input_list`` that start with ``prefix_value`` and end in ``.txt``."""
        regex_pattern = re.compile(rf'^{re.escape(prefix_value)}.*\.txt$')
        return [name for name in input_list if regex_pattern.match(name)]

    def get_utterance_to_evaluationCounter_mapping_from_evaluation_files(self, evaluation_files):
        """Map every utterance to its most frequent label across evaluator files.

        Each non-empty line of an evaluation file starts with the utterance name
        followed by labels written as ``:Label;``. Labels of all files are pooled
        per utterance and the most common one wins (ties resolve to the label
        seen first). Utterances without any label are omitted from the result.

        Args:
            evaluation_files: Paths of the evaluator files to read.

        Returns:
            Dict mapping utterance name to its majority label.
        """
        utterance_to_all_evaluations = defaultdict(list)

        for evaluation_file in evaluation_files:
            # Within one file a repeated utterance keeps only its last line.
            labels_in_this_file = {}
            with open(evaluation_file, 'r') as f:
                for evaluation in f.read().split("\n"):
                    evaluation = evaluation.strip()
                    if not evaluation:
                        continue
                    labels = [match[1:-1] for match in re.findall(r':[^;]+;', evaluation)]
                    labels_in_this_file[evaluation.split()[0]] = labels

            for utterance, labels in labels_in_this_file.items():
                utterance_to_all_evaluations[utterance].extend(labels)

        # ``most_common(1)`` is empty for an utterance with no labels, so skip those.
        return {
            utterance: Counter(labels).most_common(1)[0][0]
            for utterance, labels in utterance_to_all_evaluations.items()
            if labels
        }

    def _parse_transcription_line(self, line):
        """Split a transcription line into (utterance name, start, end, text), or None if malformed."""
        match = self._TRANSCRIPTION_LINE.match(line)
        if match is None:
            return None
        utterance_name, start_text, end_text, text = match.groups()
        # Clamp at 0: a negative start would be interpreted by moviepy's
        # ``subclip`` as an offset from the END of the clip.
        start_time = max(0.0, float(start_text) - self.START_PADDING_SECONDS)
        return utterance_name, start_time, float(end_text), text

    def create_dataset(self):
        """Build the list of ``(text, audio, label)`` samples for all five sessions.

        Side effects: creates the audio clip folder if needed, extracts missing
        clips into it, and updates ``self.errors`` and
        ``self.newly_extracted_audio_files_count``.

        Returns:
            List of ``(text, librosa.load(clip_path), label)`` tuples. Utterances
            without a label or without an audio clip are left out.
        """
        dataset = []

        # Extraction writes into this folder, so make sure it exists up front.
        if self.IEMOCAP_AUDIO_CLIPS:
            os.makedirs(self.IEMOCAP_AUDIO_CLIPS, exist_ok=True)

        for session_num in range(1, 6):
            session_folder = os.path.join(self.IEMOCAP_MAIN_FOLDER, f"Session{session_num}")
            transcription_folder = os.path.join(session_folder, self.TRANSCRIPTION_FOLDER)
            labels_folder = os.path.join(session_folder, self.CATEGORICAL_LABELS_PATH)
            video_folder = os.path.join(session_folder, self.VIDEO_FOLDER)

            transcription_filenames = os.listdir(transcription_folder)
            # Listed once per session instead of once per transcription file.
            label_filenames = os.listdir(labels_folder)

            for transcription_filename in transcription_filenames:
                if transcription_filename.startswith("."):
                    # Hidden file, not a transcription.
                    continue

                dialog_name = transcription_filename.split(".")[0]
                evaluation_files = [
                    os.path.join(labels_folder, name)
                    for name in self.get_evaluator_filenames_with_video_file_prefix(label_filenames, dialog_name)
                ]
                evaluations_per_utterance = self.get_utterance_to_evaluationCounter_mapping_from_evaluation_files(evaluation_files)

                with open(os.path.join(transcription_folder, transcription_filename), 'r') as f:
                    lines = f.read().split("\n")

                for line in lines:
                    line = line.strip()
                    if not line:
                        # Blank line (usually at EOF): skip it, keep reading.
                        continue

                    parsed = self._parse_transcription_line(line)
                    if parsed is None:
                        self.errors["Problematic Transcription Line"] += 1
                        continue
                    utterance_name, start_time, end_time, text = parsed

                    evaluation = evaluations_per_utterance.get(utterance_name)
                    if evaluation is None:
                        self.errors["Unavailable Label for an utterance"] += 1

                    # Only need Ses01F_impro01 from Ses01F_impro01_F000. Derived
                    # from the utterance name alone so underscores in the
                    # transcription text cannot shift the cut.
                    video_filename = utterance_name.rsplit('_', 1)[0]
                    video_file_full_path = os.path.join(video_folder, video_filename + ".avi")

                    audio_output_file_full_path = os.path.join(self.IEMOCAP_AUDIO_CLIPS, utterance_name + ".wav")
                    if not os.path.isfile(audio_output_file_full_path):
                        # Clips already on disk are reused instead of re-extracted.
                        if self.extract_audio_slice(video_file_full_path, start_time, end_time, audio_output_file_full_path):
                            self.newly_extracted_audio_files_count += 1
                        else:
                            self.errors["Problematic Audio Extraction"] += 1

                    if evaluation is not None and os.path.isfile(audio_output_file_full_path):
                        dataset.append((text, librosa.load(audio_output_file_full_path), evaluation))
        return dataset

    def print_summary(self):
        """Print how many clips were extracted and the per-reason error counts."""
        print("SUMMARY:\n")
        print(f"Extracted {self.newly_extracted_audio_files_count} audio files to directory {self.IEMOCAP_AUDIO_CLIPS}")
        for k, v in self.errors.items():
            print(f"{k}: {v}")

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(text, audio, label)`` for ``idx``, with ``transform`` applied to the audio if set."""
        text, audio, label = self.dataset[idx]

        if self.transform:
            audio = self.transform(audio)
        return text, audio, label

In [None]:
# Read all files from D:\Projects\open-set-emotion-recognition\IEMOCAP_full_release\IEMOCAP_full_release\SessionX\dialog\transcriptions (Note the SessionX)

# In each of these files, you will get => Video file name, Timestamp, Text of each utterance

# Get the audio clip based on the video file name and timestamp. Save the audio file in a folder and return librosa audio, text

# To get the emotion label: each video file has several evaluator files in D:\Projects\open-set-emotion-recognition\IEMOCAP_full_release\IEMOCAP_full_release\Session1\dialog\EmoEvaluation\Categorical.
# Read all the ".txt" files corresponding to one particular video file and take the majority label for each utterance.

In [30]:
# Both the IEMOCAP release and the extracted-clip cache are resolved relative
# to the parent of the current working directory.
_parent_dir = os.path.dirname(os.getcwd())
IEMOCAP_FULL_PATH = os.path.join(_parent_dir, "IEMOCAP_full_release")
IEMOCAP_AUDIO_CLIPS = os.path.join(_parent_dir, "audio_clips_iemocap")

# Constructing the dataset builds every sample and prints a summary.
iemocapDataset = IemocapDataset(
    iemocap_dataset_full_path=IEMOCAP_FULL_PATH,
    iemocap_audio_clips_folder_full_path=IEMOCAP_AUDIO_CLIPS,
    split=None,
    transform=None,
)

SUMMARY:

Extracted 0 audio files to directory d:\Projects\open-set-emotion-recognition\audio_clips_iemocap
Problematic Transcription Line: 152
Unavailable Label for an utterance: 48


In [31]:
# Sanity check: show one sample as a (text, audio, label) tuple.
print(iemocapDataset[10])

('This form is a Z.X.four.', (array([ 1.04122525e-02,  1.57994684e-02,  1.45163713e-02, ...,
        5.34654071e-04,  1.43944699e-05, -3.16748337e-04], dtype=float32), 22050), 'Frustration')
