In [None]:
# Install conda into the Colab runtime via condacolab.
# condacolab.install() restarts the kernel once it is done (see the
# "Restarting kernel..." line in the output), so run this cell on its own
# before executing the rest of the notebook.
!pip install -q condacolab
import condacolab
condacolab.install()

⏬ Downloading https://github.com/conda-forge/miniforge/releases/download/23.1.0-1/Mambaforge-23.1.0-1-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:15
🔁 Restarting kernel...


In [None]:
# Sanity check: conda should be callable after the kernel restart.
!conda --version

conda 23.1.0


In [None]:
# Show which conda executable is found on PATH.
!which conda

/usr/local/bin/conda


In [None]:
# Mount Google Drive at /content/drive; the data and project paths used by
# the later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, sys
import glob
import pickle
import numpy as np
import pandas as pd
import cv2
from scipy.io import wavfile
from tqdm import tqdm



def read_video(file_name):
    """Decode every frame of a video file into memory.

    Args:
        file_name: Path of the video; passed straight to ``cv2.VideoCapture``.

    Returns:
        ``(frames, fps)`` where ``frames`` is ``np.stack`` of all decoded
        frames in reading order and ``fps`` is the frame rate reported by the
        capture object.

    Raises:
        ValueError: If not a single frame could be decoded (for example the
            file does not exist or cannot be opened).
    """
    vidcap = cv2.VideoCapture(file_name)
    try:
        # Read FPS. Only the major version decides which constant exists, so
        # parse just the first component instead of unpacking exactly three
        # parts (which fails on version strings with more or fewer fields).
        major_ver = int(cv2.__version__.split('.')[0])
        if major_ver < 3:
            fps = vidcap.get(cv2.cv.CV_CAP_PROP_FPS)
        else:
            fps = vidcap.get(cv2.CAP_PROP_FPS)

        # Read image data until the capture reports no more frames
        images = []
        success, image = vidcap.read()
        while success:
            images.append(image)
            success, image = vidcap.read()
    finally:
        # Always free the underlying decoder / file handle
        vidcap.release()

    if not images:
        # np.stack([]) would raise a ValueError anyway; give it a useful message
        raise ValueError(f'No frames could be read from {file_name!r}')
    return np.stack(images), fps

def parse_evaluation_transcript(eval_lines, transcript_lines):
    """Merge evaluation lines and transcript lines into per-utterance metadata.

    Args:
        eval_lines: Lines of an emotion-evaluation file. Only lines starting
            with ``[`` are used; they are expected to be tab-separated as
            ``[start - end]<TAB>utterance_id<TAB>label...``.
        transcript_lines: Lines of the matching transcript, each expected to
            look like ``utterance_id [...]: text``. Lines whose first
            space-separated word contains no ``_`` are ignored.

    Returns:
        dict mapping utterance id to a dict with the keys ``start_time``,
        ``end_time``, ``speaker_id``, ``label`` and -- when a transcript line
        for that utterance exists -- ``text``.
    """
    metadata = {}

    # Parse Evaluation
    for line in eval_lines:
        if not line.startswith('['):
            continue
        tokens = line.strip().split('\t')
        # tokens[0] looks like "[6.2901 - 8.2357]": strip the brackets, then
        # the space-separated pieces are start, "-", end.
        time_tokens = tokens[0][1:-1].split(' ')
        start_time, end_time = float(time_tokens[0]), float(time_tokens[2])
        uttr_id, label = tokens[1], tokens[2]

        metadata[uttr_id] = {
            'start_time': start_time,
            'end_time': end_time,
            # Speaker information: the part of the id before the first underscore
            'speaker_id': uttr_id.split('_')[0],
            # Keep the parsed label instead of silently discarding it
            'label': label,
        }

    # Parse Transcript
    for line in transcript_lines:
        # Split on the FIRST colon only so that colons inside the spoken text
        # do not truncate it.
        tokens = line.split(':', 1)
        uttr_id = tokens[0].split(' ')[0]
        if '_' not in uttr_id:
            continue
        text = tokens[-1].strip()

        if uttr_id in metadata:
            metadata[uttr_id]['text'] = text
        else:
            # Transcript entry without a matching evaluation entry: report and skip
            print(f'KeyError: {uttr_id}')
    return metadata


def retrieve_audio(signal, sr, start_time, end_time):
    """Slice the samples lying between two timestamps out of ``signal``.

    Each timestamp is multiplied by the sample rate ``sr`` and truncated with
    ``int`` to obtain the slice bounds (start inclusive, end exclusive).

    Returns:
        ``(segment, sr)`` -- the sliced signal and the sample rate, unchanged.
    """
    first, last = (int(sr * moment) for moment in (start_time, end_time))
    return signal[first:last], sr

def retrieve_video(frames, fps, start_time, end_time):
    """Slice the frames lying between two timestamps out of ``frames``.

    Each timestamp is multiplied by ``fps`` and truncated with ``int`` to get
    the bounds along the first axis; the remaining three axes are taken whole,
    so ``frames`` has to be indexable with four indices.

    Returns:
        ``(images, fps)`` -- the selected frames and the frame rate, unchanged.
    """
    frame_range = slice(int(fps * start_time), int(fps * end_time))
    return frames[frame_range, :, :, :], fps

def dump_image_audio(uttr_id, audio_segment, sr, img_segment, img_segment_L, img_segment_R, fps, out_path='./', grayscale=False):
    """Write one utterance's audio and left/right frames to disk.

    Everything is stored in
    ``<out_path>/<first two '_'-separated fields of uttr_id>/<uttr_id>/``:
    ``audio.wav`` (the segment as given), ``audio_L.wav`` / ``audio_R.wav``
    (columns 0 and 1 of the segment) and one ``image_L_<i>.jpg`` /
    ``image_R_<i>.jpg`` pair per frame.

    Args:
        uttr_id: Utterance identifier; also used as the leaf directory name.
        audio_segment: 2-D sample array; columns 0 and 1 are written as the
            L and R files, so at least two channels are required.
        sr: Sample rate handed to ``wavfile.write``.
        img_segment: Un-split frames; only its frame count (``shape[0]``) is used.
        img_segment_L: 4-D stack of left-half frames, indexed by frame.
        img_segment_R: 4-D stack of right-half frames, indexed by frame.
        fps: Not used by this function; kept so existing callers keep working.
        out_path: Root output directory (relative or absolute).
        grayscale: When true, frames are converted with ``rgb2gray`` before
            being written.
    """
    # Join the pieces WITHOUT a leading './': prefixing it turned an absolute
    # out_path (e.g. a mounted Drive folder) into a path relative to the
    # current working directory, so files landed in the wrong place.
    target_dir = os.path.join(out_path, '_'.join(uttr_id.split('_')[:2]), uttr_id)
    os.makedirs(target_dir, exist_ok=True)

    wavfile.write(os.path.join(target_dir, 'audio.wav'), sr, audio_segment)
    wavfile.write(os.path.join(target_dir, 'audio_L.wav'), sr, audio_segment[:, 0])
    wavfile.write(os.path.join(target_dir, 'audio_R.wav'), sr, audio_segment[:, 1])

    for i in range(img_segment.shape[0]):
        imgL = img_segment_L[i, :, :, :]
        imgR = img_segment_R[i, :, :, :]
        if grayscale:
            # cv2.imwrite treats 3-channel images as BGR, so the frames handled
            # here are BGR; reverse the channel axis so rgb2gray applies its
            # R/G/B weights to the right channels.
            imgL = rgb2gray(imgL[:, :, ::-1])
            imgR = rgb2gray(imgR[:, :, ::-1])
        cv2.imwrite(os.path.join(target_dir, f'image_L_{i}.jpg'), imgL)
        cv2.imwrite(os.path.join(target_dir, f'image_R_{i}.jpg'), imgR)

def rgb2gray(rgb):
    """Collapse the last axis of an (H, W, 3) image into a weighted luminance.

    Channels 0, 1 and 2 are taken as red, green and blue and combined with the
    weights 0.2989, 0.5870 and 0.1140 respectively.

    Returns:
        The 2-D weighted sum ``0.2989 * R + 0.5870 * G + 0.1140 * B``.
    """
    red_weight, green_weight, blue_weight = 0.2989, 0.5870, 0.1140
    red = rgb[:, :, 0]
    green = rgb[:, :, 1]
    blue = rgb[:, :, 2]
    return red_weight * red + green_weight * green + blue_weight * blue

def crop(imgs, target_size=224):
    """Center-crop a stack of frames to ``target_size`` x ``target_size``.

    Args:
        imgs: 4-D array laid out as (frames, height, width, channels),
            e.g. shape (180, 480, 360, 3).
        target_size: Edge length of the square crop.

    Returns:
        The cropped view of ``imgs``. An axis that is already no larger than
        ``target_size`` is returned in full.
    """
    _, h, w, _ = imgs.shape
    # Clamp at 0 so an axis smaller than the target is left untouched
    offset_h = max((h - target_size) // 2, 0)
    offset_w = max((w - target_size) // 2, 0)
    # Use an explicit end index: the `a:-a` form yields an EMPTY slice when
    # a == 0 and one extra row/column when (size - target_size) is odd.
    return imgs[:, offset_h:offset_h + target_size, offset_w:offset_w + target_size, :]


# Process multimodal data over all sessions
# NOTE: This might take several hours to run, the time listed on this cell is for processing 5 label files
output_path =  '/content/drive/MyDrive/sample_folder_preprocess/IEMOCAP_PREPROCESS'

# exist_ok makes the cell safe to re-run without a separate existence check
os.makedirs(output_path, exist_ok=True)

all_metas = {}
for base_path in glob.glob('/content/drive/MyDrive/sample folder preprocess*'):
    # Derive the input folders from the matched base_path. Hard-coding them
    # made every glob match re-process the very same folder.
    avi_path = f'{base_path}/avi'
    script_path = f'{base_path}/transcriptions'
    wav_path = f'{base_path}/wav'
    label_path = f'{base_path}/Emo_evaluation'

    for eval_fname in tqdm(glob.glob(f'{label_path}/*.txt')):
        # The video, audio and transcript files share the label file's base name
        eval_basename = os.path.basename(eval_fname)
        stem = os.path.splitext(eval_basename)[0]
        avi_fname = f'{avi_path}/{stem}.avi'
        wav_fname = f'{wav_path}/{stem}.wav'
        script_fname = f'{script_path}/{eval_basename}'

        # Context managers close the text files as soon as they are read
        with open(eval_fname) as eval_file:
            eval_lines = eval_file.readlines()
        with open(script_fname) as script_file:
            transcript_lines = script_file.readlines()
        sr, signal  = wavfile.read(wav_fname)

        images, fps = read_video(avi_fname)

        # Retrieve uttr_id, label, time, and transcript
        metas = parse_evaluation_transcript(eval_lines, transcript_lines)

        for uttr_id, metadata in metas.items():
            # Retrieve and Store Audio
            audio_segment, sr = retrieve_audio(signal, sr, metadata['start_time'], metadata['end_time'])
            metadata['sr'] = sr

            # Retrieve the frames and split each one into its left / right half
            img_segment, fps = retrieve_video(images, fps, metadata['start_time'], metadata['end_time'])
            half_width = img_segment.shape[2] // 2
            img_segment_L = crop(img_segment[:, :, :half_width, :])
            img_segment_R = crop(img_segment[:, :, half_width:, :])
            metadata['fps'] = fps

            dump_image_audio(uttr_id, audio_segment, sr, img_segment, img_segment_L, img_segment_R, fps, out_path=output_path)

        # Update all metas
        all_metas.update(metas)

# Persist the combined metadata; the with-block guarantees the file is flushed and closed
with open(f'{output_path}/meta.pkl', 'wb') as meta_file:
    pickle.dump(all_metas, meta_file)

  0%|          | 0/1 [00:00<?, ?it/s]

{'Ses01F_impro01_F000': {'start_time': 6.2901, 'end_time': 8.2357, 'speaker_id': 'Ses01F', 'text': 'Excuse me.'}, 'Ses01F_impro01_F001': {'start_time': 10.01, 'end_time': 11.3925, 'speaker_id': 'Ses01F', 'text': 'Yeah.'}, 'Ses01F_impro01_F002': {'start_time': 14.8872, 'end_time': 18.0175, 'speaker_id': 'Ses01F', 'text': 'Is there a problem?'}, 'Ses01F_impro01_F003': {'start_time': 19.29, 'end_time': 20.7875, 'speaker_id': 'Ses01F', 'text': 'You did.'}, 'Ses01F_impro01_F004': {'start_time': 21.3257, 'end_time': 24.74, 'speaker_id': 'Ses01F', 'text': 'You were standing at the beginning and you directed me.'}, 'Ses01F_impro01_F005': {'start_time': 27.46, 'end_time': 31.49, 'speaker_id': 'Ses01F', 'text': "Well what's the problem?  Let me change it."}, 'Ses01F_impro01_F006': {'start_time': 38.965, 'end_time': 43.59, 'speaker_id': 'Ses01F', 'text': "What?  I'm getting an ID.  This is why I'm here.  My wallet was stolen."}, 'Ses01F_impro01_F007': {'start_time': 46.58, 'end_time': 52.19, 'spe

100%|██████████| 1/1 [00:26<00:00, 26.05s/it]


In [None]:
# Pin the versions that were resolved in the recorded run so the environment
# is reproducible; %pip installs into the environment of the running kernel.
%pip install -q torch==2.0.1 torchaudio==2.0.2 torchvision==0.15.2 transformers==4.30.2 facenet-pytorch==2.5.3

Collecting torch
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio
  Downloading torchaudio-2.0.2-cp310-cp310-manylinux1_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision
  Downloading torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting facenet-pytorch
  Downloading facenet_pytorch-2.5.3-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Fetch the SparseConvNet sources into the current working directory.
# NOTE(review): git refuses to clone into an existing non-empty directory, so
# re-running this cell in the same runtime reports an error — confirm that is acceptable.
!git clone https://github.com/facebookresearch/SparseConvNet

Cloning into 'SparseConvNet'...
remote: Enumerating objects: 1928, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 1928 (delta 23), reused 25 (delta 10), pack-reused 1880[K
Receiving objects: 100% (1928/1928), 913.46 KiB | 10.26 MiB/s, done.
Resolving deltas: 100% (1372/1372), done.


In [None]:
# Use the absolute path (the location shown in the recorded output) so that
# re-running this cell does not depend on the current working directory.
%cd /content/SparseConvNet

/content/SparseConvNet


In [None]:
# Build and install SparseConvNet through the repository's develop.sh script.
# NOTE(review): the recorded output starts with "No CUDA runtime is found" —
# confirm a GPU runtime is selected if the CUDA build is required.
!bash develop.sh

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
running develop
running egg_info
creating sparseconvnet.egg-info
writing sparseconvnet.egg-info/PKG-INFO
writing dependency_links to sparseconvnet.egg-info/dependency_links.txt
writing top-level names to sparseconvnet.egg-info/top_level.txt
writing manifest file 'sparseconvnet.egg-info/SOURCES.txt'
reading manifest file 'sparseconvnet.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file 'sparseconvnet.egg-info/SOURCES.txt'
running build_ext
building 'sparseconvnet.SCN' extension
creating build
creating build/temp.linux-x86_64-cpython-310
creating build/temp.linux-x86_64-cpython-310/sparseconvnet
creating build/temp.linux-x86_64-cpython-310/sparseconvnet/SCN
gcc -pthread -B /usr/local/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /usr/local/include -fPIC -O2 -isystem /usr/local/include -fPIC -I/content/SparseConvNet/sparseconvnet/SCN/ -I/usr/local/lib/python

In [None]:
# Switch to the project checkout on Drive so that the relative `main.py`
# invocation further down resolves.
%cd /content/drive/MyDrive/Multimodal-End2end-Sparse/


/content/drive/MyDrive/Multimodal-End2end-Sparse


In [None]:
# Pin the versions that were resolved in the recorded run for reproducibility;
# %pip installs into the environment of the running kernel.
%pip install -q tabulate==0.9.0
%pip install -q scikit-learn==1.3.0


Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
[0mCollecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
Collecting joblib>=1.1.1
  Downloading joblib-1.3.1-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.5.0
  Downloading scipy-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, j

In [None]:
# Called without arguments, main.py only prints its usage text: the options
# shown without brackets (-bs, -lr, -ep) are required.
!python /content/drive/MyDrive/Multimodal-End2end-Sparse/main.py

usage: main.py
       [-h]
       -bs
       BATCH_SIZE
       -lr
       LEARNING_RATE
       [-wd WEIGHT_DECAY]
       -ep
       EPOCHS
       [-es EARLY_STOP]
       [-cu CUDA]
       [-cl CLIP]
       [-sc]
       [-se SEED]
       [--loss LOSS]
       [--optim OPTIM]
       [--text-lr-factor TEXT_LR_FACTOR]
       [-mo MODEL]
       [--text-model-size TEXT_MODEL_SIZE]
       [--fusion FUSION]
       [--feature-dim FEATURE_DIM]
       [-st SPARSE_THRESHOLD]
       [-hfcs HFC_SIZES [HFC_SIZES ...]]
       [--trans-dim TRANS_DIM]
       [--trans-nlayers TRANS_NLAYERS]
       [--trans-nheads TRANS_NHEADS]
       [-aft AUDIO_FEATURE_TYPE]
       [--num-emotions NUM_EMOTIONS]
       [--img-interval IMG_INTERVAL]
       [--hand-crafted]
       [--text-max-len TEXT_MAX_LEN]
       [--datapath DATAPATH]
       [--dataset DATASET]
       [-mod MODALITIES]
       [--valid]
       [--test]
       [--ckpt CKPT]
       [--ckpt-mod CKPT_MOD]
       [-dr DROPOUT]
       [-nl NUM_LAYERS]
       [

In [None]:
# Launch training with the text, audio and visual modalities (-mod=tav).
# NOTE(review): the recorded run aborts while loading the data with
# KeyError: 'Ses04F_script01_1_F024' — the metadata being read has no entry
# for that utterance. Presumably --datapath must point at a complete
# preprocessed dataset (it is not passed here) — confirm.
# NOTE(review): --cuda=3 presumably selects a GPU index; verify it matches the
# devices actually available in this runtime.
!python main.py -lr=5e-5 -ep=40 -mod=tav -bs=8 --img-interval=500 --early-stop=6 --loss=bce --cuda=3 --model=mme2e --num-persons=10 --trans-dim=64 --trans-nlayers=4 --trans-nheads=4 --text-lr-factor=10 --text-model-size=base --text-max-len=100

Start loading the data....
Traceback (most recent call last):
  File "/content/drive/MyDrive/Multimodal-End2end-Sparse/main.py", line 36, in <module>
    train_dataset = get_dataset_iemocap(data_folder=args['datapath'], phase='train',
  File "/content/drive/MyDrive/Multimodal-End2end-Sparse/src/datasets.py", line 25, in get_dataset_iemocap
    texts = [meta[uttr_id]['text'] for uttr_id in uttr_ids]
  File "/content/drive/MyDrive/Multimodal-End2end-Sparse/src/datasets.py", line 25, in <listcomp>
    texts = [meta[uttr_id]['text'] for uttr_id in uttr_ids]
KeyError: 'Ses04F_script01_1_F024'
