# CMU Mutlimodalsdk - Exploration dataset and Baseline

In [3]:
import os
import re
from typing import Optional

# path to the SDK folder
SDK_PATH: Optional[str] = None
# path to the folder where you want to store data
DATA_PATH: Optional[str] = '.src/datasets/CMU-MultimodalSDK/data/'
# path to a pretrained word embedding file
WORD_EMB_PATH: Optional[str] = None
# path to loaded word embedding matrix and corresponding word2id mapping
CACHE_PATH: Optional[str] = '.src/datasets/CMU-MultimodalSDK/data/embedding_and_mapping.pt'
!ls

LICENSE       __init_.py    librerias.txt next_steps.md [34mwandb[m[m
LICENSE.txt   [31mclean.sh[m[m      [34mmmsdk[m[m         optim.std
README.md     [34mexamples[m[m      model.std     [34mrelated_repos[m[m


In [2]:
# path to the folder where you want to store data
os.chdir('./src/datasets/CMU-MultimodalSDK')

In [6]:
!ls
import sys
import mmsdk
from mmsdk import mmdatasdk as md
from subprocess import check_call, CalledProcessError

LICENSE       __init_.py    librerias.txt next_steps.md [34mwandb[m[m
LICENSE.txt   [31mclean.sh[m[m      [34mmmsdk[m[m         optim.std
README.md     [34mexamples[m[m      model.std     [34mrelated_repos[m[m


In [4]:
if SDK_PATH is None:
    print("SDK path is not specified! Please specify first in constants/paths.py")
    exit(0)
else:
    sys.path.append(SDK_PATH)

# create folders for storing the data
if not os.path.exists(DATA_PATH):
    check_call(' '.join(['mkdir', '-p', DATA_PATH]), shell=True)

LICENSE       README.md     [31mclean.sh[m[m      [34mmmsdk[m[m         [34mrelated_repos[m[m
LICENSE.txt   __init_.py    [34mexamples[m[m      next_steps.md


SDK path is not specified! Please specify first in constants/paths.py


: 

## Download data (Only neccesary to do it one time)

In [4]:
# MOSI DATASET
DATASET = md.cmu_mosi

try:
    md.mmdataset(DATASET.highlevel, DATA_PATH)
except RuntimeError:
    print("High-level features have been downloaded previously.")

try:
    md.mmdataset(DATASET.raw, DATA_PATH)
except RuntimeError:
    print("Raw data have been downloaded previously.")

try:
    md.mmdataset(DATASET.labels, DATA_PATH)
except RuntimeError:
    print("Labels have been downloaded previously.")

!wget https://github.com/Justin1904/CMU-MultimodalSDK-Tutorials/blob/master/data/CMU_MOSI_ModifiedTimestampedWords.csd -O .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_ModifiedTimestampedWords.csd
data_files = os.listdir(DATA_PATH)

print("Downloaded data: ",'\n'.join(data_files))


[94m[1m[2023-09-22 09:51:16.977] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/language/CMU_MOSI_TimestampedWordVectors.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_TimestampedWordVectors.csd...


                                                                     

[92m[1m[2023-09-22 09:52:32.837] | Success | [0mDownload complete!
[92m[1m[2023-09-22 09:52:32.846] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_TimestampedWordVectors.csd ...
[94m[1m[2023-09-22 09:52:32.863] | Status  | [0mChecking the integrity of the <glove_vectors> computational sequence ...
[94m[1m[2023-09-22 09:52:32.863] | Status  | [0mChecking the format of the data in <glove_vectors> computational sequence ...


                                                                   

[92m[1m[2023-09-22 09:52:32.891] | Success | [0m<glove_vectors> computational sequence data in correct format.
[94m[1m[2023-09-22 09:52:32.891] | Status  | [0mChecking the format of the metadata in <glove_vectors> computational sequence ...
[94m[1m[2023-09-22 09:52:33.138] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/visual/CMU_MOSI_Visual_Facet_41.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_Facet_41.csd...


                                                                     

[92m[1m[2023-09-22 09:53:56.671] | Success | [0mDownload complete!
[92m[1m[2023-09-22 09:53:56.675] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_Facet_41.csd ...
[94m[1m[2023-09-22 09:53:56.692] | Status  | [0mChecking the integrity of the <FACET_4.1> computational sequence ...
[94m[1m[2023-09-22 09:53:56.692] | Status  | [0mChecking the format of the data in <FACET_4.1> computational sequence ...


                                                                   

[92m[1m[2023-09-22 09:53:56.726] | Success | [0m<FACET_4.1> computational sequence data in correct format.
[94m[1m[2023-09-22 09:53:56.726] | Status  | [0mChecking the format of the metadata in <FACET_4.1> computational sequence ...
[94m[1m[2023-09-22 09:53:56.987] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/visual/CMU_MOSI_Visual_Facet_42.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_Facet_42.csd...


                                                                     

[92m[1m[2023-09-22 09:55:02.927] | Success | [0mDownload complete!
[92m[1m[2023-09-22 09:55:02.929] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_Facet_42.csd ...
[94m[1m[2023-09-22 09:55:02.941] | Status  | [0mChecking the integrity of the <FACET_4.2> computational sequence ...
[94m[1m[2023-09-22 09:55:02.941] | Status  | [0mChecking the format of the data in <FACET_4.2> computational sequence ...


                                                                   

[92m[1m[2023-09-22 09:55:02.974] | Success | [0m<FACET_4.2> computational sequence data in correct format.
[94m[1m[2023-09-22 09:55:02.974] | Status  | [0mChecking the format of the metadata in <FACET_4.2> computational sequence ...
[94m[1m[2023-09-22 09:55:03.239] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/acoustic/CMU_MOSI_OpenSmile_EB10.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_OpenSmile_EB10.csd...


                                                                    

[92m[1m[2023-09-22 09:55:19.269] | Success | [0mDownload complete!
[92m[1m[2023-09-22 09:55:19.270] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_OpenSmile_EB10.csd ...
[94m[1m[2023-09-22 09:55:19.274] | Status  | [0mChecking the integrity of the <OpenSmile_emobase2010> computational sequence ...
[94m[1m[2023-09-22 09:55:19.274] | Status  | [0mChecking the format of the data in <OpenSmile_emobase2010> computational sequence ...


                                                                   

[92m[1m[2023-09-22 09:55:19.295] | Success | [0m<OpenSmile_emobase2010> computational sequence data in correct format.
[94m[1m[2023-09-22 09:55:19.295] | Status  | [0mChecking the format of the metadata in <OpenSmile_emobase2010> computational sequence ...
[94m[1m[2023-09-22 09:55:19.551] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/acoustic/CMU_MOSI_openSMILE_IS09.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_openSMILE_IS09.csd...


                                                                        

[92m[1m[2023-09-22 10:05:27.038] | Success | [0mDownload complete!
[92m[1m[2023-09-22 10:05:27.040] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_openSMILE_IS09.csd ...
[94m[1m[2023-09-22 10:05:27.052] | Status  | [0mChecking the integrity of the <b'OpenSMILE'> computational sequence ...
[94m[1m[2023-09-22 10:05:27.052] | Status  | [0mChecking the format of the data in <b'OpenSMILE'> computational sequence ...


                                                                   

[92m[1m[2023-09-22 10:05:27.072] | Success | [0m<b'OpenSMILE'> computational sequence data in correct format.
[94m[1m[2023-09-22 10:05:27.072] | Status  | [0mChecking the format of the metadata in <b'OpenSMILE'> computational sequence ...
[94m[1m[2023-09-22 10:05:27.340] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/visual/CMU_MOSI_Visual_OpenFace_1.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_OpenFace_1.csd...


                                                                       

[92m[1m[2023-09-22 10:17:06.541] | Success | [0mDownload complete!
[92m[1m[2023-09-22 10:17:06.545] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_OpenFace_1.csd ...
[94m[1m[2023-09-22 10:17:06.561] | Status  | [0mChecking the integrity of the <OpenFace_1> computational sequence ...
[94m[1m[2023-09-22 10:17:06.561] | Status  | [0mChecking the format of the data in <OpenFace_1> computational sequence ...


                                                                   

[92m[1m[2023-09-22 10:17:06.596] | Success | [0m<OpenFace_1> computational sequence data in correct format.
[94m[1m[2023-09-22 10:17:06.596] | Status  | [0mChecking the format of the metadata in <OpenFace_1> computational sequence ...
[94m[1m[2023-09-22 10:17:06.977] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/visual/CMU_MOSI_Visual_OpenFace_2.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_OpenFace_2.csd...


                                                                       

[92m[1m[2023-09-22 10:29:44.959] | Success | [0mDownload complete!
[92m[1m[2023-09-22 10:29:44.974] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_OpenFace_2.csd ...
[94m[1m[2023-09-22 10:29:44.991] | Status  | [0mChecking the integrity of the <OpenFace_2> computational sequence ...
[94m[1m[2023-09-22 10:29:44.991] | Status  | [0mChecking the format of the data in <OpenFace_2> computational sequence ...


                                                                   

[92m[1m[2023-09-22 10:29:45.011] | Success | [0m<OpenFace_2> computational sequence data in correct format.
[94m[1m[2023-09-22 10:29:45.011] | Status  | [0mChecking the format of the metadata in <OpenFace_2> computational sequence ...
[94m[1m[2023-09-22 10:29:45.266] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/acoustic/CMU_MOSI_COVAREP.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_COVAREP.csd...


                                                                       

[92m[1m[2023-09-22 10:36:35.141] | Success | [0mDownload complete!
[92m[1m[2023-09-22 10:36:35.145] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_COVAREP.csd ...
[94m[1m[2023-09-22 10:36:35.159] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2023-09-22 10:36:35.159] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                   

[92m[1m[2023-09-22 10:36:35.194] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2023-09-22 10:36:35.194] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2023-09-22 10:36:35.194] | Success | [0mDataset initialized successfully ... 
[94m[1m[2023-09-22 10:36:35.457] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/language/CMU_MOSI_TimestampedWords.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_TimestampedWords.csd...


                                                                 

[92m[1m[2023-09-22 10:36:44.206] | Success | [0mDownload complete!
[92m[1m[2023-09-22 10:36:44.207] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_TimestampedWords.csd ...
[94m[1m[2023-09-22 10:36:44.212] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2023-09-22 10:36:44.212] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                   

[92m[1m[2023-09-22 10:36:44.241] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2023-09-22 10:36:44.241] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[94m[1m[2023-09-22 10:36:44.530] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/language/CMU_MOSI_TimestampedPhones.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_TimestampedPhones.csd...


                                                                   

[92m[1m[2023-09-22 10:36:47.505] | Success | [0mDownload complete!
[92m[1m[2023-09-22 10:36:47.506] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_TimestampedPhones.csd ...
[94m[1m[2023-09-22 10:36:47.513] | Status  | [0mChecking the integrity of the <phoneme> computational sequence ...
[94m[1m[2023-09-22 10:36:47.513] | Status  | [0mChecking the format of the data in <phoneme> computational sequence ...


                                                                   

[92m[1m[2023-09-22 10:36:47.542] | Success | [0m<phoneme> computational sequence data in correct format.
[94m[1m[2023-09-22 10:36:47.542] | Status  | [0mChecking the format of the metadata in <phoneme> computational sequence ...
[92m[1m[2023-09-22 10:36:47.542] | Success | [0mDataset initialized successfully ... 
[94m[1m[2023-09-22 10:36:47.802] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/labels/CMU_MOSI_Opinion_Labels.csd to .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Opinion_Labels.csd...


                                                                

[92m[1m[2023-09-22 10:36:48.798] | Success | [0mDownload complete!
[92m[1m[2023-09-22 10:36:48.799] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Opinion_Labels.csd ...
[94m[1m[2023-09-22 10:36:48.805] | Status  | [0mChecking the integrity of the <Opinion Segment Labels> computational sequence ...
[94m[1m[2023-09-22 10:36:48.805] | Status  | [0mChecking the format of the data in <Opinion Segment Labels> computational sequence ...


                                                                   

[92m[1m[2023-09-22 10:36:48.833] | Success | [0m<Opinion Segment Labels> computational sequence data in correct format.
[94m[1m[2023-09-22 10:36:48.833] | Status  | [0mChecking the format of the metadata in <Opinion Segment Labels> computational sequence ...
[92m[1m[2023-09-22 10:36:48.833] | Success | [0mDataset initialized successfully ... 




In [7]:
data_files = os.listdir(DATA_PATH)

print("Downloaded data: ",'\n'.join(data_files))

Downloaded data:  CMU_MOSI_ModifiedTimestampedWords.csd
CMU_MOSI_OpenSmile_EB10.csd
CMU_MOSI_openSMILE_IS09.csd
CMU_MOSI_Opinion_Labels.csd
CMU_MOSI_TimestampedWords.csd
CMU_MOSI_Visual_OpenFace_2.csd
CMU_MOSI_TimestampedWordVectors.csd
CMU_MOSI_Visual_OpenFace_1.csd
CMU_MOSI_Visual_Facet_41.csd
CMU_MOSI_TimestampedPhones.csd
CMU_MOSI_Visual_Facet_42.csd
CMU_MOSI_COVAREP.csd


## Aligned modalities

In [8]:
visual_field = 'CMU_MOSI_Visual_Facet_41'
acoustic_field = 'CMU_MOSI_COVAREP'
#text_field = 'CMU_MOSI_TimestampedWords'
text_field = 'CMU_MOSI_ModifiedTimestampedWords'


features = [
    text_field,
    visual_field,
    acoustic_field
]

recipe = {feat: os.path.join(DATA_PATH, feat) + '.csd' for feat in features}
dataset = md.mmdataset(recipe)

[92m[1m[2023-09-24 17:58:56.575] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_ModifiedTimestampedWords.csd ...
[94m[1m[2023-09-24 17:58:56.587] | Status  | [0mChecking the integrity of the <b'CMU_MOSI_ModifiedTimestampedWords'> computational sequence ...
[94m[1m[2023-09-24 17:58:56.587] | Status  | [0mChecking the format of the data in <b'CMU_MOSI_ModifiedTimestampedWords'> computational sequence ...


                                                                   

[92m[1m[2023-09-24 17:58:56.621] | Success | [0m<b'CMU_MOSI_ModifiedTimestampedWords'> computational sequence data in correct format.
[94m[1m[2023-09-24 17:58:56.621] | Status  | [0mChecking the format of the metadata in <b'CMU_MOSI_ModifiedTimestampedWords'> computational sequence ...
[92m[1m[2023-09-24 17:58:56.622] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Visual_Facet_41.csd ...
[94m[1m[2023-09-24 17:58:56.634] | Status  | [0mChecking the integrity of the <FACET_4.1> computational sequence ...
[94m[1m[2023-09-24 17:58:56.634] | Status  | [0mChecking the format of the data in <FACET_4.1> computational sequence ...


                                                                   

[92m[1m[2023-09-24 17:58:56.662] | Success | [0m<FACET_4.1> computational sequence data in correct format.
[94m[1m[2023-09-24 17:58:56.662] | Status  | [0mChecking the format of the metadata in <FACET_4.1> computational sequence ...
[92m[1m[2023-09-24 17:58:56.663] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_COVAREP.csd ...
[94m[1m[2023-09-24 17:58:56.672] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2023-09-24 17:58:56.672] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                   

[92m[1m[2023-09-24 17:58:56.695] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2023-09-24 17:58:56.695] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2023-09-24 17:58:56.695] | Success | [0mDataset initialized successfully ... 




In [9]:
print("=" * 80)
print(f"Dataset keys: {list(dataset.keys())}")
print("=" * 80)

print(f"IDs: {list(dataset[visual_field].keys())[:5]}")
print("=" * 80)

some_id = list(dataset[visual_field].keys())[15]
print(f"Into IDs: {list(dataset[visual_field][some_id].keys())}")
print(f"Shape: {list(dataset[visual_field][some_id]['intervals'].shape)}")
print("=" * 80)

print(f"Shape visual features: {list(dataset[visual_field][some_id]['features'].shape)}")
print(f"Shape text features: {list(dataset[text_field][some_id]['features'].shape)}")
print(f"Shape acoustic features: {list(dataset[acoustic_field][some_id]['features'].shape)}")
print("Different modalities have different number of time steps!")

Dataset keys: ['CMU_MOSI_ModifiedTimestampedWords', 'CMU_MOSI_Visual_Facet_41', 'CMU_MOSI_COVAREP']
IDs: ['03bSnISJMiM', '0h-zjBukYpk', '1DmNV9C1hbY', '1iG0909rllw', '2WGyTLYerpo']
Into IDs: ['features', 'intervals']
Shape: [5404, 2]
Shape visual features: [5404, 47]
Shape text features: [658, 1]
Shape acoustic features: [18009, 74]
Different modalities have different number of time steps!


In [10]:
import numpy as np
def avg(intervals: np.array, features: np.array) -> np.array:
    try:
        return np.average(features, axis=0)
    except:
        return features
dataset.align(text_field, collapse_functions=[avg])


[94m[1m[2023-09-24 17:59:03.319] | Status  | [0mUnify was called ...
[92m[1m[2023-09-24 17:59:03.319] | Success | [0mUnify completed ...
[94m[1m[2023-09-24 17:59:03.319] | Status  | [0mPre-alignment based on <CMU_MOSI_ModifiedTimestampedWords> computational sequence started ...
[94m[1m[2023-09-24 17:59:07.908] | Status  | [0mPre-alignment done for <CMU_MOSI_COVAREP> ...
[94m[1m[2023-09-24 17:59:08.852] | Status  | [0mPre-alignment done for <CMU_MOSI_Visual_Facet_41> ...
[94m[1m[2023-09-24 17:59:08.872] | Status  | [0mAlignment starting ...


                                                                                              

[92m[1m[2023-09-24 17:59:34.413] | Success | [0mAlignment to <CMU_MOSI_ModifiedTimestampedWords> complete.
[94m[1m[2023-09-24 17:59:34.413] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2023-09-24 17:59:34.425] | Success | [0mInitialized empty <CMU_MOSI_ModifiedTimestampedWords> computational sequence.
[94m[1m[2023-09-24 17:59:34.425] | Status  | [0mChecking the format of the data in <CMU_MOSI_ModifiedTimestampedWords> computational sequence ...


                                                                      

[92m[1m[2023-09-24 17:59:34.476] | Success | [0m<CMU_MOSI_ModifiedTimestampedWords> computational sequence data in correct format.
[94m[1m[2023-09-24 17:59:34.476] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_ModifiedTimestampedWords> computational sequence ...
[92m[1m[2023-09-24 17:59:34.476] | Success | [0mInitialized empty <CMU_MOSI_Visual_Facet_41> computational sequence.
[94m[1m[2023-09-24 17:59:34.476] | Status  | [0mChecking the format of the data in <CMU_MOSI_Visual_Facet_41> computational sequence ...


                                                                      

[92m[1m[2023-09-24 17:59:34.517] | Success | [0m<CMU_MOSI_Visual_Facet_41> computational sequence data in correct format.
[94m[1m[2023-09-24 17:59:34.517] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Visual_Facet_41> computational sequence ...
[92m[1m[2023-09-24 17:59:34.518] | Success | [0mInitialized empty <CMU_MOSI_COVAREP> computational sequence.
[94m[1m[2023-09-24 17:59:34.518] | Status  | [0mChecking the format of the data in <CMU_MOSI_COVAREP> computational sequence ...


                                                                      

[92m[1m[2023-09-24 17:59:34.556] | Success | [0m<CMU_MOSI_COVAREP> computational sequence data in correct format.
[94m[1m[2023-09-24 17:59:34.556] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_COVAREP> computational sequence ...


In [11]:
label_field = 'CMU_MOSI_Opinion_Labels'
label_recipe = {label_field: os.path.join(DATA_PATH, label_field+'.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)
dataset.align(label_field)

[92m[1m[2023-09-24 17:59:34.599] | Success | [0mComputational sequence read from file .src/datasets/CMU-MultimodalSDK/data/CMU_MOSI_Opinion_Labels.csd ...
[94m[1m[2023-09-24 17:59:34.605] | Status  | [0mChecking the integrity of the <Opinion Segment Labels> computational sequence ...
[94m[1m[2023-09-24 17:59:34.605] | Status  | [0mChecking the format of the data in <Opinion Segment Labels> computational sequence ...


                                                                   

[92m[1m[2023-09-24 17:59:34.619] | Success | [0m<Opinion Segment Labels> computational sequence data in correct format.
[94m[1m[2023-09-24 17:59:34.619] | Status  | [0mChecking the format of the metadata in <Opinion Segment Labels> computational sequence ...
[94m[1m[2023-09-24 17:59:34.619] | Status  | [0mUnify was called ...
[92m[1m[2023-09-24 17:59:34.651] | Success | [0mUnify completed ...
[94m[1m[2023-09-24 17:59:34.653] | Status  | [0mPre-alignment based on <CMU_MOSI_Opinion_Labels> computational sequence started ...
[94m[1m[2023-09-24 17:59:34.712] | Status  | [0mPre-alignment done for <CMU_MOSI_COVAREP> ...
[94m[1m[2023-09-24 17:59:34.770] | Status  | [0mPre-alignment done for <CMU_MOSI_ModifiedTimestampedWords> ...
[94m[1m[2023-09-24 17:59:34.830] | Status  | [0mPre-alignment done for <CMU_MOSI_Visual_Facet_41> ...
[94m[1m[2023-09-24 17:59:34.832] | Status  | [0mAlignment starting ...


                                                                                              

[92m[1m[2023-09-24 17:59:35.897] | Success | [0mAlignment to <CMU_MOSI_Opinion_Labels> complete.
[94m[1m[2023-09-24 17:59:35.897] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2023-09-24 17:59:35.959] | Success | [0mInitialized empty <CMU_MOSI_ModifiedTimestampedWords> computational sequence.
[94m[1m[2023-09-24 17:59:35.959] | Status  | [0mChecking the format of the data in <CMU_MOSI_ModifiedTimestampedWords> computational sequence ...


                                                                     

[92m[1m[2023-09-24 17:59:35.961] | Success | [0m<CMU_MOSI_ModifiedTimestampedWords> computational sequence data in correct format.
[94m[1m[2023-09-24 17:59:35.961] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_ModifiedTimestampedWords> computational sequence ...
[92m[1m[2023-09-24 17:59:35.961] | Success | [0mInitialized empty <CMU_MOSI_Visual_Facet_41> computational sequence.
[94m[1m[2023-09-24 17:59:35.961] | Status  | [0mChecking the format of the data in <CMU_MOSI_Visual_Facet_41> computational sequence ...


                                                                     

[92m[1m[2023-09-24 17:59:35.963] | Success | [0m<CMU_MOSI_Visual_Facet_41> computational sequence data in correct format.
[94m[1m[2023-09-24 17:59:35.963] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Visual_Facet_41> computational sequence ...
[92m[1m[2023-09-24 17:59:35.963] | Success | [0mInitialized empty <CMU_MOSI_COVAREP> computational sequence.
[94m[1m[2023-09-24 17:59:35.963] | Status  | [0mChecking the format of the data in <CMU_MOSI_COVAREP> computational sequence ...


                                                                     

[92m[1m[2023-09-24 17:59:35.966] | Success | [0m<CMU_MOSI_COVAREP> computational sequence data in correct format.
[94m[1m[2023-09-24 17:59:35.966] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_COVAREP> computational sequence ...
[92m[1m[2023-09-24 17:59:35.966] | Success | [0mInitialized empty <CMU_MOSI_Opinion_Labels> computational sequence.
[94m[1m[2023-09-24 17:59:35.966] | Status  | [0mChecking the format of the data in <CMU_MOSI_Opinion_Labels> computational sequence ...


                                                                     

[92m[1m[2023-09-24 17:59:35.969] | Success | [0m<CMU_MOSI_Opinion_Labels> computational sequence data in correct format.
[94m[1m[2023-09-24 17:59:35.969] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Opinion_Labels> computational sequence ...




In [12]:
# check out what the keys look like now
print(list(dataset[label_field].keys())[55])

1iG0909rllw[3]


## Spliting dataset and normalizing data

In [13]:
# Spliting dataset into train, test and evaluation sets
DATASET = md.cmu_mosi

train_set = DATASET.standard_folds.standard_train_fold
valid_set = DATASET.standard_folds.standard_valid_fold
test_set = DATASET.standard_folds.standard_test_fold

print(f"Shape of training set: {len(train_set)}")
print(f"Shape of validation set: {len(valid_set)}")
print(f"Shape of test set: {len(test_set)}")

Shape of training set: 52
Shape of validation set: 10
Shape of test set: 31


In [14]:
print(test_set)

['tmZoasNr4rU', 'zhpQhgha_KU', 'lXPQBPVc5Cw', 'iiK8YX8oH1E', 'tStelxIAHjw', 'nzpVDcQ0ywM', 'etzxEpPuc6I', 'cW1FSBF59ik', 'd6hH302o4v8', 'k5Y_838nuGo', 'pLTX3ipuDJI', 'jUzDDGyPkXU', 'f_pcplsH_V0', 'yvsjCA6Y5Fc', 'nbWiPyCm4g0', 'rnaNMUZpvvg', 'wMbj6ajWbic', 'cM3Yna7AavY', 'yDtzw_Y-7RU', 'vyB00TXsimI', 'dq3Nf_lMPnE', 'phBUpBr1hSo', 'd3_k5Xpfmik', 'v0zCBqDeKcE', 'tIrG4oNLFzE', 'fvVhgmXxadc', 'ob23OKe5a9Q', 'cXypl4FnoZo', 'vvZ4IcEtiZc', 'f9O3YtZ2VfI', 'c7UH_rxdZv4']


In [15]:
import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook
from collections import defaultdict

# a sentinel epsilon for safe division, without it we will replace illegal values with a constant
EPS = 0

# construct a word2id mapping that automatically takes increment when new words are encountered
word2id = defaultdict(lambda: len(word2id))
UNK = word2id['<unk>']
PAD = word2id['<pad>']

# place holders for the final train/dev/test dataset
train = []
validation = []
test = []

# define a regular expression to extract the video ID out of the keys
pattern = re.compile('(.*)\[.*\]')
num_drop = 0 # a counter to count how many data points went into some processing issues

for segment in dataset[label_field].keys():

    # get the video ID and the features out of the aligned dataset
    vid = re.search(pattern, segment).group(1)
    label = dataset[label_field][segment]['features']
    _words = dataset[text_field][segment]['features']
    _visual = dataset[visual_field][segment]['features']
    _acoustic = dataset[acoustic_field][segment]['features']

    # if the sequences are not same length after alignment, there must be some problem with some modalities
    # we should drop it or inspect the data again
    if not _words.shape[0] == _visual.shape[0] == _acoustic.shape[0]:
        print(f"Encountered datapoint {vid} with text shape {_words.shape}, visual shape {_visual.shape}, acoustic shape {_acoustic.shape}")
        num_drop += 1
        continue

    # remove nan values
    label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    # remove speech pause tokens - this is in general helpful
    # we should remove speech pauses and corresponding visual/acoustic features together
    # otherwise modalities would no longer be aligned
    words = []
    visual = []
    acoustic = []
    for i, word in enumerate(_words):
        if word[0] != b'sp':
            words.append(word2id[word[0].decode('utf-8')]) # SDK stores strings as bytes, decode into strings here
            visual.append(_visual[i, :])
            acoustic.append(_acoustic[i, :])

    words = np.asarray(words)
    visual = np.asarray(visual)
    acoustic = np.asarray(acoustic)

    # z-normalization per instance and remove nan/infs
    visual = np.nan_to_num((visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
    acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))

    if vid in train_set:
        train.append(((words, visual, acoustic), label, segment))
    elif vid in valid_set:
        validation.append(((words, visual, acoustic), label, segment))
    elif vid in test_set:
        test.append(((words, visual, acoustic), label, segment))
    else:
        print(f"Found video that doesn't belong to any splits: {vid}")

print(f"Total number of {num_drop} datapoints have been dropped.")

# turn off the word2id - define a named function here to allow for pickling
def return_unk():
    return UNK
word2id.default_factory = return_unk


  acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))
  x = um.multiply(x, x, out=x)
  visual = np.nan_to_num((visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)


Total number of 0 datapoints have been dropped.


In [16]:
print(80*"=")
print(f'Train set aligned with label added: {len(train)}') 
print(f'Test set aligned with label added: {len(test)}') 
print(f'Validation set aligned with label added: {len(validation)}') 
print(80*"=")
print(f'Shape into sequences: {len(train[0])}') 
print(f'Shape into batch: {len(train[0][0])}') 
print(80*"=")
print(f'Text vector shape: {train[0][0][0].shape}')
print(f'Visual vector shape: {train[0][0][1].shape}')
print(f'Acoustic vector shape: {train[0][0][2].shape}')


Train set aligned with label added: 1283
Test set aligned with label added: 686
Validation set aligned with label added: 229
Shape into sequences: 3
Shape into batch: 3
Text vector shape: (5,)
Visual vector shape: (5, 47)
Acoustic vector shape: (5, 74)


In [17]:
train[0][0]

(array([2, 3, 4, 5, 6]),
 array([[-1.99193895e+00, -1.56130433e+00,  2.24037215e-01,
          2.24037215e-01, -1.30791950e+00, -5.24096489e-01,
          1.08224618e+00,  4.84659851e-01,  6.20539188e-01,
         -8.82908583e-01,  1.75792313e+00,  4.26066786e-01,
         -8.13734174e-01,  1.10976946e+00,  4.64716285e-01,
         -1.12349319e+00, -7.46687233e-01, -5.96176326e-01,
          1.65984178e+00,  6.28019050e-02,  8.35817695e-01,
         -9.04375374e-01,  1.90320683e+00,  2.73384601e-01,
         -7.74063706e-01,  1.00022411e+00,  5.14332838e-02,
         -7.44960308e-01,  3.13304812e-01,  6.73187897e-02,
          8.88872266e-01, -1.79359889e+00, -1.04357433e+00,
          1.11288726e+00, -2.41447568e-01, -5.12846828e-01,
         -1.19174254e+00,  6.11311018e-01, -5.77251315e-01,
         -8.56405020e-01, -1.29991710e+00,  5.23986220e-01,
         -1.32399571e+00, -9.65939343e-01,  1.30997336e+00,
          1.44799495e+00,  9.02126491e-01],
        [ 5.74711084e-01,  1.06

In [18]:
def multi_collate(batch):
    '''
    Collate functions assume batch = [Dataset[i] for i in index_set]
    '''
    # for later use we sort the batch in descending order of length
    batch = sorted(batch, key=lambda x: x[0][0].shape[0], reverse=True)

    # get the data out of the batch - use pad sequence util functions from PyTorch to pad things
    #print([torch.from_numpy(sample[0]) for sample in batch])
    labels = torch.cat([torch.from_numpy(sample[1]) for sample in batch], dim=0)
    visual = pad_sequence([torch.FloatTensor(sample[0][1]) for sample in batch])
    sentences = pad_sequence([torch.LongTensor(sample[0][0]) for sample in batch], padding_value=PAD)
    acoustic = pad_sequence([torch.FloatTensor(sample[0][2]) for sample in batch])

    # lengths are useful later in using RNNs
    lengths = torch.LongTensor([sample[0][0].shape[0] for sample in batch])
    return sentences, visual, acoustic, labels, lengths

# construct dataloaders, dev and test could use around ~X3 times batch size since no_grad is used during eval
batch_sz = 64
train_loader = DataLoader(train, shuffle=True, batch_size=batch_sz, collate_fn=multi_collate)
validation_loader = DataLoader(validation, shuffle=False, batch_size=batch_sz*3, collate_fn=multi_collate)
test_loader = DataLoader(test, shuffle=False, batch_size=batch_sz*3, collate_fn=multi_collate)


# let's create a temporary dataloader just to see how the batch looks like
temp_loader = iter(DataLoader(test, shuffle=True, batch_size=8, collate_fn=multi_collate))
batch = next(temp_loader)


print(80*"=")
print(f"Batch shape: {len(batch)}") # word vectors, padded to maxlen
print(80*"=")
print(f"Text shape: {batch[0].shape}") # word vectors, padded to maxlen
print(f"Visual shape: {batch[1].shape}") # visual features
print(f"Acoustic shape: {batch[2].shape}") # acoustic features
print(80*"=")
print(f"Labels: {batch[3]}") # labels
print(f"Length: {batch[4]}") # lengths
print(80*"=")

Batch shape: 5
Text shape: torch.Size([78, 8])
Visual shape: torch.Size([78, 8, 47])
Acoustic shape: torch.Size([78, 8, 74])
Labels: tensor([[-2.0000],
        [ 2.6000],
        [ 1.4000],
        [-0.4000],
        [ 0.6000],
        [ 2.2000],
        [ 0.8000],
        [ 1.2000]])
Length: tensor([78, 26, 22, 16, 10,  7,  6,  5])


In [19]:
id2word = {v:k for k,v in word2id.items()}
examine_target = train
idx = np.random.randint(0, len(examine_target))
print(' '.join(list(map(lambda x: id2word[x], examine_target[idx][0][0].tolist()))))
# print(' '.join(examine_target[idx][0]))
print(examine_target[idx][1])
print(examine_target[idx][2])

incredible steve carell voices gru and russell brand actually really impressed me with his old man voice for the
[[1.8]]
9J25DZhivz8[9]


## Basic LateFusionLSTM Multimodal model

In [21]:

class LateFusionLSTM(nn.Module): 
    '''
    The LateFusionLSTM class is a PyTorch module that implements a late fusion LSTM 
    model for multimodal data fusion. It takes as input three types of data: text, 
    visual, and acoustic, and combines them using LSTM layers and concatenation. 
    The model then applies fully connected layers to produce the final output.

    - How to use?
        model = LateFusionLSTM(input_sizes=[100, 200, 300], 
                            hidden_sizes=[50, 50, 50], 
                            fc1_size=100, output_size=10, 
                            dropout_rate=0.5)


    '''
    def __init__(self, input_sizes, hidden_sizes, fc1_size, output_size, dropout_rate):
        super(LateFusionLSTM, self).__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.fc1_size = fc1_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        # defining modules - two layer bidirectional LSTM with layer norm in between
        self.embed = nn.Embedding(len(word2id), input_sizes[0])
        self.trnn1 = nn.LSTM(input_sizes[0], hidden_sizes[0], bidirectional=True)
        self.trnn2 = nn.LSTM(2*hidden_sizes[0], hidden_sizes[0], bidirectional=True)

        self.vrnn1 = nn.LSTM(input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = nn.LSTM(2*hidden_sizes[1], hidden_sizes[1], bidirectional=True)

        self.arnn1 = nn.LSTM(input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = nn.LSTM(2*hidden_sizes[2], hidden_sizes[2], bidirectional=True)

        self.fc1 = nn.Linear(sum(hidden_sizes)*4, fc1_size)
        self.fc2 = nn.Linear(fc1_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.tlayer_norm = nn.LayerNorm((hidden_sizes[0]*2,))
        self.vlayer_norm = nn.LayerNorm((hidden_sizes[1]*2,))
        self.alayer_norm = nn.LayerNorm((hidden_sizes[2]*2,))
        self.bn = nn.BatchNorm1d(sum(hidden_sizes)*4)


    def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
        packed_sequence = pack_padded_sequence(sequence, lengths)
        packed_h1, (final_h1, _) = rnn1(packed_sequence)
        padded_h1, _ = pad_packed_sequence(packed_h1)
        normed_h1 = layer_norm(padded_h1)
        packed_normed_h1 = pack_padded_sequence(normed_h1, lengths)
        _, (final_h2, _) = rnn2(packed_normed_h1)
        return final_h1, final_h2


    def fusion(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        sentences = self.embed(sentences)

        # extract features from text modality
        final_h1t, final_h2t = self.extract_features(sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)

        # extract features from visual modality
        final_h1v, final_h2v = self.extract_features(visual, lengths, self.vrnn1, self.vrnn2, self.vlayer_norm)

        # extract features from acoustic modality
        final_h1a, final_h2a = self.extract_features(acoustic, lengths, self.arnn1, self.arnn2, self.alayer_norm)


        # simple late fusion -- concatenation + normalization
        h = torch.cat((final_h1t, final_h2t, final_h1v, final_h2v, final_h1a, final_h2a),
                       dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
        return self.bn(h)

    def forward(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        h = self.fusion(sentences, visual, acoustic, lengths)
        h = self.fc1(h)
        h = self.dropout(h)
        h = self.relu(h)
        o = self.fc2(h)
        return o
    

# BERT LSTM architecture

In [22]:
import torch.nn.functional as F
from transformers import BertModel

class BertLateFusionLSTM(nn.Module):
    def __init__(self, input_sizes, hidden_sizes, fc1_size, output_size, dropout_rate, bert_model):
        super(BertLateFusionLSTM, self).__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.fc1_size = fc1_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        # defining modules - two layer bidirectional LSTM with layer norm in between
        self.bert = BertModel.from_pretrained(bert_model)
        for param in self.bert.parameters():
            param.requires_grad = False
        self.trnn1 = nn.LSTM(self.bert.config.hidden_size, hidden_sizes[0], bidirectional=True)
        self.trnn2 = nn.LSTM(2*hidden_sizes[0], hidden_sizes[0], bidirectional=True)

        self.vrnn1 = nn.LSTM(input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = nn.LSTM(2*hidden_sizes[1], hidden_sizes[1], bidirectional=True)

        self.arnn1 = nn.LSTM(input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = nn.LSTM(2*hidden_sizes[2], hidden_sizes[2], bidirectional=True)

        self.fc1 = nn.Linear(sum(hidden_sizes)*4, fc1_size)
        self.fc2 = nn.Linear(fc1_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.tlayer_norm = nn.LayerNorm((hidden_sizes[0]*2,))
        self.vlayer_norm = nn.LayerNorm((hidden_sizes[1]*2,))
        self.alayer_norm = nn.LayerNorm((hidden_sizes[2]*2,))
        self.bn = nn.BatchNorm1d(sum(hidden_sizes)*4)

    def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
        with torch.no_grad():
            bert_output = self.bert(sequence)[0]
        packed_sequence = pack_padded_sequence(bert_output, lengths)
        packed_h1, (final_h1, _) = rnn1(packed_sequence)
        padded_h1, _ = pad_packed_sequence(packed_h1)
        normed_h1 = layer_norm(padded_h1)
        packed_normed_h1 = pack_padded_sequence(normed_h1, lengths)
        _, (final_h2, _) = rnn2(packed_normed_h1)
        return final_h1, final_h2

    def fusion(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)

        # extract features from text modality
        final_h1t, final_h2t = self.extract_features(sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)

        # extract features from visual modality
        final_h1v, final_h2v = self.extract_features(visual, lengths, self.vrnn1, self.vrnn2, self.vlayer_norm)

        # extract features from acoustic modality
        final_h1a, final_h2a = self.extract_features(acoustic, lengths, self.arnn1, self.arnn2, self.alayer_norm)

        # simple late fusion -- concatenation + normalization
        h = torch.cat((final_h1t, final_h2t, final_h1v, final_h2v, final_h1a, final_h2a),
                       dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
        return self.bn(h)

    def forward(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        h = self.fusion(sentences, visual, acoustic, lengths)
        h = self.fc1(h)
        h = self.dropout(h)
        h = self.relu(h)
        o = self.fc2(h)
        return o

In [23]:
def load_emb(w2i, path_to_embedding, embedding_size=300, embedding_vocab=2196017, init_emb=None):
    if init_emb is None:
        emb_mat = np.random.randn(len(w2i), embedding_size)
    else:
        emb_mat = init_emb
    f = open(path_to_embedding, 'r')
    found = 0
    for line in tqdm_notebook(f, total=embedding_vocab):
        content = line.strip().split()
        vector = np.asarray(list(map(lambda x: float(x), content[-300:])))
        word = ' '.join(content[:-300])
        if word in w2i:
            idx = w2i[word]
            emb_mat[idx, :] = vector
            found += 1
    print(f"Found {found} words in the embedding file.")
    tensor_emb  = torch.tensor(emb_mat).float()
    return tensor_emb

## Utils functions

In [24]:
from sklearn.metrics import accuracy_score
def calculate_accuracy(y_pred, y_true):
    y_pred_bin = (y_pred.detach().cpu().numpy() >= 0)
    y_true_bin = (y_true.detach().cpu().numpy() >= 0)
    return accuracy_score(y_true_bin, y_pred_bin)

In [27]:
from tqdm import tqdm_notebook
from torch.optim import Adam, SGD
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import wandb


dropout_values = [0.4, 0.45]
# Define the model class to use
#name_model = BertLateFusionLSTM
name_model = LateFusionLSTM  # This is the class type, not an instance

for drop_value in dropout_values:
    print(drop_value)
    wandb.init(settings=wandb.Settings(start_method="fork"),project="Baseline-multimodal")

    # Parameters
    torch.manual_seed(123)
    torch.cuda.manual_seed_all(123)

    CUDA = torch.cuda.is_available()
    MAX_EPOCH = 100

    text_size = 300
    visual_size = 47
    acoustic_size = 74

    # define some model settings and hyper-parameters
    input_sizes = [text_size, visual_size, acoustic_size]
    hidden_sizes = [int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5)]
    fc1_size = sum(hidden_sizes) // 2
    dropout = drop_value
    output_size = 1
    curr_patience = patience = 10
    num_trials = 3
    grad_clip_value = 1.3
    weight_decay = 0.15

    # Configurations
    config = wandb.config
    config.text_size = text_size
    config.visual_size = visual_size
    config.acoustic_size = acoustic_size
    config.hidden_sizes = hidden_sizes
    config.dropout = dropout
    config.output_size = output_size
    config.patience = patience
    config.grad_clip_value = grad_clip_value
    config.weight_decay = weight_decay
    config.batch_size = batch_sz

    if os.path.exists(CACHE_PATH):
        pretrained_emb, word2id = torch.load(CACHE_PATH)
    elif WORD_EMB_PATH is not None:
        pretrained_emb = load_emb(word2id, WORD_EMB_PATH)
        torch.save((pretrained_emb, word2id), CACHE_PATH)
    else:
        pretrained_emb = None

    # Create an instance of the model based on the class type
    if name_model == LateFusionLSTM:
        print("Using LateFusionLSTM")
        model = LateFusionLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)
        if pretrained_emb is not None:
            model.embed.weight.data = pretrained_emb
            model.embed.requires_grad = False
    elif name_model == BertLateFusionLSTM:
        print("Using BertLateFusionLSTM")
        model = BertLateFusionLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout, bert_model='bert-base-uncased')

    optimizer = Adam([param for param in model.parameters() if param.requires_grad], weight_decay=weight_decay)


    if CUDA:
        model.cuda()
    criterion = nn.L1Loss(reduction='sum')
    criterion_test = nn.L1Loss(reduction='sum')
    best_valid_loss = float('inf')
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
    lr_scheduler.step() # for some reason it seems the StepLR needs to be stepped once first

    train_losses = []
    valid_losses = []

    # Training 
    for e in range(MAX_EPOCH):
        # TRAINING MOOD
        model.train()
        train_iter = tqdm(train_loader)
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        valid_correct = 0
        valid_total = 0  # Reset valid_total here
        for batch in train_iter:
            model.zero_grad()
            t, v, a, y, l = batch
            batch_size = t.size(0)
            if CUDA:
                t = t.cuda()
                v = v.cuda()
                a = a.cuda()
                y = y.cuda()
                l = l.cuda()
            y_tilde = model(t, v, a, l)
            loss = criterion(y_tilde, y)
            loss.backward()
            torch.nn.utils.clip_grad_value_([param for param in model.parameters() if param.requires_grad], grad_clip_value)
            optimizer.step()
            
            # Calculate accuracy
            accuracy = calculate_accuracy(y_tilde, y)
            train_total += batch_size
            train_correct += accuracy * batch_size
            train_iter.set_description(f"Epoch {e}/{MAX_EPOCH}, current batch loss: {round(loss.item()/batch_size, 4)}")
            train_loss += loss.item()
            wandb.log({"Batch Loss": round(loss.item()/batch_size, 4)})
            wandb.log({"Batch Accuracy": accuracy})


        train_loss = train_loss / len(train)
        train_losses.append(train_loss)
        train_accuracy = train_correct / train_total

        # Training tracks metrics per epoch
        wandb.log({"Training Loss": train_loss})
        wandb.log({"Training Accuracy": train_accuracy})
        wandb.log({"Epoch": e})
        print(f"[-] Training loss: {round(train_loss, 4)}")
        print(f"[-] Training accuracy: {train_accuracy}")

        # VALIDATION MOOD
        model.eval()
        with torch.no_grad():
            valid_loss = 0.0
            for batch in validation_loader:
                model.zero_grad()
                t, v, a, y, l = batch
                if CUDA:
                    t = t.cuda()
                    v = v.cuda()
                    a = a.cuda()
                    y = y.cuda()
                    l = l.cuda()
                y_tilde = model(t, v, a, l)
                loss = criterion(y_tilde, y)
                # calculate accuracy in validation and log in wandb
                valid_total += batch_size  # Reset valid_total at the beginning of each epoch
                valid_loss += loss.item()
                accuracy = calculate_accuracy(y_tilde, y)
                valid_correct += accuracy * batch_size




        valid_loss = valid_loss/len(validation)
        valid_losses.append(valid_loss)
        valid_accuracy = valid_correct / valid_total

        wandb.log({"Validation Accuracy": valid_accuracy})
        wandb.log({"Validation Loss": valid_loss})
        print(f"Validation loss: {round(valid_loss, 4)}")
        print(f"Validation accuracy: {valid_accuracy}")
        print(f"Current patience: {curr_patience}, current trial: {num_trials}.")


        if valid_loss <= best_valid_loss:
            best_valid_loss = valid_loss
            print("Found new best model on dev set!")
            torch.save(model.state_dict(), 'model.std')
            torch.save(optimizer.state_dict(), 'optim.std')
            curr_patience = patience
        else:
            curr_patience -= 1
            if curr_patience <= -1:
                print("Running out of patience, loading previous best model.")
                num_trials -= 1
                curr_patience = patience
                model.load_state_dict(torch.load('model.std'))
                optimizer.load_state_dict(torch.load('optim.std'))
                lr_scheduler.step()
                print(f"Current learning rate: {optimizer.state_dict()['param_groups'][0]['lr']}")

        if num_trials <= 0:
            print("Running out of patience, early stopping.")
            break

    model.load_state_dict(torch.load('model.std'))


    # TEST MODE
    y_true = []
    y_pred = []

    # Test
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        for batch in test_loader:
            model.zero_grad()
            t, v, a, y, l = batch
            if CUDA:
                t = t.cuda()
                v = v.cuda()
                a = a.cuda()
                y = y.cuda()
                l = l.cuda()
            y_tilde = model(t, v, a, l)
            loss = criterion_test(y_tilde, y)
            y_true.append(y_tilde.detach().cpu().numpy())
            y_pred.append(y.detach().cpu().numpy())
            test_loss += loss.item()

    print(f"Test set performance: {test_loss/len(test)}")
    y_true = np.concatenate(y_true, axis=0)
    y_pred = np.concatenate(y_pred, axis=0)

    y_true_bin = y_true >= 0
    y_pred_bin = y_pred >= 0
    bin_acc = accuracy_score(y_true_bin, y_pred_bin)
    wandb.log({"Test Loss": test_loss/len(test), "Test Accuracy": bin_acc})
    print(f"Test set accuracy is {bin_acc}")
    
    wandb.finish()


0.4




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167635188515608, max=1.0…



Using LateFusionLSTM


Epoch 0/100, current batch loss: 2.0095:  90%|█████████ | 19/21 [00:11<00:01,  1.89it/s]

In [71]:
wandb.finish()

In [58]:
dropout_values = [0.4, 0.45, 0.5, 0.55, 0.6]
for drop_value in dropout_values:
    print(drop_value)

0.4
0.45
0.5
0.55
0.6
