In [1]:
from constants import SDK_PATH, DATA_PATH, WORD_EMB_PATH, CACHE_PATH
import sys
import os
import requests
import numpy as np
from mmsdk import mmdatasdk as md
from subprocess import check_call, CalledProcessError

# Append the SDK path
if SDK_PATH is None:
    print("SDK path is not specified! Please specify first in constants/paths.py")
    exit(0)
else:
    sys.path.append(SDK_PATH)

# Create folder for storing data if it doesn't exist
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH, exist_ok=True)

# Helper function to download a file from a URL
def download_file(url, dest):
    try:
        print(f"Downloading from {url}...")
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded: {dest}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

# Function to get the filename from the URL (last segment)
def get_filename_from_url(url):
    return url.split('/')[-1]  # Extract the last part of the URL

# Download function that iterates over dataset features
def download_dataset_features(feature_dict, feature_type):
    for feature_name, url in feature_dict.items():
        print(f"\nProcessing {feature_type} feature: {feature_name}")
        
        # Use the filename derived from the URL instead of the dictionary key
        filename = get_filename_from_url(url)
        dest_path = os.path.join(DATA_PATH, filename)

        if not os.path.exists(dest_path):
            download_file(url, dest_path)
        else:
            print(f"{feature_type} feature '{feature_name}' already downloaded at {dest_path}")

# Dataset initialization
DATASET = md.cmu_mosei
SETUP = True
# Process high-level, raw, and label features
# try:
#     print("\nStarting download of high-level features...")
#     download_dataset_features(DATASET.highlevel, "High-level")
# except Exception as e:
#     print(f"Error downloading high-level features: {e}")

# try:
#     print("\nStarting download of raw features...")
#     download_dataset_features(DATASET.raw, "Raw")
# except Exception as e:
#     print(f"Error downloading raw features: {e}")

# try:
#     print("\nStarting download of labels...")
#     download_dataset_features(DATASET.labels, "Label")
# except Exception as e:
#     print(f"Error downloading labels: {e}")

In [2]:
# list the directory contents... let's see what features there are
data_files = os.listdir(DATA_PATH)
print('\n'.join(data_files))

CMU_MOSEI_COVAREP.csd
CMU_MOSEI_Labels.csd
CMU_MOSEI_TimestampedWords.csd
CMU_MOSEI_TimestampedWordVectors.csd
CMU_MOSEI_VisualFacet42.csd
glove.6B.300d.txt
GoogleNews-vectors-negative300.bin


We have multiple files which can be broadly classified into three categories, highlevel, raw and labels. 

<strong>Highlevel</strong> contains the extracted features for each modality (e.g OpenFace facial landmarks, openSMILE acoustic features) while <strong>raw</strong> contains the raw transctripts, phonemes. 

We have multiple files with the .csd extension. This stands for <strong>computational sequences</strong>, which is the underlying data structure for all features in the SDK. 

<strong> Highlevel features: </strong>
- CMU_MOSEI_VisualFacet42.csd (Video modality)
- CMU_MOSEI_VisualOpenFace2.csd (Video Modality)
- CMU_MOSEI_COVAREP.csd (Audio Modality)
- CMU_MOSEI_TimestampedWordVectors.csd (Text Modality)

## Loading the data

In [3]:
visual_field = 'CMU_MOSEI_VisualFacet42'
acoustic_field = 'CMU_MOSEI_COVAREP'
word_field = 'CMU_MOSEI_TimestampedWords'
text_field = 'CMU_MOSEI_TimestampedWordVectors'

features = [
    # text_field, 
    visual_field, 
    acoustic_field
]

raw_features = [word_field]

# recipe = {feat: os.path.join(DATA_PATH, feat) + '.csd' for feat in features}
# dataset = md.mmdataset(recipe)

recipe = {feat: os.path.join(DATA_PATH, feat) + '.csd' for feat in features}
recipe[word_field] = os.path.join(DATA_PATH, word_field) + '.csd'
print(recipe)
dataset = md.mmdataset(recipe)


{'CMU_MOSEI_VisualFacet42': './data/CMU_MOSEI_VisualFacet42.csd', 'CMU_MOSEI_COVAREP': './data/CMU_MOSEI_COVAREP.csd', 'CMU_MOSEI_TimestampedWords': './data/CMU_MOSEI_TimestampedWords.csd'}
[92m[1m[2024-11-06 06:06:34.408] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_VisualFacet42.csd ...
[94m[1m[2024-11-06 06:06:37.723] | Status  | [0mChecking the integrity of the <FACET 4.2> computational sequence ...
[94m[1m[2024-11-06 06:06:37.723] | Status  | [0mChecking the format of the data in <FACET 4.2> computational sequence ...


                                                                                 

[92m[1m[2024-11-06 06:06:51.066] | Success | [0m<FACET 4.2> computational sequence data in correct format.
[94m[1m[2024-11-06 06:06:51.066] | Status  | [0mChecking the format of the metadata in <FACET 4.2> computational sequence ...
[92m[1m[2024-11-06 06:06:51.070] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_COVAREP.csd ...
[94m[1m[2024-11-06 06:06:55.748] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2024-11-06 06:06:55.748] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                                 

[92m[1m[2024-11-06 06:07:06.388] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2024-11-06 06:07:06.388] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2024-11-06 06:07:06.395] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWords.csd ...
[94m[1m[2024-11-06 06:07:09.620] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2024-11-06 06:07:09.620] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                                 

[92m[1m[2024-11-06 06:07:20.547] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2024-11-06 06:07:20.547] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[92m[1m[2024-11-06 06:07:20.547] | Success | [0mDataset initialized successfully ... 




To load the dataset, we need to tell the SDK which features we need and where they exist. Thus, we construct a dictionary with format {feature_name: csd_path} and feed it to mmdataset object in the SDK.

From the highlevel features, VisualFacet is used for video modality since this file stores facial expression data extracted using the FACET tool. FACET analyzes microexpressions, including movements of facial muscles (like eyebrow raises or smiles) and emotional states (e.g., joy, anger). Thus, making it more suitable for emotion detection than openFace since it
tracks facial landmarks, head poses, and eye gaze, offering detailed spatial and motion-related facial features across video frames.

COVAREP is used for audio related features and TimeStampedWordVectors provides Pre-trained embeddings using GLoVe capture semantic relationships and contextual meaning between words.

In [None]:
print(list(dataset.keys()))
print("=" * 80)

print(list(dataset[visual_field].keys())[:10])
print("=" * 80)

some_id = list(dataset[visual_field].keys())[15]
print(list(dataset[visual_field][some_id].keys()))
print("=" * 80)

word_id = list(dataset[word_field].keys())[15]
print(list(dataset[word_field][word_id].keys()))
print("=" * 80)
print(dataset[word_field].keys())

print('Intervals')
print(list(dataset[visual_field][some_id]['intervals'].shape))
#print(list(dataset[text_field][some_id]['intervals'].shape))
print(list(dataset[word_field][word_id]['intervals'].shape))
print(list(dataset[acoustic_field][some_id]['intervals'].shape))
print("=" * 80)

print('Features')
print(list(dataset[visual_field][some_id]['features'].shape))
#print(list(dataset[text_field][some_id]['features'].shape))
print(list(dataset[word_field][word_id]['features'].shape))
print(list(dataset[acoustic_field][some_id]['features'].shape))
print("Different modalities have different number of time steps!")

['CMU_MOSEI_VisualFacet42', 'CMU_MOSEI_COVAREP', 'CMU_MOSEI_TimestampedWords']
['--qXJuDtHPw', '-3g5yACwYnA', '-3nNcZdcdvU', '-571d8cVauQ', '-6rXp3zJ3kc', '-9YyBTjo1zo', '-9y-fZ3swSY', '-AUZQgSxyPQ', '-Alixo7euuU', '-Eqdz5y4pEY']
['features', 'intervals']
['features', 'intervals']
dict_keys(['--qXJuDtHPw', '-3g5yACwYnA', '-3nNcZdcdvU', '-571d8cVauQ', '-6rXp3zJ3kc', '-9YyBTjo1zo', '-9y-fZ3swSY', '-AUZQgSxyPQ', '-Alixo7euuU', '-Eqdz5y4pEY', '-HeZS2-Prhc', '-HvKLjmsO5U', '-HwX2H8Z4hY', '-IUUR2yyNbw', '-I_e4mIh0yE', '-IqSFQePnpU', '-KCahx2qBOI', '-LnuDPiuuZw', '-MeTTeMJBNc', '-NFrJFQijFE', '-RfYyzHpjk4', '-RpZEe4w4fY', '-SYSVSQnTnA', '-THoVjtIkeU', '-UUCSKoHeMA', '-UacrmKiTn4', '-UuX1xuaiiE', '-VmheDA92mM', '-WXXTNIJcVM', '-ZgjBOA1Yhw', '-a55Q6RWvTA', '-aNfi7CP8vM', '-aqamKhZ1Ec', '-bl5PfNIYrk', '-cEhr0cQcDM', '-cmk6cfUeMs', '-dZ1TCboxcQ', '-dxfTGcXJoc', '-egA8-b7-3M', '-hPfPhUIzfA', '-hnBHBN8p5A', '-iRBcNs9oI8', '-l_53IwQoj0', '-lqc32Zpr7M', '-lzEya4AM_4', '-m9KtvCk_L8', '-mJ2ud6oKI8', '-

: 

In [None]:
if SETUP:
# we define a simple averaging function that does not depend on intervals
    def avg(intervals: np.array, features: np.array) -> np.array:
        try:
            return np.average(features, axis=0)
        except:
            return features

    # first we align to words with averaging, collapse_function receives a list of functions
    dataset.align(word_field, collapse_functions=[avg])

[94m[1m[2024-11-06 06:07:20.593] | Status  | [0mUnify was called ...
[92m[1m[2024-11-06 06:07:20.604] | Success | [0mUnify completed ...
[94m[1m[2024-11-06 06:07:20.604] | Status  | [0mPre-alignment based on <CMU_MOSEI_TimestampedWords> computational sequence started ...


In [None]:
# if SETUP:
#     deploy_files={x:x for x in dataset.keys()}
#     dataset.deploy("hl1",deploy_files)

In [None]:
label_field = 'CMU_MOSEI_Labels'

# we add and align to lables to obtain labeled segments
# this time we don't apply collapse functions so that the temporal sequences are preserved
label_recipe = {label_field: os.path.join(DATA_PATH, label_field + '.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)
dataset.align(label_field)

[92m[1m[2024-11-04 16:39:33.228] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_Labels.csd ...
[94m[1m[2024-11-04 16:39:33.949] | Status  | [0mChecking the integrity of the <All Labels> computational sequence ...
[94m[1m[2024-11-04 16:39:33.950] | Status  | [0mChecking the format of the data in <All Labels> computational sequence ...


                                                                                  

[92m[1m[2024-11-04 16:39:35.633] | Success | [0m<All Labels> computational sequence data in correct format.
[94m[1m[2024-11-04 16:39:35.633] | Status  | [0mChecking the format of the metadata in <All Labels> computational sequence ...
[94m[1m[2024-11-04 16:39:35.633] | Status  | [0mUnify was called ...
[92m[1m[2024-11-04 16:43:21.861] | Success | [0mUnify completed ...
[94m[1m[2024-11-04 16:43:21.892] | Status  | [0mPre-alignment based on <CMU_MOSEI_Labels> computational sequence started ...
[94m[1m[2024-11-04 16:43:24.335] | Status  | [0mPre-alignment done for <CMU_MOSEI_VisualFacet42> ...
[94m[1m[2024-11-04 16:43:26.565] | Status  | [0mPre-alignment done for <CMU_MOSEI_TimestampedWords> ...
[94m[1m[2024-11-04 16:43:28.880] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2024-11-04 16:43:28.950] | Status  | [0mAlignment starting ...


                                                                                                   

[92m[1m[2024-11-04 16:44:00.940] | Success | [0mAlignment to <CMU_MOSEI_Labels> complete.
[94m[1m[2024-11-04 16:44:00.940] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-11-04 16:44:03.762] | Success | [0mInitialized empty <CMU_MOSEI_VisualFacet42> computational sequence.
[94m[1m[2024-11-04 16:44:03.762] | Status  | [0mChecking the format of the data in <CMU_MOSEI_VisualFacet42> computational sequence ...


                                                                      

[92m[1m[2024-11-04 16:44:03.793] | Success | [0m<CMU_MOSEI_VisualFacet42> computational sequence data in correct format.
[94m[1m[2024-11-04 16:44:03.793] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_VisualFacet42> computational sequence ...
[92m[1m[2024-11-04 16:44:03.793] | Success | [0mInitialized empty <CMU_MOSEI_COVAREP> computational sequence.
[94m[1m[2024-11-04 16:44:03.793] | Status  | [0mChecking the format of the data in <CMU_MOSEI_COVAREP> computational sequence ...


                                                                      

[92m[1m[2024-11-04 16:44:03.821] | Success | [0m<CMU_MOSEI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-11-04 16:44:03.822] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_COVAREP> computational sequence ...
[92m[1m[2024-11-04 16:44:03.822] | Success | [0mInitialized empty <CMU_MOSEI_TimestampedWords> computational sequence.
[94m[1m[2024-11-04 16:44:03.822] | Status  | [0mChecking the format of the data in <CMU_MOSEI_TimestampedWords> computational sequence ...


                                                                      

[92m[1m[2024-11-04 16:44:03.851] | Success | [0m<CMU_MOSEI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-11-04 16:44:03.851] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_TimestampedWords> computational sequence ...
[92m[1m[2024-11-04 16:44:03.851] | Success | [0mInitialized empty <CMU_MOSEI_Labels> computational sequence.
[94m[1m[2024-11-04 16:44:03.851] | Status  | [0mChecking the format of the data in <CMU_MOSEI_Labels> computational sequence ...


                                                                      

[92m[1m[2024-11-04 16:44:03.886] | Success | [0m<CMU_MOSEI_Labels> computational sequence data in correct format.
[94m[1m[2024-11-04 16:44:03.886] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_Labels> computational sequence ...


In [11]:
# Print some keys to see the segmentation structure
print(list(dataset[label_field].keys())[:10])

# Pick a specific segmented key to explore further
some_segmented_key = list(dataset[label_field].keys())[0]

# Check the aligned features and intervals for this segment
print("Label intervals:", dataset[label_field][some_segmented_key]['intervals'].shape)
print("Label features:", dataset[label_field][some_segmented_key]['features'].shape)

print("Text features:", dataset[word_field][some_segmented_key]['features'].shape)
print("Visual features:", dataset[visual_field][some_segmented_key]['features'].shape)
print("Acoustic features:", dataset[acoustic_field][some_segmented_key]['features'].shape)


['--qXJuDtHPw[0]', '-3g5yACwYnA[0]', '-3g5yACwYnA[1]', '-3g5yACwYnA[2]', '-3g5yACwYnA[3]', '-3g5yACwYnA[4]', '-3g5yACwYnA[5]', '-3nNcZdcdvU[0]', '-3nNcZdcdvU[1]', '-3nNcZdcdvU[2]']
Label intervals: (1, 2)
Label features: (1, 7)
Text features: (22, 1)
Visual features: (22, 35)
Acoustic features: (22, 74)


In [12]:
# check out what the keys look like now
print(list(dataset[word_field].keys())[55])

-HwX2H8Z4hY[4]


## Train Test Split

In [14]:
# obtain the train/dev/test splits - these splits are based on video IDs
train_split = DATASET.standard_folds.standard_train_fold
dev_split = DATASET.standard_folds.standard_valid_fold
test_split = DATASET.standard_folds.standard_test_fold

# inspect the splits: they only contain video IDs
print(test_split)

['7l3BNtSE0xc', 'dZFV0lyedX4', '286943', '126872', 'qgC8_emxSIU', 'kld9r0iFkWM', 'rC29Qub0U7A', '4YfyP0uIqw0', 'FMenDv3y8jc', '4wLP4elp1uM', 'KYQTwFVBzME', '27v7Blr0vjw', 'DnBHq5I52LM', 'HR18U0yAlTc', 'x266rUJQC_8', 'd1CDP6sMuLA', 'xSCvspXYU9k', '4EDblUpJieU', '4o4ilPK9rl8', '53609', 'SZ7HK5ns6mE', '243981', 'ySblgk7T7eQ', 'MYEyQUpMe3k', 'EujJ0SwiCRE', '3HyAaqre_Fk', 'iQDB_OkAQWs', 'gE7kUqMqQ9g', 'eFV7iFPYZB4', 'IRSxo_XXArg', '3hOlJf_JQDs', 'BRSyH6yfDLk', '1jogeKX0wGw', '3At-BKm9eYk', 'NVLPURuAVLU', 'pZye4zFzk3o', 'l1jW3OMXUzs', 'XKyumlBmix8', 'eKQKEi2-0Ws', 'WgI8IbJtXHw', 'tnWmVXZ87h0', 'YCEllKyaCrc', 'W1CWpktWtTs', '8wQhzezNcUY', '0bxhZ-LIfZY', 'lrjm6F3JJgg', 'Vdf1McvE9ao', 'eQc5uI7FKCU', '2QXHdu2zlQY', 'YCI-ZzclIPQ', '2Ky9DBSl49w', 'SKTyBOhDX6U', 'b86B3hP8ARM', '23656', 'kpS4BXif_Sw', 'dR68gbeOWOc', 'tC2KicUHB9Q', 'absh1hsZeF0', 'c5zxqITn3ZM', 'uogwnZGb-iE', '46495', 'Sq6DIhFxPqQ', 'PexNiFbPTYM', 'z441aDJvAcU', 'OORklkFql3k', 'WbtsuXkaGeg', 'grsV1YN1z5s', 'Gc_zIjqqUys', '424SXFTCFsA

In [34]:
# we can see they are in the format of 'video_id[segment_no]', but the splits was specified with video_id only
# we need to use regex or something to match the video IDs...
import re
import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook
from collections import defaultdict

# a sentinel epsilon for safe division, without it we will replace illegal values with a constant
EPS = 0

# construct a word2id mapping that automatically takes increment when new words are encountered
word2id = defaultdict(lambda: len(word2id))
UNK = word2id['<unk>']
PAD = word2id['<pad>']

# place holders for the final train/dev/test dataset
train = []
dev = []
test = []

# define a regular expression to extract the video ID out of the keys
pattern = re.compile('(.*)\[.*\]')
num_drop = 0 # a counter to count how many data points went into some processing issues

problematic_segments = {"HuIKyKkEL0Q[0]","JGEEA_JVriE[0]","JGEEA_JVriE[1]","JGEEA_JVriE[2]","JGEEA_JVriE[3]","aa0J1AXSseY[4]","aa0J1AXSseY[5]","aa0J1AXSseY[6]","zsRTbbKlsEg[0]"}

for segment in dataset[label_field].keys():
    
    if segment in problematic_segments:
        print(f"Skipping problematic segment: {segment}")
        num_drop += 1
        continue
    
    # get the video ID and the features out of the aligned dataset
    vid = re.search(pattern, segment).group(1)
    label = dataset[label_field][segment]['features']
    _words = dataset[word_field][segment]['features']
    _visual = dataset[visual_field][segment]['features']
    _acoustic = dataset[acoustic_field][segment]['features']

    # if the sequences are not same length after alignment, there must be some problem with some modalities
    # we should drop it or inspect the data again
    if not _words.shape[0] == _visual.shape[0] == _acoustic.shape[0]:
        print(f"Encountered datapoint {vid} with text shape {_words.shape}, visual shape {_visual.shape}, acoustic shape {_acoustic.shape}")
        num_drop += 1
        continue

    # remove nan values
    label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    # remove speech pause tokens - this is in general helpful
    # we should remove speech pauses and corresponding visual/acoustic features together
    # otherwise modalities would no longer be aligned
    words = []
    visual = []
    acoustic = []
    for i, word in enumerate(_words):
        if word[0] != b'sp':
            words.append(word2id[word[0].decode('utf-8')]) # SDK stores strings as bytes, decode into strings here
            visual.append(_visual[i, :])
            acoustic.append(_acoustic[i, :])

    words = np.asarray(words)
    visual = np.asarray(visual)
    acoustic = np.asarray(acoustic)

    # z-normalization per instance and remove nan/infs
    visual = np.nan_to_num((visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
    acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))

    if vid in train_split:
        train.append(((words, visual, acoustic), label, segment))
    elif vid in dev_split:
        dev.append(((words, visual, acoustic), label, segment))
    elif vid in test_split:
        test.append(((words, visual, acoustic), label, segment))
    else:
        print(f"Found video that doesn't belong to any splits: {vid}")

print(f"Total number of {num_drop} datapoints have been dropped.")

# turn off the word2id - define a named function here to allow for pickling
def return_unk():
    return UNK
word2id.default_factory = return_unk

  acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))
  x = um.multiply(x, x, out=x)
  visual = np.nan_to_num((visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)


Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7eu

In [58]:
# let's see the size of each set and shape of data
print(len(train))
print(len(dev))
print(len(test))

print(train[0][0][1].shape)
print(train[0][1].shape)
print(train[0][1])

print(f"Total vocab size: {len(word2id)}")

16315
1871
4654
(42, 35)
(1, 7)
[[1.        0.6666667 0.6666667 0.        0.        0.        0.6666667]]
Total vocab size: 16819



Collate function in PyTorch

Collate functions are functions used by PyTorch dataloader to gather batched data from dataset. It loads multiple data points from an iterable dataset object and put them in a certain format. Here we just use the lists we've constructed as the dataset and assume PyTorch dataloader will operate on that.


In [59]:
def multi_collate(batch):
    '''
    Collate functions assume batch = [Dataset[i] for i in index_set]
    '''
    # for later use we sort the batch in descending order of length
    batch = sorted(batch, key=lambda x: x[0][0].shape[0], reverse=True)
    
    # get the data out of the batch - use pad sequence util functions from PyTorch to pad things
    labels = torch.cat([torch.from_numpy(sample[1]) for sample in batch], dim=0)
    sentences = pad_sequence([torch.LongTensor(sample[0][0]) for sample in batch], padding_value=PAD)
    visual = pad_sequence([torch.FloatTensor(sample[0][1]) for sample in batch])
    acoustic = pad_sequence([torch.FloatTensor(sample[0][2]) for sample in batch])
    
    # lengths are useful later in using RNNs
    lengths = torch.LongTensor([sample[0][0].shape[0] for sample in batch])
    return sentences, visual, acoustic, labels, lengths

# construct dataloaders, dev and test could use around ~X3 times batch size since no_grad is used during eval
batch_sz = 56
train_loader = DataLoader(train, shuffle=True, batch_size=batch_sz, collate_fn=multi_collate)
dev_loader = DataLoader(dev, shuffle=False, batch_size=batch_sz*3, collate_fn=multi_collate)
test_loader = DataLoader(test, shuffle=False, batch_size=batch_sz*3, collate_fn=multi_collate)

# let's create a temporary dataloader just to see how the batch looks like
temp_loader = iter(DataLoader(test, shuffle=True, batch_size=8, collate_fn=multi_collate))
batch = next(temp_loader)

print(batch[0].shape) # word vectors, padded to maxlen
print(batch[1].shape) # visual features
print(batch[2].shape) # acoustic features
print(batch[3]) # labels
print(batch[4]) # lengths

torch.Size([40, 8])
torch.Size([40, 8, 35])
torch.Size([40, 8, 74])
tensor([[ 0.6667,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.6667,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.3333,  1.0000,  0.0000,  0.0000,  0.0000,  0.3333,  0.0000],
        [-0.3333,  0.0000,  0.3333,  0.3333,  0.0000,  0.0000,  0.0000],
        [-2.6667,  0.0000,  0.3333,  0.6667,  0.0000,  1.3333,  0.0000],
        [ 1.3333,  0.3333,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.6667,  1.6667,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 2.6667,  2.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])
tensor([40, 34, 34, 28, 24, 22, 19, 14])


In [60]:
# Let's actually inspect the transcripts to ensure it's correct
id2word = {v:k for k, v in word2id.items()}
examine_target = train
idx = np.random.randint(0, len(examine_target))
print(' '.join(list(map(lambda x: id2word[x], examine_target[idx][0][0].tolist()))))
# print(' '.join(examine_target[idx][0]))
print(examine_target[idx][1])
print(examine_target[idx][2])

behavior and any marketer that wants to be successful needs to consider these and be able to adapt their programs to the changing needs of the consumer consumer
[[0.33333334 0.33333334 0.         0.         0.         0.
  0.        ]]
Ha7DMd_iKyM[0]


## Defining Multimodal model

In [61]:
class LFLSTM(nn.Module):
    def __init__(self, input_sizes, hidden_sizes, fc1_size, output_size, dropout_rate):
        super(LFLSTM, self).__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.fc1_size = fc1_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate
        
        # defining modules - two layer bidirectional LSTM with layer norm in between
        self.embed = nn.Embedding(len(word2id), input_sizes[0])
        self.trnn1 = nn.LSTM(input_sizes[0], hidden_sizes[0], bidirectional=True)
        self.trnn2 = nn.LSTM(2*hidden_sizes[0], hidden_sizes[0], bidirectional=True)
        
        self.vrnn1 = nn.LSTM(input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = nn.LSTM(2*hidden_sizes[1], hidden_sizes[1], bidirectional=True)
        
        self.arnn1 = nn.LSTM(input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = nn.LSTM(2*hidden_sizes[2], hidden_sizes[2], bidirectional=True)

        self.fc1 = nn.Linear(sum(hidden_sizes)*4, fc1_size)
        self.fc2 = nn.Linear(fc1_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.tlayer_norm = nn.LayerNorm((hidden_sizes[0]*2,))
        self.vlayer_norm = nn.LayerNorm((hidden_sizes[1]*2,))
        self.alayer_norm = nn.LayerNorm((hidden_sizes[2]*2,))
        self.bn = nn.BatchNorm1d(sum(hidden_sizes)*4)

        
    def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
        packed_sequence = pack_padded_sequence(sequence, lengths)
        packed_h1, (final_h1, _) = rnn1(packed_sequence)
        padded_h1, _ = pad_packed_sequence(packed_h1)
        normed_h1 = layer_norm(padded_h1)
        packed_normed_h1 = pack_padded_sequence(normed_h1, lengths)
        _, (final_h2, _) = rnn2(packed_normed_h1)
        return final_h1, final_h2

        
    def fusion(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        sentences = self.embed(sentences)
        
        # extract features from text modality
        final_h1t, final_h2t = self.extract_features(sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)
        
        # extract features from visual modality
        final_h1v, final_h2v = self.extract_features(visual, lengths, self.vrnn1, self.vrnn2, self.vlayer_norm)
        
        # extract features from acoustic modality
        final_h1a, final_h2a = self.extract_features(acoustic, lengths, self.arnn1, self.arnn2, self.alayer_norm)

        
        # simple late fusion -- concatenation + normalization
        h = torch.cat((final_h1t, final_h2t, final_h1v, final_h2v, final_h1a, final_h2a),
                       dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
        return self.bn(h)

    def forward(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        h = self.fusion(sentences, visual, acoustic, lengths)
        h = self.fc1(h)
        h = self.dropout(h)
        h = self.relu(h)
        o = self.fc2(h)
        return o


- Load pretrained embeddings

We define a function for loading pretrained word embeddings stored in GloVe-style file. Contextualized embeddings obviously cannot be stored and loaded this way, though.


In [62]:
def load_emb(w2i, path_to_embedding, embedding_size=300, embedding_vocab=2196017, init_emb=None):
    if init_emb is None:
        emb_mat = np.random.randn(len(w2i), embedding_size)
    else:
        emb_mat = init_emb
    f = open(path_to_embedding, 'r')
    found = 0
    for line in tqdm_notebook(f, total=embedding_vocab):
        content = line.strip().split()
        vector = np.asarray(list(map(lambda x: float(x), content[-300:])))
        word = ' '.join(content[:-300])
        if word in w2i:
            idx = w2i[word]
            emb_mat[idx, :] = vector
            found += 1
    print(f"Found {found} words in the embedding file.")
    return torch.tensor(emb_mat).float()

In [None]:
from tqdm import tqdm_notebook
from torch.optim import Adam, SGD
from sklearn.metrics import accuracy_score

torch.manual_seed(123)
torch.cuda.manual_seed_all(123)

CUDA = torch.cuda.is_available()
MAX_EPOCH = 1000

text_size = 300
visual_size = 47
acoustic_size = 74

# define some model settings and hyper-parameters
input_sizes = [text_size, visual_size, acoustic_size]
hidden_sizes = [int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5)]
fc1_size = sum(hidden_sizes) // 2
dropout = 0.25
output_size = 1
curr_patience = patience = 8
num_trials = 3
grad_clip_value = 1.0
weight_decay = 0.1

if os.path.exists(CACHE_PATH):
    pretrained_emb, word2id = torch.load(CACHE_PATH)
elif WORD_EMB_PATH is not None:
    pretrained_emb = load_emb(word2id, WORD_EMB_PATH)
    torch.save((pretrained_emb, word2id), CACHE_PATH)
else:
    pretrained_emb = None

model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)
if pretrained_emb is not None:
    model.embed.weight.data = pretrained_emb
model.embed.requires_grad = False
optimizer = Adam([param for param in model.parameters() if param.requires_grad], weight_decay=weight_decay)

if CUDA:
    model.cuda()
criterion = nn.L1Loss(reduction='sum')
criterion_test = nn.L1Loss(reduction='sum')
best_valid_loss = float('inf')
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
lr_scheduler.step() # for some reason it seems the StepLR needs to be stepped once first
train_losses = []
valid_losses = []
for e in range(MAX_EPOCH):
    model.train()
    train_iter = tqdm_notebook(train_loader)
    train_loss = 0.0
    for batch in train_iter:
        model.zero_grad()
        t, v, a, y, l = batch
        batch_size = t.size(0)
        if CUDA:
            t = t.cuda()
            v = v.cuda()
            a = a.cuda()
            y = y.cuda()
            l = l.cuda()
        y_tilde = model(t, v, a, l)
        loss = criterion(y_tilde, y)
        loss.backward()
        torch.nn.utils.clip_grad_value_([param for param in model.parameters() if param.requires_grad], grad_clip_value)
        optimizer.step()
        train_iter.set_description(f"Epoch {e}/{MAX_EPOCH}, current batch loss: {round(loss.item()/batch_size, 4)}")
        train_loss += loss.item()
    train_loss = train_loss / len(train)
    train_losses.append(train_loss)
    print(f"Training loss: {round(train_loss, 4)}")

    model.eval()
    with torch.no_grad():
        valid_loss = 0.0
        for batch in dev_loader:
            model.zero_grad()
            t, v, a, y, l = batch
            if CUDA:
                t = t.cuda()
                v = v.cuda()
                a = a.cuda()
                y = y.cuda()
                l = l.cuda()
            y_tilde = model(t, v, a, l)
            loss = criterion(y_tilde, y)
            valid_loss += loss.item()
    
    valid_loss = valid_loss/len(dev)
    valid_losses.append(valid_loss)
    print(f"Validation loss: {round(valid_loss, 4)}")
    print(f"Current patience: {curr_patience}, current trial: {num_trials}.")
    if valid_loss <= best_valid_loss:
        best_valid_loss = valid_loss
        print("Found new best model on dev set!")
        torch.save(model.state_dict(), 'model.std')
        torch.save(optimizer.state_dict(), 'optim.std')
        curr_patience = patience
    else:
        curr_patience -= 1
        if curr_patience <= -1:
            print("Running out of patience, loading previous best model.")
            num_trials -= 1
            curr_patience = patience
            model.load_state_dict(torch.load('model.std'))
            optimizer.load_state_dict(torch.load('optim.std'))
            lr_scheduler.step()
            print(f"Current learning rate: {optimizer.state_dict()['param_groups'][0]['lr']}")
    
    if num_trials <= 0:
        print("Running out of patience, early stopping.")
        break

model.load_state_dict(torch.load('model.std'))
y_true = []
y_pred = []
model.eval()
with torch.no_grad():
    test_loss = 0.0
    for batch in test_loader:
        model.zero_grad()
        t, v, a, y, l = batch
        if CUDA:
            t = t.cuda()
            v = v.cuda()
            a = a.cuda()
            y = y.cuda()
            l = l.cuda()
        y_tilde = model(t, v, a, l)
        loss = criterion_test(y_tilde, y)
        y_true.append(y_tilde.detach().cpu().numpy())
        y_pred.append(y.detach().cpu().numpy())
        test_loss += loss.item()
print(f"Test set performance: {test_loss/len(test)}")
y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=0)
                  
y_true_bin = y_true >= 0
y_pred_bin = y_pred >= 0
bin_acc = accuracy_score(y_true_bin, y_pred_bin)
print(f"Test set accuracy is {bin_acc}")

BTECH PROJECT - YOGINII & SAHIL 

Text-Unimodal

In [2]:
import h5py

def inspect_hdf5_structure(csd_path):
    with h5py.File(csd_path, 'r') as f:
        def print_structure(name, obj):
            print(f"{name}: {obj}")
        f.visititems(print_structure)

# Inspect the structure of the text modality file
inspect_hdf5_structure(TEXT_CSD_PATH)

glove_vectors: <HDF5 group "/glove_vectors" (2 members)>
glove_vectors/data: <HDF5 group "/glove_vectors/data" (3837 members)>
glove_vectors/data/--qXJuDtHPw: <HDF5 group "/glove_vectors/data/--qXJuDtHPw" (2 members)>
glove_vectors/data/--qXJuDtHPw/features: <HDF5 dataset "features": shape (183, 300), type "<f8">
glove_vectors/data/--qXJuDtHPw/intervals: <HDF5 dataset "intervals": shape (183, 2), type "<f8">
glove_vectors/data/-3g5yACwYnA: <HDF5 group "/glove_vectors/data/-3g5yACwYnA" (2 members)>
glove_vectors/data/-3g5yACwYnA/features: <HDF5 dataset "features": shape (435, 300), type "<f8">
glove_vectors/data/-3g5yACwYnA/intervals: <HDF5 dataset "intervals": shape (435, 2), type "<f8">
glove_vectors/data/-3nNcZdcdvU: <HDF5 group "/glove_vectors/data/-3nNcZdcdvU" (2 members)>
glove_vectors/data/-3nNcZdcdvU/features: <HDF5 dataset "features": shape (125, 300), type "<f8">
glove_vectors/data/-3nNcZdcdvU/intervals: <HDF5 dataset "intervals": shape (125, 2), type "<f8">
glove_vectors/data

In [4]:
import numpy as np
import h5py

# Define path to text modality file
TEXT_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_TimestampedWordVectors.csd"

# Load the text modality data directly without additional GloVe mapping
def load_text_data(csd_path):
    text_data = {}
    with h5py.File(csd_path, 'r') as f:
        # Access each video ID's features under "/glove_vectors/data"
        for video_id in f['/glove_vectors/data']:
            # Directly load precomputed embedding vectors
            text_data[video_id] = np.array(f[f'/glove_vectors/data/{video_id}/features'])
    print("Loaded text modality data.")
    return text_data

# Load and verify text data
text_data = load_text_data(TEXT_CSD_PATH)

# Example output for verification
print("Example processed data for one video:", list(text_data.items())[0])

Loaded text modality data.
Example processed data for one video: ('--qXJuDtHPw', array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.18733 ,  0.40595 , -0.51174 , ...,  0.16495 ,  0.18757 ,
         0.53874 ],
       [ 0.11527 ,  0.014791, -0.41083 , ...,  0.089616,  0.45656 ,
         0.17307 ],
       ...,
       [ 0.11527 ,  0.014791, -0.41083 , ...,  0.089616,  0.45656 ,
         0.17307 ],
       [-0.10545 , -0.11458 , -0.39382 , ..., -0.10723 ,  0.26568 ,
         0.062161],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]]))


In [6]:
import h5py

def inspect_hdf5_structure(csd_path):
    with h5py.File(csd_path, 'r') as f:
        def print_structure(name, obj):
            print(f"{name}: {obj}")
        f.visititems(print_structure)

# Inspect the structure of the labels file
inspect_hdf5_structure(LABEL_CSD_PATH)

All Labels: <HDF5 group "/All Labels" (2 members)>
All Labels/data: <HDF5 group "/All Labels/data" (3293 members)>
All Labels/data/--qXJuDtHPw: <HDF5 group "/All Labels/data/--qXJuDtHPw" (2 members)>
All Labels/data/--qXJuDtHPw/features: <HDF5 dataset "features": shape (1, 7), type "<f4">
All Labels/data/--qXJuDtHPw/intervals: <HDF5 dataset "intervals": shape (1, 2), type "<f8">
All Labels/data/-3g5yACwYnA: <HDF5 group "/All Labels/data/-3g5yACwYnA" (2 members)>
All Labels/data/-3g5yACwYnA/features: <HDF5 dataset "features": shape (6, 7), type "<f4">
All Labels/data/-3g5yACwYnA/intervals: <HDF5 dataset "intervals": shape (6, 2), type "<f8">
All Labels/data/-3nNcZdcdvU: <HDF5 group "/All Labels/data/-3nNcZdcdvU" (2 members)>
All Labels/data/-3nNcZdcdvU/features: <HDF5 dataset "features": shape (3, 7), type "<f4">
All Labels/data/-3nNcZdcdvU/intervals: <HDF5 dataset "intervals": shape (3, 2), type "<f8">
All Labels/data/-571d8cVauQ: <HDF5 group "/All Labels/data/-571d8cVauQ" (2 members)>

In [1]:
import numpy as np
import h5py
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Paths to data files
TEXT_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_TimestampedWordVectors.csd"
LABEL_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_Labels.csd"

# Load text modality data
def load_text_data(csd_path):
    text_data = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in f['/glove_vectors/data']:
            text_data[video_id] = np.array(f[f'/glove_vectors/data/{video_id}/features'])
    print("Loaded text modality data.")
    return text_data

# Load labels data
def load_labels(csd_path):
    labels = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in f['/All Labels/data']:
            labels[video_id] = f[f'/All Labels/data/{video_id}/features'][()]
    print("Loaded label data.")
    return labels

# Load data
text_data = load_text_data(TEXT_CSD_PATH)
labels = load_labels(LABEL_CSD_PATH)

# Prepare data and labels for training
sequences = [text_data[vid] for vid in text_data.keys() if vid in labels]
target_labels = [labels[vid] for vid in text_data.keys() if vid in labels]

# Pad sequences to uniform length
max_sequence_length = 300
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float32', padding='post')

# Flatten target_labels by selecting only the first entry for each label (if necessary)
# Modify this as per your requirements to select relevant labels
target_labels = [label[0] if len(label.shape) > 1 else label for label in target_labels]

# Convert to numpy arrays before tensor conversion for efficiency
padded_sequences = np.array(padded_sequences)
target_labels = np.array(target_labels)

# Convert to PyTorch tensors
padded_sequences = torch.tensor(padded_sequences, dtype=torch.float32)
target_labels = torch.tensor(target_labels, dtype=torch.float32)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, target_labels, test_size=0.2, random_state=42)

# Define a custom dataset
class TextDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create data loaders for training and testing
batch_size = 32
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Set up device for CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Transfer data to CUDA
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

# Verify CUDA loading
print("Sample batch from train_loader:")
for data, labels in train_loader:
    data, labels = data.to(device), labels.to(device)
    print("Data batch shape:", data.shape)
    print("Label batch shape:", labels.shape)
    break

2024-11-08 12:34:17.241721: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-08 12:34:17.791658: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731049457.991420    6321 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731049458.058128    6321 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 12:34:18.497102: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loaded text modality data.
Loaded label data.
Using device: cuda
Sample batch from train_loader:
Data batch shape: torch.Size([32, 300, 300])
Label batch shape: torch.Size([32, 7])


In [9]:
import numpy as np
import h5py
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
from torch.cuda.amp import GradScaler  # Removed autocast to avoid instability
from sklearn.metrics import accuracy_score, f1_score

# Paths to data files
TEXT_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_TimestampedWordVectors.csd"
LABEL_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_Labels.csd"

# Load and prepare data
def load_text_data(csd_path):
    text_data = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/glove_vectors/data'])[:200]:  # Load more samples if available
            text_data[video_id] = np.array(f[f'/glove_vectors/data/{video_id}/features'])
    print("Loaded text modality data.")
    return text_data

def load_labels(csd_path):
    labels = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/All Labels/data'])[:200]:  # Load more samples if available
            labels[video_id] = f[f'/All Labels/data/{video_id}/features'][()]
    print("Loaded label data.")
    return labels

# Load data
text_data = load_text_data(TEXT_CSD_PATH)
labels = load_labels(LABEL_CSD_PATH)

video_ids = [vid for vid in text_data.keys() if vid in labels]
sequences = [text_data[vid] for vid in video_ids]
target_labels = [labels[vid] for vid in video_ids]

# Pad sequences
max_sequence_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float32', padding='post')

# Convert labels to binary
processed_labels = []
for label_array in target_labels:
    avg_label = np.mean(label_array[:, 0]) if label_array.ndim > 1 else label_array[0]
    binary_label = 1.0 if avg_label >= 0.5 else 0.0
    processed_labels.append(binary_label)

unique, counts = np.unique(processed_labels, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))

# Convert data to tensors
padded_sequences = torch.tensor(np.array(padded_sequences), dtype=torch.float32)
processed_labels = torch.tensor(np.array(processed_labels), dtype=torch.float32)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, processed_labels, test_size=0.2, random_state=42)

class TextDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels.unsqueeze(1)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

batch_size = 8
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define a GRU model with higher dropout
class SentimentModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SentimentModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.6)  # Increased dropout
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.dropout(out[:, -1, :])  # Last GRU time step
        out = self.fc(out)
        return out

input_size = 300
hidden_size = 32  # Reduced hidden size
num_layers = 1  # Single-layer GRU
model = SentimentModel(input_size, hidden_size, num_layers).to(device)

# Class weights for imbalanced data
pos_weight = torch.tensor([counts[0] / counts[1]], dtype=torch.float32).to(device) if counts[1] else torch.tensor([1.0]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.01)  # Added weight decay for L2 regularization
scaler = GradScaler()

# Early stopping
best_f1 = 0
early_stop_counter = 0
early_stop_patience = 3

# Training loop without autocast for stability
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Validation with F1-score calculation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            all_preds.append(torch.sigmoid(outputs).cpu())
            all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds).squeeze()
    all_labels = torch.cat(all_labels).squeeze()
    binary_preds = (all_preds >= 0.5).float()
    accuracy = accuracy_score(all_labels, binary_preds)
    f1 = f1_score(all_labels, binary_preds)
    print(f"Test Accuracy: {accuracy * 100:.2f}%, Test F1-Score: {f1 * 100:.2f}%")

    # Early stopping based on F1-score
    if f1 > best_f1:
        best_f1 = f1
        early_stop_counter = 0
    else:
        early_stop_counter += 1

    if early_stop_counter >= early_stop_patience:
        print("Early stopping triggered.")
        break

# Final evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for data, labels in test_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        all_preds.append(torch.sigmoid(outputs).cpu())
        all_labels.append(labels.cpu())

all_preds = torch.cat(all_preds).squeeze()
all_labels = torch.cat(all_labels).squeeze()
binary_preds = (all_preds >= 0.5).float()
final_accuracy = accuracy_score(all_labels, binary_preds)
final_f1 = f1_score(all_labels, binary_preds)
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")
print(f"Final Test F1-Score: {final_f1 * 100:.2f}%")


Loaded text modality data.
Loaded label data.
Label distribution: {0.0: 96, 1.0: 48}
Using device: cuda
Epoch [1/20], Loss: 0.9884
Test Accuracy: 72.41%, Test F1-Score: 20.00%


  scaler = GradScaler()


Epoch [2/20], Loss: 0.9568
Test Accuracy: 68.97%, Test F1-Score: 30.77%
Epoch [3/20], Loss: 0.9305
Test Accuracy: 68.97%, Test F1-Score: 30.77%
Epoch [4/20], Loss: 0.9630
Test Accuracy: 62.07%, Test F1-Score: 26.67%
Epoch [5/20], Loss: 0.9325
Test Accuracy: 62.07%, Test F1-Score: 26.67%
Early stopping triggered.
Final Test Accuracy: 62.07%
Final Test F1-Score: 26.67%


In [1]:
import torch
print(f"Is CUDA available? {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
print(f"Current GPU: {torch.cuda.current_device()}")
print(f"GPU name: {torch.cuda.get_device_name(0)}")
print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.2f} GB")

Is CUDA available? True
Number of GPUs: 1
Current GPU: 0
GPU name: NVIDIA GeForce RTX 3050 Ti Laptop GPU
Total GPU memory: 4.00 GB


Audio - unimodal

In [16]:
import h5py

# Path to the audio CSD file
AUDIO_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_COVAREP.csd"

# Open the file and print its structure
with h5py.File(AUDIO_CSD_PATH, 'r') as f:
    def print_structure(name, obj):
        print(name)
    f.visititems(print_structure)

COVAREP
COVAREP/data
COVAREP/data/--qXJuDtHPw
COVAREP/data/--qXJuDtHPw/features
COVAREP/data/--qXJuDtHPw/intervals
COVAREP/data/-3g5yACwYnA
COVAREP/data/-3g5yACwYnA/features
COVAREP/data/-3g5yACwYnA/intervals
COVAREP/data/-3nNcZdcdvU
COVAREP/data/-3nNcZdcdvU/features
COVAREP/data/-3nNcZdcdvU/intervals
COVAREP/data/-571d8cVauQ
COVAREP/data/-571d8cVauQ/features
COVAREP/data/-571d8cVauQ/intervals
COVAREP/data/-6rXp3zJ3kc
COVAREP/data/-6rXp3zJ3kc/features
COVAREP/data/-6rXp3zJ3kc/intervals
COVAREP/data/-9YyBTjo1zo
COVAREP/data/-9YyBTjo1zo/features
COVAREP/data/-9YyBTjo1zo/intervals
COVAREP/data/-9y-fZ3swSY
COVAREP/data/-9y-fZ3swSY/features
COVAREP/data/-9y-fZ3swSY/intervals
COVAREP/data/-AUZQgSxyPQ
COVAREP/data/-AUZQgSxyPQ/features
COVAREP/data/-AUZQgSxyPQ/intervals
COVAREP/data/-Alixo7euuU
COVAREP/data/-Alixo7euuU/features
COVAREP/data/-Alixo7euuU/intervals
COVAREP/data/-Eqdz5y4pEY
COVAREP/data/-Eqdz5y4pEY/features
COVAREP/data/-Eqdz5y4pEY/intervals
COVAREP/data/-HeZS2-Prhc
COVAREP/data/-

In [21]:
import numpy as np
import h5py
import torch
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
from torch.cuda.amp import GradScaler
from sklearn.metrics import accuracy_score, f1_score

# Paths to data files
AUDIO_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_COVAREP.csd"
LABEL_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_Labels.csd"

# Load and prepare audio data with normalization
def load_audio_data(csd_path):
    audio_data = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/COVAREP/data'])[:500]:  # Load more samples if available
            features = np.array(f[f'/COVAREP/data/{video_id}/features'])
            # Replace inf and NaN values and clip to avoid extremely high values
            features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
            features = np.clip(features, -1e6, 1e6)
            # Normalize each feature vector
            features = StandardScaler().fit_transform(features)
            audio_data[video_id] = features
    print("Loaded and normalized audio modality data.")
    return audio_data

def load_labels(csd_path):
    labels = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/All Labels/data'])[:500]:  # Load more samples if available
            labels[video_id] = f[f'/All Labels/data/{video_id}/features'][()]
    print("Loaded label data.")
    return labels

# Load data
audio_data = load_audio_data(AUDIO_CSD_PATH)
labels = load_labels(LABEL_CSD_PATH)

# Prepare audio data for training
video_ids = [vid for vid in audio_data.keys() if vid in labels]
sequences = [audio_data[vid] for vid in video_ids]
target_labels = [labels[vid] for vid in video_ids]

# Pad audio sequences to a fixed length
max_sequence_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float32', padding='post')

# Convert labels to binary
processed_labels = []
for label_array in target_labels:
    avg_label = np.mean(label_array[:, 0]) if label_array.ndim > 1 else label_array[0]
    binary_label = 1.0 if avg_label >= 0.5 else 0.0
    processed_labels.append(binary_label)

unique, counts = np.unique(processed_labels, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))

# Convert data to tensors
padded_sequences = torch.tensor(np.array(padded_sequences), dtype=torch.float32)
processed_labels = torch.tensor(np.array(processed_labels), dtype=torch.float32)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, processed_labels, test_size=0.2, random_state=42)

# Custom Dataset for audio data
class AudioDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels.unsqueeze(1)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create datasets and data loaders
batch_size = 8
train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

# Balanced sampling with WeightedRandomSampler
class_counts = np.bincount(y_train.int().numpy())
class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
sample_weights = class_weights[y_train.int()]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(train_dataset), replacement=True)

train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define GRU model with Batch Normalization
class SentimentModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SentimentModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.batch_norm = nn.BatchNorm1d(hidden_size)  # Batch normalization layer
        self.dropout = nn.Dropout(0.6)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.batch_norm(out[:, -1, :])  # Apply batch normalization
        out = self.dropout(out)
        out = self.fc(out)
        return out

input_size = padded_sequences.shape[2]  # Set to the number of audio features
hidden_size = 128  # Increased hidden size
num_layers = 2     # Two-layer GRU for improved sequence representation
model = SentimentModel(input_size, hidden_size, num_layers).to(device)

# Loss function with class weights for imbalance
pos_weight = torch.tensor([counts[0] / counts[1]], dtype=torch.float32).to(device) if counts[1] else torch.tensor([1.0]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.01)
scaler = GradScaler()

# Early stopping
best_f1 = 0
early_stop_counter = 0
early_stop_patience = 3

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Validation with adjusted threshold
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            all_preds.append(torch.sigmoid(outputs).cpu())
            all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds).squeeze()
    all_labels = torch.cat(all_labels).squeeze()
    binary_preds = (all_preds >= 0.4).float()  # Using threshold 0.4 instead of 0.5
    accuracy = accuracy_score(all_labels, binary_preds)
    f1 = f1_score(all_labels, binary_preds)
    print(f"Test Accuracy: {accuracy * 100:.2f}%, Test F1-Score: {f1 * 100:.2f}%")

    # Early stopping based on F1-score
    if f1 > best_f1:
        best_f1 = f1
        early_stop_counter = 0
    else:
        early_stop_counter += 1

    if early_stop_counter >= early_stop_patience:
        print("Early stopping triggered.")
        break

# Final evaluation with threshold 0.4
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for data, labels in test_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        all_preds.append(torch.sigmoid(outputs).cpu())
        all_labels.append(labels.cpu())

all_preds = torch.cat(all_preds).squeeze()
all_labels = torch.cat(all_labels).squeeze()
binary_preds = (all_preds >= 0.4).float()
final_accuracy = accuracy_score(all_labels, binary_preds)
final_f1 = f1_score(all_labels, binary_preds)
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")
print(f"Final Test F1-Score: {final_f1 * 100:.2f}%")

Loaded and normalized audio modality data.
Loaded label data.
Label distribution: {0.0: 207, 1.0: 99}
Using device: cuda


  scaler = GradScaler()


Epoch [1/20], Loss: 1.2304
Test Accuracy: 37.10%, Test F1-Score: 49.35%
Epoch [2/20], Loss: 1.1560
Test Accuracy: 41.94%, Test F1-Score: 50.00%
Epoch [3/20], Loss: 1.0096
Test Accuracy: 40.32%, Test F1-Score: 49.32%
Epoch [4/20], Loss: 1.0558
Test Accuracy: 45.16%, Test F1-Score: 51.43%
Epoch [5/20], Loss: 0.9802
Test Accuracy: 45.16%, Test F1-Score: 51.43%
Epoch [6/20], Loss: 1.0572
Test Accuracy: 43.55%, Test F1-Score: 47.76%
Epoch [7/20], Loss: 1.0221
Test Accuracy: 45.16%, Test F1-Score: 52.78%
Epoch [8/20], Loss: 0.9501
Test Accuracy: 45.16%, Test F1-Score: 52.78%
Epoch [9/20], Loss: 1.0541
Test Accuracy: 43.55%, Test F1-Score: 50.70%
Epoch [10/20], Loss: 0.9291
Test Accuracy: 45.16%, Test F1-Score: 50.00%
Early stopping triggered.
Final Test Accuracy: 45.16%
Final Test F1-Score: 50.00%


In [22]:
import numpy as np
import h5py
import torch
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
from torch.cuda.amp import GradScaler
from sklearn.metrics import accuracy_score, f1_score

# Paths to data files
AUDIO_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_COVAREP.csd"
LABEL_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_Labels.csd"

# Load and prepare audio data with normalization
def load_audio_data(csd_path):
    audio_data = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/COVAREP/data'])[:500]:  # Load more samples if available
            features = np.array(f[f'/COVAREP/data/{video_id}/features'])
            features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
            features = np.clip(features, -1e6, 1e6)
            features = StandardScaler().fit_transform(features)
            audio_data[video_id] = features
    print("Loaded and normalized audio modality data.")
    return audio_data

def load_labels(csd_path):
    labels = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/All Labels/data'])[:500]:  # Load more samples if available
            labels[video_id] = f[f'/All Labels/data/{video_id}/features'][()]
    print("Loaded label data.")
    return labels

# Load data
audio_data = load_audio_data(AUDIO_CSD_PATH)
labels = load_labels(LABEL_CSD_PATH)

# Prepare audio data for training
video_ids = [vid for vid in audio_data.keys() if vid in labels]
sequences = [audio_data[vid] for vid in video_ids]
target_labels = [labels[vid] for vid in video_ids]

# Pad audio sequences to a fixed length
max_sequence_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float32', padding='post')

# Convert labels to binary
processed_labels = []
for label_array in target_labels:
    avg_label = np.mean(label_array[:, 0]) if label_array.ndim > 1 else label_array[0]
    binary_label = 1.0 if avg_label >= 0.5 else 0.0
    processed_labels.append(binary_label)

unique, counts = np.unique(processed_labels, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))

# Convert data to tensors
padded_sequences = torch.tensor(np.array(padded_sequences), dtype=torch.float32)
processed_labels = torch.tensor(np.array(processed_labels), dtype=torch.float32)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, processed_labels, test_size=0.2, random_state=42)

# Custom Dataset for audio data
class AudioDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels.unsqueeze(1)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create datasets and data loaders
batch_size = 8
train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

# Balanced sampling with WeightedRandomSampler
class_counts = np.bincount(y_train.int().numpy())
class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
sample_weights = class_weights[y_train.int()]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(train_dataset), replacement=True)

train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define GRU model with additional Dropout
class SentimentModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout_rate=0.6):
        super(SentimentModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.batch_norm = nn.BatchNorm1d(hidden_size)  # Batch normalization layer
        self.dropout1 = nn.Dropout(dropout_rate)       # First dropout layer
        self.dropout2 = nn.Dropout(dropout_rate)       # Additional dropout layer
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.batch_norm(out[:, -1, :])  # Apply batch normalization
        out = self.dropout1(out)
        out = self.dropout2(out)
        out = self.fc(out)
        return out

input_size = padded_sequences.shape[2]  # Set to the number of audio features
hidden_size = 128  # Increased hidden size
num_layers = 2     # Two-layer GRU for improved sequence representation
model = SentimentModel(input_size, hidden_size, num_layers).to(device)

# Loss function with class weights for imbalance
pos_weight = torch.tensor([counts[0] / counts[1]], dtype=torch.float32).to(device) if counts[1] else torch.tensor([1.0]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.01)
scaler = GradScaler()

# Early stopping and threshold adjustment
best_f1 = 0
best_threshold = 0.4
early_stop_counter = 0
early_stop_patience = 3

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Validation and threshold tuning
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            all_preds.append(torch.sigmoid(outputs).cpu())
            all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds).squeeze()
    all_labels = torch.cat(all_labels).squeeze()
    
    # Tune threshold to find the best F1-score
    thresholds = np.arange(0.3, 0.6, 0.05)
    best_epoch_f1 = 0
    for threshold in thresholds:
        binary_preds = (all_preds >= threshold).float()
        f1 = f1_score(all_labels, binary_preds)
        if f1 > best_epoch_f1:
            best_epoch_f1 = f1
            best_threshold = threshold
    
    binary_preds = (all_preds >= best_threshold).float()
    accuracy = accuracy_score(all_labels, binary_preds)
    print(f"Best threshold for epoch {epoch+1}: {best_threshold:.2f}")
    print(f"Test Accuracy: {accuracy * 100:.2f}%, Test F1-Score: {best_epoch_f1 * 100:.2f}%")

    # Early stopping based on F1-score
    if best_epoch_f1 > best_f1:
        best_f1 = best_epoch_f1
        early_stop_counter = 0
    else:
        early_stop_counter += 1

    if early_stop_counter >= early_stop_patience:
        print("Early stopping triggered.")
        break

# Final evaluation with the best threshold
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for data, labels in test_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        all_preds.append(torch.sigmoid(outputs).cpu())
        all_labels.append(labels.cpu())

all_preds = torch.cat(all_preds).squeeze()
all_labels = torch.cat(all_labels).squeeze()
binary_preds = (all_preds >= best_threshold).float()
final_accuracy = accuracy_score(all_labels, binary_preds)
final_f1 = f1_score(all_labels, binary_preds)
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")
print(f"Final Test F1-Score: {final_f1 * 100:.2f}%")

Loaded and normalized audio modality data.
Loaded label data.
Label distribution: {0.0: 207, 1.0: 99}
Using device: cuda


  scaler = GradScaler()


Epoch [1/20], Loss: 1.5111
Best threshold for epoch 1: 0.40
Test Accuracy: 45.16%, Test F1-Score: 51.43%
Epoch [2/20], Loss: 1.4233
Best threshold for epoch 2: 0.35
Test Accuracy: 50.00%, Test F1-Score: 53.73%
Epoch [3/20], Loss: 1.2873
Best threshold for epoch 3: 0.35
Test Accuracy: 51.61%, Test F1-Score: 55.88%
Epoch [4/20], Loss: 1.3403
Best threshold for epoch 4: 0.35
Test Accuracy: 51.61%, Test F1-Score: 55.88%
Epoch [5/20], Loss: 1.2838
Best threshold for epoch 5: 0.35
Test Accuracy: 51.61%, Test F1-Score: 55.88%
Epoch [6/20], Loss: 1.4139
Best threshold for epoch 6: 0.40
Test Accuracy: 53.23%, Test F1-Score: 55.38%
Early stopping triggered.
Final Test Accuracy: 53.23%
Final Test F1-Score: 55.38%


Video-unimodal

In [26]:
import h5py

VIDEO_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_VisualFacet42.csd"

# Inspect the structure of the video data file
with h5py.File(VIDEO_CSD_PATH, 'r') as f:
    def print_structure(name, obj):
        print(name, obj)
    f.visititems(print_structure)

FACET 4.2 <HDF5 group "/FACET 4.2" (2 members)>
FACET 4.2/data <HDF5 group "/FACET 4.2/data" (3837 members)>
FACET 4.2/data/--qXJuDtHPw <HDF5 group "/FACET 4.2/data/--qXJuDtHPw" (2 members)>
FACET 4.2/data/--qXJuDtHPw/features <HDF5 dataset "features": shape (1715, 35), type "<f4">
FACET 4.2/data/--qXJuDtHPw/intervals <HDF5 dataset "intervals": shape (1715, 2), type "<f8">
FACET 4.2/data/-3g5yACwYnA <HDF5 group "/FACET 4.2/data/-3g5yACwYnA" (2 members)>
FACET 4.2/data/-3g5yACwYnA/features <HDF5 dataset "features": shape (4340, 35), type "<f4">
FACET 4.2/data/-3g5yACwYnA/intervals <HDF5 dataset "intervals": shape (4340, 2), type "<f8">
FACET 4.2/data/-3nNcZdcdvU <HDF5 group "/FACET 4.2/data/-3nNcZdcdvU" (2 members)>
FACET 4.2/data/-3nNcZdcdvU/features <HDF5 dataset "features": shape (1328, 35), type "<f4">
FACET 4.2/data/-3nNcZdcdvU/intervals <HDF5 dataset "intervals": shape (1328, 2), type "<f8">
FACET 4.2/data/-571d8cVauQ <HDF5 group "/FACET 4.2/data/-571d8cVauQ" (2 members)>
FACET 4.

In [28]:
import numpy as np
import h5py
import torch
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
from torch.cuda.amp import GradScaler
from sklearn.metrics import accuracy_score, f1_score

# Paths to data files
VIDEO_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_VisualFacet42.csd"
LABEL_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_Labels.csd"

# Load and preprocess video data
def load_video_data(csd_path):
    video_data = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/FACET 4.2/data'])[:500]:  # Load a subset if desired
            features = np.array(f[f'/FACET 4.2/data/{video_id}/features'])
            features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
            features = np.clip(features, -1e6, 1e6)
            features = StandardScaler().fit_transform(features)
            video_data[video_id] = features
    print("Loaded and normalized video modality data.")
    return video_data

def load_labels(csd_path):
    labels = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/All Labels/data'])[:500]:
            labels[video_id] = f[f'/All Labels/data/{video_id}/features'][()]
    print("Loaded label data.")
    return labels

# Load data
video_data = load_video_data(VIDEO_CSD_PATH)
labels = load_labels(LABEL_CSD_PATH)

# Prepare video data for training
video_ids = [vid for vid in video_data.keys() if vid in labels]
sequences = [video_data[vid] for vid in video_ids]
target_labels = [labels[vid] for vid in video_ids]

# Pad video sequences to a fixed length
max_sequence_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float32', padding='post')

# Convert labels to binary
processed_labels = []
for label_array in target_labels:
    avg_label = np.mean(label_array[:, 0]) if label_array.ndim > 1 else label_array[0]
    binary_label = 1.0 if avg_label >= 0.5 else 0.0
    processed_labels.append(binary_label)

unique, counts = np.unique(processed_labels, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))

# Convert data to tensors
padded_sequences = torch.tensor(np.array(padded_sequences), dtype=torch.float32)
processed_labels = torch.tensor(np.array(processed_labels), dtype=torch.float32)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, processed_labels, test_size=0.2, random_state=42)

class VideoDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels.unsqueeze(1)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Data loaders with balanced sampling
batch_size = 8
train_dataset = VideoDataset(X_train, y_train)
test_dataset = VideoDataset(X_test, y_test)
class_counts = np.bincount(y_train.int().numpy())
class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
sample_weights = class_weights[y_train.int()]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(train_dataset), replacement=True)

train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define Bidirectional GRU model with a larger hidden size
class SentimentModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout_rate=0.6):
        super(SentimentModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.batch_norm = nn.BatchNorm1d(hidden_size * 2)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size * 2, 1)  # Adjusted for bidirectional GRU

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.batch_norm(out[:, -1, :])
        out = self.dropout1(out)
        out = self.fc(out)
        return out

input_size = padded_sequences.shape[2]
hidden_size = 256
num_layers = 2

# Initialize ensemble models
num_models = 3
models = [SentimentModel(input_size, hidden_size, num_layers).to(device) for _ in range(num_models)]
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([counts[0] / counts[1]], dtype=torch.float32).to(device))
optimizer = [torch.optim.AdamW(model.parameters(), lr=0.00003, weight_decay=0.01) for model in models]
scalers = [GradScaler() for _ in range(num_models)]
best_thresholds = [0.4] * num_models

# Train each model
num_epochs = 20
for model_idx, model in enumerate(models):
    best_f1 = 0
    early_stop_counter = 0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            loss = criterion(outputs, labels)
            scalers[model_idx].scale(loss).backward()
            scalers[model_idx].step(optimizer[model_idx])
            scalers[model_idx].update()
            optimizer[model_idx].zero_grad()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        print(f"Model {model_idx + 1}, Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

        # Validation with threshold tuning
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for data, labels in test_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                all_preds.append(torch.sigmoid(outputs).cpu())
                all_labels.append(labels.cpu())

        all_preds = torch.cat(all_preds).squeeze()
        all_labels = torch.cat(all_labels).squeeze()

        thresholds = np.arange(0.3, 0.6, 0.05)
        best_epoch_f1 = 0
        for threshold in thresholds:
            binary_preds = (all_preds >= threshold).float()
            f1 = f1_score(all_labels, binary_preds)
            if f1 > best_epoch_f1:
                best_epoch_f1 = f1
                best_thresholds[model_idx] = threshold
        
        print(f"Best threshold for model {model_idx + 1}, epoch {epoch+1}: {best_thresholds[model_idx]:.2f}")

        if best_epoch_f1 > best_f1:
            best_f1 = best_epoch_f1
            early_stop_counter = 0
        else:
            early_stop_counter += 1

        if early_stop_counter >= 3:
            print(f"Early stopping triggered for model {model_idx + 1}.")
            break

# Ensemble Evaluation
all_ensemble_preds = []
with torch.no_grad():
    for data, labels in test_loader:
        data = data.to(device)
        batch_size = data.size(0)
        
        # Initialize ensemble_preds to accumulate predictions
        ensemble_preds = torch.zeros(batch_size, 1, device="cpu")
        
        for model_idx, model in enumerate(models):
            outputs = torch.sigmoid(model(data)).cpu()
            binary_preds = (outputs >= best_thresholds[model_idx]).float()
            ensemble_preds += binary_preds
        
        # Average predictions from all models
        ensemble_preds /= num_models
        all_ensemble_preds.append(ensemble_preds)

all_ensemble_preds = torch.cat(all_ensemble_preds).squeeze()
binary_ensemble_preds = (all_ensemble_preds >= 0.5).float()
final_accuracy = accuracy_score(y_test, binary_ensemble_preds)
final_f1 = f1_score(y_test, binary_ensemble_preds)
print(f"Final Ensemble Test Accuracy: {final_accuracy * 100:.2f}%")
print(f"Final Ensemble Test F1-Score: {final_f1 * 100:.2f}%")

Loaded and normalized video modality data.
Loaded label data.
Label distribution: {0.0: 207, 1.0: 99}
Using device: cuda


  scalers = [GradScaler() for _ in range(num_models)]


Model 1, Epoch [1/20], Loss: 1.2255
Best threshold for model 1, epoch 1: 0.30
Model 1, Epoch [2/20], Loss: 1.1465
Best threshold for model 1, epoch 2: 0.40
Model 1, Epoch [3/20], Loss: 1.0576
Best threshold for model 1, epoch 3: 0.50
Model 1, Epoch [4/20], Loss: 1.1142
Best threshold for model 1, epoch 4: 0.35
Model 1, Epoch [5/20], Loss: 1.1108
Best threshold for model 1, epoch 5: 0.45
Model 1, Epoch [6/20], Loss: 1.1309
Best threshold for model 1, epoch 6: 0.50
Model 1, Epoch [7/20], Loss: 1.0796
Best threshold for model 1, epoch 7: 0.50
Model 1, Epoch [8/20], Loss: 1.0155
Best threshold for model 1, epoch 8: 0.40
Model 1, Epoch [9/20], Loss: 0.9957
Best threshold for model 1, epoch 9: 0.50
Early stopping triggered for model 1.
Model 2, Epoch [1/20], Loss: 1.2120
Best threshold for model 2, epoch 1: 0.50
Model 2, Epoch [2/20], Loss: 1.1351
Best threshold for model 2, epoch 2: 0.40
Model 2, Epoch [3/20], Loss: 1.0604
Best threshold for model 2, epoch 3: 0.40
Model 2, Epoch [4/20], Los

Audio

In [30]:
import numpy as np
import h5py
import torch
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
from torch.cuda.amp import GradScaler
from sklearn.metrics import accuracy_score, f1_score

# Paths to data files
AUDIO_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_COVAREP.csd"
LABEL_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_Labels.csd"

# Load and prepare audio data with normalization
def load_audio_data(csd_path):
    audio_data = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/COVAREP/data'])[:500]:  # Load more samples if available
            features = np.array(f[f'/COVAREP/data/{video_id}/features'])
            features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
            features = np.clip(features, -1e6, 1e6)
            features = StandardScaler().fit_transform(features)
            audio_data[video_id] = features
    print("Loaded and normalized audio modality data.")
    return audio_data

def load_labels(csd_path):
    labels = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/All Labels/data'])[:500]:  # Load more samples if available
            labels[video_id] = f[f'/All Labels/data/{video_id}/features'][()]
    print("Loaded label data.")
    return labels

# Load data
audio_data = load_audio_data(AUDIO_CSD_PATH)
labels = load_labels(LABEL_CSD_PATH)

# Prepare audio data for training
video_ids = [vid for vid in audio_data.keys() if vid in labels]
sequences = [audio_data[vid] for vid in video_ids]
target_labels = [labels[vid] for vid in video_ids]

# Pad audio sequences to a fixed length
max_sequence_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float32', padding='post')

# Convert labels to binary
processed_labels = []
for label_array in target_labels:
    avg_label = np.mean(label_array[:, 0]) if label_array.ndim > 1 else label_array[0]
    binary_label = 1.0 if avg_label >= 0.5 else 0.0
    processed_labels.append(binary_label)

unique, counts = np.unique(processed_labels, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))

# Convert data to tensors
padded_sequences = torch.tensor(np.array(padded_sequences), dtype=torch.float32)
processed_labels = torch.tensor(np.array(processed_labels), dtype=torch.float32)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, processed_labels, test_size=0.2, random_state=42)

# Custom Dataset for audio data
class AudioDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels.unsqueeze(1)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create datasets and data loaders
batch_size = 8
train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

# Balanced sampling with WeightedRandomSampler
class_counts = np.bincount(y_train.int().numpy())
class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
sample_weights = class_weights[y_train.int()]
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(train_dataset), replacement=True)

train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define GRU model with Batch Normalization
class SentimentModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SentimentModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.batch_norm = nn.BatchNorm1d(hidden_size)  # Batch normalization layer
        self.dropout = nn.Dropout(0.6)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.batch_norm(out[:, -1, :])  # Apply batch normalization
        out = self.dropout(out)
        out = self.fc(out)
        return out

input_size = padded_sequences.shape[2]  # Set to the number of audio features
hidden_size = 128  # Increased hidden size
num_layers = 2     # Two-layer GRU for improved sequence representation
model = SentimentModel(input_size, hidden_size, num_layers).to(device)

# Loss function with class weights for imbalance
pos_weight = torch.tensor([counts[0] / counts[1]], dtype=torch.float32).to(device) if counts[1] else torch.tensor([1.0]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.01)
scaler = GradScaler()

# Early stopping
best_f1 = 0
early_stop_counter = 0
early_stop_patience = 3
train_accuracies = []  # To store training accuracies for each epoch

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_train_predictions = 0
    total_train_samples = 0
    
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        total_loss += loss.item()

        # Calculate training accuracy
        train_preds = (torch.sigmoid(outputs) >= 0.5).float()
        correct_train_predictions += (train_preds == labels).sum().item()
        total_train_samples += labels.size(0)

    # Calculate average training loss and accuracy
    avg_loss = total_loss / len(train_loader)
    train_accuracy = correct_train_predictions / total_train_samples
    train_accuracies.append(train_accuracy * 100)  # Store accuracy percentage
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy * 100:.2f}%")

    # Validation with adjusted threshold
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            all_preds.append(torch.sigmoid(outputs).cpu())
            all_labels.append(labels.cpu())

    all_preds = torch.cat(all_preds).squeeze()
    all_labels = torch.cat(all_labels).squeeze()
    binary_preds = (all_preds >= 0.4).float()  # Using threshold 0.4 instead of 0.5
    accuracy = accuracy_score(all_labels, binary_preds)
    f1 = f1_score(all_labels, binary_preds)
    print(f"Test Accuracy: {accuracy * 100:.2f}%, Test F1-Score: {f1 * 100:.2f}%")

    # Early stopping based on F1-score
    if f1 > best_f1:
        best_f1 = f1
        early_stop_counter = 0
    else:
        early_stop_counter += 1

    if early_stop_counter >= early_stop_patience:
        print("Early stopping triggered.")
        break

# Final evaluation with threshold 0.4
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for data, labels in test_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        all_preds.append(torch.sigmoid(outputs).cpu())
        all_labels.append(labels.cpu())

all_preds = torch.cat(all_preds).squeeze()
all_labels = torch.cat(all_labels).squeeze()
binary_preds = (all_preds >= 0.4).float()
final_accuracy = accuracy_score(all_labels, binary_preds)
final_f1 = f1_score(all_labels, binary_preds)
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")
print(f"Final Test F1-Score: {final_f1 * 100:.2f}%")

Loaded and normalized audio modality data.
Loaded label data.
Label distribution: {0.0: 207, 1.0: 99}
Using device: cuda


  scaler = GradScaler()


Epoch [1/20], Loss: 1.1995, Training Accuracy: 52.87%
Test Accuracy: 41.94%, Test F1-Score: 52.63%
Epoch [2/20], Loss: 1.0889, Training Accuracy: 56.97%
Test Accuracy: 56.45%, Test F1-Score: 57.14%
Epoch [3/20], Loss: 1.0575, Training Accuracy: 57.38%
Test Accuracy: 56.45%, Test F1-Score: 57.14%
Epoch [4/20], Loss: 1.1348, Training Accuracy: 57.79%
Test Accuracy: 56.45%, Test F1-Score: 57.14%
Epoch [5/20], Loss: 1.0105, Training Accuracy: 60.66%
Test Accuracy: 56.45%, Test F1-Score: 57.14%
Early stopping triggered.
Final Test Accuracy: 56.45%
Final Test F1-Score: 57.14%


Multimodal

In [38]:
import numpy as np
import h5py
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
from torch.cuda.amp import GradScaler
from sklearn.metrics import accuracy_score, f1_score

# Paths to data files
TEXT_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_TimestampedWordVectors.csd"
AUDIO_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_COVAREP.csd"
VIDEO_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_VisualFacet42.csd"
LABEL_CSD_PATH = "/mnt/c/Users/yogin/Desktop/HCI_Multimodal-main/data/CMU_MOSEI_Labels.csd"

# Load and prepare modality data with normalization
def load_data(csd_path, modality):
    data = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f[f'/{modality}/data'])[:500]:  # Load a subset if desired
            features = np.array(f[f'/{modality}/data/{video_id}/features'])
            features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
            features = StandardScaler().fit_transform(features)
            data[video_id] = features
    print(f"Loaded and normalized {modality} modality data.")
    return data

def load_labels(csd_path):
    labels = {}
    with h5py.File(csd_path, 'r') as f:
        for video_id in list(f['/All Labels/data'])[:500]:  # Load a subset if desired
            labels[video_id] = f[f'/All Labels/data/{video_id}/features'][()]
    print("Loaded label data.")
    return labels

# Load data
text_data = load_data(TEXT_CSD_PATH, 'glove_vectors')
audio_data = load_data(AUDIO_CSD_PATH, 'COVAREP')
video_data = load_data(VIDEO_CSD_PATH, 'FACET 4.2')
labels = load_labels(LABEL_CSD_PATH)

# Prepare multimodal data
video_ids = [vid for vid in text_data.keys() if vid in audio_data and vid in video_data and vid in labels]
text_sequences = [text_data[vid] for vid in video_ids]
audio_sequences = [audio_data[vid] for vid in video_ids]
video_sequences = [video_data[vid] for vid in video_ids]
target_labels = [labels[vid] for vid in video_ids]

# Pad sequences to fixed length
max_sequence_length = 100
text_sequences = pad_sequences(text_sequences, maxlen=max_sequence_length, dtype='float32', padding='post')
audio_sequences = pad_sequences(audio_sequences, maxlen=max_sequence_length, dtype='float32', padding='post')
video_sequences = pad_sequences(video_sequences, maxlen=max_sequence_length, dtype='float32', padding='post')

# Convert labels to binary
processed_labels = []
for label_array in target_labels:
    avg_label = np.mean(label_array[:, 0]) if label_array.ndim > 1 else label_array[0]
    binary_label = 1.0 if avg_label >= 0.5 else 0.0
    processed_labels.append(binary_label)

unique, counts = np.unique(processed_labels, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))

# Convert data to tensors
text_sequences = torch.tensor(np.array(text_sequences), dtype=torch.float32)
audio_sequences = torch.tensor(np.array(audio_sequences), dtype=torch.float32)
video_sequences = torch.tensor(np.array(video_sequences), dtype=torch.float32)
processed_labels = torch.tensor(np.array(processed_labels), dtype=torch.float32)

# Train-test split
X_train_text, X_test_text, X_train_audio, X_test_audio, X_train_video, X_test_video, y_train, y_test = train_test_split(
    text_sequences, audio_sequences, video_sequences, processed_labels, test_size=0.2, random_state=42
)

class MultimodalDataset(Dataset):
    def __init__(self, text_data, audio_data, video_data, labels):
        self.text_data = text_data
        self.audio_data = audio_data
        self.video_data = video_data
        self.labels = labels.unsqueeze(1)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.text_data[idx], self.audio_data[idx], self.video_data[idx], self.labels[idx]

# Data Loaders
batch_size = 8
train_dataset = MultimodalDataset(X_train_text, X_train_audio, X_train_video, y_train)
test_dataset = MultimodalDataset(X_test_text, X_test_audio, X_test_video, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

import torch
import torch.nn as nn

class MultimodalSentimentModel(nn.Module):
    def __init__(self, text_input_size, audio_input_size, video_input_size, hidden_size, num_layers):
        super(MultimodalSentimentModel, self).__init__()
        
        # Text model (GloVe embedding, 300 dimensions)
        self.text_gru = nn.GRU(text_input_size, hidden_size, num_layers, batch_first=True)
        
        # Audio model (COVAREP features, 74 dimensions)
        self.audio_gru = nn.GRU(audio_input_size, hidden_size, num_layers, batch_first=True)
        
        # Video model (Facet 4.2 features, 35 dimensions)
        self.video_gru = nn.GRU(video_input_size, hidden_size, num_layers, batch_first=True)
        
        # Fusion and classification layers
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_size * 3, 1)  # Combine the hidden states from all three modalities

    def forward(self, text, audio, video):
        # Text processing
        text_out, _ = self.text_gru(text)
        text_out = text_out[:, -1, :]  # Last time step
        
        # Audio processing
        audio_out, _ = self.audio_gru(audio)
        audio_out = audio_out[:, -1, :]  # Last time step
        
        # Video processing
        video_out, _ = self.video_gru(video)
        video_out = video_out[:, -1, :]  # Last time step

        # Concatenate the hidden states from all modalities
        fused = torch.cat((text_out, audio_out, video_out), dim=1)
        fused = self.dropout(fused)
        out = self.fc(fused)
        
        return out


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
text_input_size = 300  # For GloVe embeddings
audio_input_size = 74  # For COVAREP audio features
video_input_size = 35  # For Facet 4.2 video features
hidden_size = 128
num_layers = 2

model = MultimodalSentimentModel(text_input_size, audio_input_size, video_input_size, hidden_size, num_layers).to(device)


# Loss function with class weights
pos_weight = torch.tensor([counts[0] / counts[1]], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
scaler = GradScaler()

# Training loop with training accuracy
num_epochs = 20
best_f1 = 0
best_threshold = 0.4
early_stop_counter = 0
early_stop_patience = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for text, audio, video, labels in train_loader:
        text, audio, video, labels = text.to(device), audio.to(device), video.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(text, audio, video)
        loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Calculate train accuracy and F1
    model.eval()
    train_preds, train_labels = [], []
    with torch.no_grad():
        for text, audio, video, labels in train_loader:
            text, audio, video, labels = text.to(device), audio.to(device), video.to(device), labels.to(device)
            outputs = model(text, audio, video)
            train_preds.append(torch.sigmoid(outputs).cpu())
            train_labels.append(labels.cpu())
    
    train_preds = torch.cat(train_preds).squeeze()
    train_labels = torch.cat(train_labels).squeeze()
    binary_train_preds = (train_preds >= 0.5).float()
    train_accuracy = accuracy_score(train_labels, binary_train_preds)
    train_f1 = f1_score(train_labels, binary_train_preds)
    print(f"Train Accuracy: {train_accuracy * 100:.2f}%, Train F1-Score: {train_f1 * 100:.2f}%")

    # Validation
    all_preds, all_labels = [], []
    with torch.no_grad():
        for text, audio, video, labels in test_loader:
            text, audio, video, labels = text.to(device), audio.to(device), video.to(device), labels.to(device)
            outputs = model(text, audio, video)
            all_preds.append(torch.sigmoid(outputs).cpu())
            all_labels.append(labels.cpu())
    
    all_preds = torch.cat(all_preds).squeeze()
    all_labels = torch.cat(all_labels).squeeze()
    binary_preds = (all_preds >= best_threshold).float()
    accuracy = accuracy_score(all_labels, binary_preds)
    f1 = f1_score(all_labels, binary_preds)
    print(f"Test Accuracy: {accuracy * 100:.2f}%, Test F1-Score: {f1 * 100:.2f}%")

    # Tune threshold for best F1 on validation
    thresholds = np.arange(0.3, 0.6, 0.05)
    best_epoch_f1 = 0
    for threshold in thresholds:
        binary_preds = (all_preds >= threshold).float()
        epoch_f1 = f1_score(all_labels, binary_preds)
        if epoch_f1 > best_epoch_f1:
            best_epoch_f1 = epoch_f1
            best_threshold = threshold

    print(f"Best threshold for epoch {epoch+1}: {best_threshold:.2f}")
    print(f"Best Test F1-Score for epoch {epoch+1}: {best_epoch_f1 * 100:.2f}%")

    # Early stopping based on F1-score
    if best_epoch_f1 > best_f1:
        best_f1 = best_epoch_f1
        early_stop_counter = 0
    else:
        early_stop_counter += 1

    if early_stop_counter >= early_stop_patience:
        print("Early stopping triggered.")
        break

# Final evaluation with best threshold
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for text, audio, video, labels in test_loader:
        text, audio, video, labels = text.to(device), audio.to(device), video.to(device), labels.to(device)
        outputs = model(text, audio, video)
        all_preds.append(torch.sigmoid(outputs).cpu())
        all_labels.append(labels.cpu())

all_preds = torch.cat(all_preds).squeeze()
all_labels = torch.cat(all_labels).squeeze()
binary_preds = (all_preds >= best_threshold).float()
final_accuracy = accuracy_score(all_labels, binary_preds)
final_f1 = f1_score(all_labels, binary_preds)
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")
print(f"Final Test F1-Score: {final_f1 * 100:.2f}%")

# Save the model
torch.save(model.state_dict(), "multimodal_sentiment_model.pth")
print("Model saved as 'multimodal_sentiment_model.pth'")



Loaded and normalized glove_vectors modality data.
Loaded and normalized COVAREP modality data.
Loaded and normalized FACET 4.2 modality data.
Loaded label data.
Label distribution: {0.0: 207, 1.0: 99}


  scaler = GradScaler()


Epoch [1/20], Loss: 0.9291
Train Accuracy: 63.11%, Train F1-Score: 57.14%
Test Accuracy: 32.26%, Test F1-Score: 48.78%
Best threshold for epoch 1: 0.50
Best Test F1-Score for epoch 1: 56.67%
Epoch [2/20], Loss: 0.9181
Train Accuracy: 75.82%, Train F1-Score: 66.29%
Test Accuracy: 66.13%, Test F1-Score: 55.32%
Best threshold for epoch 2: 0.50
Best Test F1-Score for epoch 2: 55.32%
Epoch [3/20], Loss: 0.8943
Train Accuracy: 76.23%, Train F1-Score: 67.42%
Test Accuracy: 62.90%, Test F1-Score: 51.06%
Best threshold for epoch 3: 0.50
Best Test F1-Score for epoch 3: 51.06%
Epoch [4/20], Loss: 0.8808
Train Accuracy: 72.54%, Train F1-Score: 64.55%
Test Accuracy: 66.13%, Test F1-Score: 60.38%
Best threshold for epoch 4: 0.50
Best Test F1-Score for epoch 4: 60.38%
Epoch [5/20], Loss: 0.8717
Train Accuracy: 69.67%, Train F1-Score: 63.73%
Test Accuracy: 61.29%, Test F1-Score: 58.62%
Best threshold for epoch 5: 0.50
Best Test F1-Score for epoch 5: 58.62%
Epoch [6/20], Loss: 0.8311
Train Accuracy: 75

Audio