In [None]:
from mmsdk import mmdatasdk
import os

# Directory expected to contain the CMU-MOSI .csd files; created if missing.
data_path = r"dataset"
os.makedirs(data_path, exist_ok=True)

# Short feature name -> .csd file name inside `data_path`.
csd_files = {
    "covarep": "CMU_MOSI_COVAREP.csd",
    "facet41": "CMU_MOSI_Visual_Facet_41.csd",
    "facet42": "CMU_MOSI_Visual_Facet_42.csd",
    "opensmile_eb10": "CMU_MOSI_OpenSmile_EB10.csd",
    "words": "CMU_MOSI_TimestampedWords.csd",
    "wordvec": "CMU_MOSI_TimestampedWordVectors.csd",
    "labels": "CMU_MOSI_Opinion_Labels.csd",
}

# Short feature name -> path handed to mmdatasdk ("/"-joined, same key order as above).
features = {name: data_path + "/" + file_name for name, file_name in csd_files.items()}

# Load every sequence listed in `features`, then align the dataset on the "labels" sequence.
dataset = mmdatasdk.mmdataset(features)
dataset.align("labels")

# Show the entry keys of the COVAREP sequence after alignment.
covarep_keys = dataset["covarep"].keys()
print(covarep_keys)

# Inspect the feature array shape of the first COVAREP entry.
sample = list(covarep_keys)[0]
print(dataset["covarep"][sample]['features'].shape)


In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

# NOTE(review): `X_audio` is not defined in any cell visible here; it has to be
# created before this cell runs on a fresh kernel — confirm where it comes from.
# Presumably it is an iterable of per-sample audio feature arrays.

# Convert every entry of X_audio to a float32 tensor.
audio_tensors = [
    torch.tensor(sequence, dtype=torch.float32)
    for sequence in X_audio
]

# Stack the tensors into one batch-first tensor, padded to the longest sequence
# using pad_sequence's default padding value.
audio_padded = pad_sequence(sequences=audio_tensors, batch_first=True)

In [15]:
import torch
import torch.nn as nn


def _segment_tensor(sequence_name, segment):
    """Return dataset[sequence_name][segment]['features'] as a float32 tensor."""
    return torch.tensor(dataset[sequence_name][segment]['features'], dtype=torch.float32)


# Use the first entry of the "labels" sequence as the example segment.
label_keys = dataset["labels"].keys()
sample = list(label_keys)[0]
print(sample)

# Number of entries in the "labels" sequence.
print(len(label_keys))

# Audio features (COVAREP) -- expected shape (T_audio, D_audio); TODO confirm D_audio
audio_feat = _segment_tensor("covarep", sample)
print(f"{audio_feat = }")

# Video features (FACET 4.1) -- expected shape (T_video, D_video); TODO confirm D_video
video_feat = _segment_tensor("facet41", sample)
print(f"{video_feat = }")

# Text features (word vectors) -- expected shape (T_text, D_text); TODO confirm D_text
text_feat = _segment_tensor("wordvec", sample)
print(f"{text_feat = }")

# Segment label -- must hold exactly one value, since .item() is called on it later
label = _segment_tensor("labels", sample)
print(f"{label = }")


# One bidirectional LSTM encoder per modality.
#
# The input width of each encoder is read from the last dimension of the
# corresponding feature tensor instead of being hard-coded. The previous
# literals (74 / 47 / 300) were magic numbers, and the video one disagreed with
# the "(T_video, 41)" shape comments, so it was unclear which was correct.
# Deriving the width from the data builds the same modules when the data has
# those widths and cannot produce a size mismatch otherwise.
HIDDEN_SIZE = 128  # per direction; a bidirectional LSTM outputs 2 * HIDDEN_SIZE features

audio_lstm = nn.LSTM(input_size=audio_feat.shape[-1], hidden_size=HIDDEN_SIZE, batch_first=True, bidirectional=True)
video_lstm = nn.LSTM(input_size=video_feat.shape[-1], hidden_size=HIDDEN_SIZE, batch_first=True, bidirectional=True)
text_lstm  = nn.LSTM(input_size=text_feat.shape[-1], hidden_size=HIDDEN_SIZE, batch_first=True, bidirectional=True)

# Add batch dimension (batch=1); the LSTMs above are batch_first.
audio_in = audio_feat.unsqueeze(0)  # (1, T_audio, D_audio)
video_in = video_feat.unsqueeze(0)  # (1, T_video, D_video)
text_in  = text_feat.unsqueeze(0)   # (1, T_text, D_text)

# Forward pass through each LSTM; the returned (h_n, c_n) states are discarded.
audio_out, _ = audio_lstm(audio_in)  # (1, T_audio, 2 * HIDDEN_SIZE)
video_out, _ = video_lstm(video_in)  # (1, T_video, 2 * HIDDEN_SIZE)
text_out,  _ = text_lstm(text_in)    # (1, T_text, 2 * HIDDEN_SIZE)



def mean_pool(x):
    """Average `x` over dimension 1.

    Args:
        x: tensor with at least two dimensions; the callers in this notebook
           pass (1, T, D) sequence outputs.

    Returns:
        Tensor with dimension 1 reduced away, e.g. (1, T, D) -> (1, D).
    """
    return torch.mean(x, dim=1)

# Collapse the time axis of each modality's LSTM output with mean_pool.
audio_vec, video_vec, text_vec = (
    mean_pool(sequence_out) for sequence_out in (audio_out, video_out, text_out)
)

# Late fusion: concatenate the three pooled vectors along the feature axis.
fused = torch.cat((audio_vec, video_vec, text_vec), dim=1)  # (1, 768) when each vector is (1, 256)

# Regression head: 768 -> 256 -> 128 -> 1, with ReLU + Dropout after each hidden layer.
mlp_layers = [
    nn.Linear(256*3, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(128, 1),  # single output: predicted sentiment
]
mlp = nn.Sequential(*mlp_layers)

# NOTE: the head is freshly initialised and left in its default training mode,
# so Dropout is active and the printed prediction is not a meaningful estimate.
y_pred = mlp(fused)
print("Predicted sentiment:", y_pred.item())
print("True label:", label.item())


03bSnISJMiM[0]
2183
audio_feat = tensor([[2.6000e+02, 1.0000e+00, 2.1585e-01,  ..., 4.3386e-01, 4.0485e-01,
         2.5323e-01],
        [2.5900e+02, 1.0000e+00, 1.0248e-01,  ..., 4.2773e-01, 4.1049e-01,
         2.4466e-01],
        [2.6300e+02, 1.0000e+00, 2.6963e-01,  ..., 4.1255e-01, 4.0917e-01,
         2.3894e-01],
        ...,
        [2.0550e+02, 1.0000e+00, 1.7049e-01,  ..., 6.4619e-02, 1.5919e-01,
         2.2397e-01],
        [2.0650e+02, 1.0000e+00, 2.2382e-01,  ..., 4.7717e-02, 1.7221e-01,
         2.4578e-01],
        [2.0700e+02, 1.0000e+00, 1.8770e-01,  ..., 3.5531e-02, 1.8578e-01,
         2.6337e-01]])
video_feat = tensor([[ 1.3100e+02,  4.1000e+01,  2.7700e+02,  ...,  2.2776e-01,
          1.1950e-01, -6.5578e-01],
        [ 1.3200e+02,  4.1000e+01,  2.7600e+02,  ...,  2.6919e-01,
          2.4537e-01, -5.9025e-01],
        [ 1.4000e+02,  4.1000e+01,  2.7900e+02,  ...,  3.6260e-01,
          4.3830e-01, -5.2462e-01],
        ...,
        [ 2.9100e+02,  6.7000e+01,  

In [16]:
# Entry keys of the "labels" sequence.
segments = dataset["labels"].keys()

# Each key is reduced to the text before its first "[" (e.g. "03bSnISJMiM[0]" -> "03bSnISJMiM");
# the set keeps one copy of each resulting video ID.
video_ids = {segment_key.partition("[")[0] for segment_key in segments}

print("Unique video count:", len(video_ids))
print("Video IDs:", video_ids)


Unique video count: 92
Video IDs: {'HEsqda8_d0Q', 'Njd1F0vZSm4', 'cW1FSBF59ik', 'cXypl4FnoZo', 'VbQk4H8hgr0', '73jzhE8R1TQ', 'G6GlGvlkxAQ', 'QN9ZIUWUXsY', 'rnaNMUZpvvg', 'dq3Nf_lMPnE', 'pLTX3ipuDJI', 'Qr1Ca94K55A', 'jUzDDGyPkXU', 'PZ-lDQFboO8', 'v0zCBqDeKcE', 'fvVhgmXxadc', 'vyB00TXsimI', 'tmZoasNr4rU', 'OtBXNcAL_lE', 'WKA5OygbEKI', 'BI97DNYfe5I', 'nbWiPyCm4g0', 'zhpQhgha_KU', '03bSnISJMiM', '7JsX8y1ysxY', '6_0THN4chvY', 'k5Y_838nuGo', 'IumbAb8q2dM', 'yDtzw_Y-7RU', 'G-xst2euQUc', '6Egk_28TtTM', 'POKffnXeBds', 'ob23OKe5a9Q', '5W7Z1C_fDaE', 'VCslbP0mgZI', 'OQvJTdtJ2H4', 'I5y0__X72p0', 'tIrG4oNLFzE', '8d-gEyoeBzc', 'atnd_PF-Lbs', 'cM3Yna7AavY', 'yvsjCA6Y5Fc', 'MLal-t_vJPM', 'nzpVDcQ0ywM', 'Clx4VXItLTE', 'bOL9jKpeJRs', 'vvZ4IcEtiZc', 'ZUXBRvtny7o', 'Iu2PFX3z_1s', '0h-zjBukYpk', 'Nzq88NnDkEk', 'iiK8YX8oH1E', 'Dg_0XKD0Mf4', '8OtFthrtaJM', 'tStelxIAHjw', 'aiEXnCPZubE', 'Vj1wYRQjB-o', '_dI--eQ6qVU', '9J25DZhivz8', '2iD-tVS8NPw', 'TvyZBvOMOTc', 'BvYR0L6f2Ig', 'd3_k5Xpfmik', 'Af8D0E4ZXaw', 'W8NX