
**Loading the CMU-MOSI dataset**

In [None]:
!git clone https://github.com/pliang279/MultiBench.git

Cloning into 'MultiBench'...
remote: Enumerating objects: 6943, done.[K
remote: Counting objects: 100% (154/154), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 6943 (delta 72), reused 121 (delta 60), pack-reused 6789[K
Receiving objects: 100% (6943/6943), 51.07 MiB | 22.08 MiB/s, done.
Resolving deltas: 100% (4258/4258), done.


In [None]:
%cd MultiBench

/content/MultiBench


## Run the following cell to download the data file `mosi_raw.pkl`, then use its path when loading the data below

In [None]:
!mkdir data
!pip install gdown && gdown https://drive.google.com/u/0/uc?id=1szKIqO0t3Be_W91xvf6aYmsVVUa7wDHU

mkdir: cannot create directory ‘data’: File exists


In [None]:
import torch
import sys
import os

In [None]:
# Import the associated dataloader for affect datasets, which MOSI is a part of.
from datasets.affect.get_data import get_dataloader

# Build the pickle path from the working directory instead of hard-coding the
# Colab location. After `%cd MultiBench` on Colab this resolves to the same
# '/content/MultiBench/mosi_raw.pkl', and it keeps working on other machines.
mosi_pkl_path = os.path.join(os.getcwd(), 'mosi_raw.pkl')

# Create the training, validation, and test-set dataloaders.
# Sequences are requested with max_pad=True and max_seq_len=20.
traindata, validdata, testdata = get_dataloader(
    mosi_pkl_path, robust_test=False, max_pad=True, data_type='mosi', max_seq_len=20)

----------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
import torch.nn.functional as F

class SubnetModel(nn.Module):
    """Feed-forward subnetwork applied to one sequence modality.

    The input is averaged over dimension 1 and then pushed through three
    Linear -> ReLU -> Dropout(p=0.15) stages.

    Args:
        input_size: number of input features fed to the first linear layer.
        num_utterances: accepted for interface compatibility; not used.
        fc1_size: output width of the first linear layer.
        fc2_size: output width of the second linear layer.
        fc3_size: output width of the third linear layer (the model output).
    """

    def __init__(self, input_size, num_utterances, fc1_size, fc2_size, fc3_size):
        super().__init__()

        # One dropout module shared by all three stages.
        self.drop = nn.Dropout(p=0.15)

        # Fully connected stack; attribute names define the state_dict keys.
        self.fc1 = nn.Linear(input_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, fc3_size)

        # Shared activation.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Average `x` over dim 1 and return the output of the third stage."""
        hidden = x.mean(dim=1)

        # Same three stages in the same order: linear, ReLU, dropout.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.drop(self.relu(layer(hidden)))

        return hidden



In [None]:
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter

class TextModel(nn.Module):
    """LSTM encoder followed by two Linear -> ReLU layers.

    Args:
        input_size: number of features per time step fed to the LSTM.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        fc1_size: output width of the first linear layer.
        fc2_size: output width of the second linear layer (the model output).
    """

    def __init__(self,input_size, hidden_size, num_layers,fc1_size, fc2_size):
        super(TextModel, self).__init__()

        # LSTM layer (stacked LSTM); batch_first=True, so inputs are batch-major.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,batch_first=True)

        # Dropout module kept as an attribute; forward() does not apply it.
        self.drop = nn.Dropout(p=0.15)

        # fc1 gets hidden_size dimension values as input
        self.fc1 = nn.Linear(hidden_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)

        # Activation functions
        self.relu = nn.ReLU()

    def forward(self, x):
        """Encode `x` with the LSTM and return the fc2 activations.

        The final hidden state of the last LSTM layer is used as the
        sequence representation.
        """
        _, (hidden_states, _) = self.lstm(x)

        # Select the last layer explicitly instead of calling squeeze():
        # squeeze() also removed the batch dimension for a batch of one and
        # left a leading layer dimension when num_layers > 1.
        last_hidden = hidden_states[-1]

        fc1_out = self.relu(self.fc1(last_hidden))
        fc2_out = self.relu(self.fc2(fc1_out))

        return fc2_out


In [None]:
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
from torch.autograd import Variable

class TFN(nn.Module):
    """Tensor-fusion regressor over audio, video and text inputs.

    Each modality is encoded by its own subnetwork, a constant 1 is prepended
    to every embedding, and the three vectors are combined by successive
    outer products. The flattened fusion tensor goes through two
    Linear -> ReLU -> Dropout stages and a sigmoid output rescaled to (-3, 3).

    Args:
        audio_params: (feature_length, max_seq_len, fc1_size, fc2_size, fc3_size)
            forwarded to SubnetModel.
        video_params: same layout as audio_params.
        text_params: (feature_length, LSTM_hidden_size, num_LSTM_layers,
            fc1_size, fc2_size) forwarded to TextModel.
        SIN_params: (fc1_size, fc2_size) of the fusion head.
    """

    def __init__(self,audio_params,video_params,text_params,SIN_params):
        super(TFN, self).__init__()

        # Fixed output scaling: sigmoid * 6 - 3. Kept as non-trainable
        # Parameters so state_dict keys and parameter order stay unchanged.
        self.output_range = Parameter(torch.FloatTensor([6]), requires_grad=False)
        self.output_shift = Parameter(torch.FloatTensor([-3]), requires_grad=False)

        self.audio_params=audio_params
        self.video_params=video_params
        self.text_params=text_params

        # Unimodal subnetworks
        self.audio_subnet=SubnetModel(audio_params[0],audio_params[1],audio_params[2],audio_params[3],audio_params[4])
        self.video_subnet=SubnetModel(video_params[0],video_params[1],video_params[2],video_params[3],video_params[4])
        self.text_subnet=TextModel(text_params[0],text_params[1],text_params[2],text_params[3],text_params[4])

        self.drop = nn.Dropout(p=0.15)

        # The fusion width must use each subnet's OUTPUT width, which is the
        # last FC size (index 4), plus one for the prepended constant. The
        # earlier indices (2, 2, 3) only matched when all FC sizes were equal.
        audio_dim = audio_params[4] + 1
        video_dim = video_params[4] + 1
        text_dim = text_params[4] + 1

        # Fully connected layers of the fusion head
        self.fc1 = nn.Linear(audio_dim * video_dim * text_dim, SIN_params[0])
        self.fc2 = nn.Linear(SIN_params[0], SIN_params[1])

        # Output layer
        self.output_layer = nn.Linear(SIN_params[1], 1)

        # Activation functions
        self.relu = nn.ReLU()
        self.sigmoid=nn.Sigmoid()

    def forward(self,x):
        """Return one prediction per sample, in (-3, 3).

        Args:
            x: indexable with x[0] = audio, x[1] = video, x[2] = text input.
               Audio/video are averaged over dim 1 by their subnet and text is
               fed to a batch_first LSTM, so each is presumably
               (batch, seq, features) — TODO confirm against the dataloader.
        """
        batch_size=x[0].shape[0]

        # Unimodal embeddings
        audio_out=self.audio_subnet(x[0])
        video_out=self.video_subnet(x[1])
        text_out=self.text_subnet(x[2])

        # Prepend a constant 1 to each embedding. new_ones() inherits dtype
        # and device from the embedding, so this also works off-CPU.
        audio_out = torch.cat((audio_out.new_ones(batch_size, 1), audio_out), dim=1)
        video_out = torch.cat((video_out.new_ones(batch_size, 1), video_out), dim=1)
        text_out = torch.cat((text_out.new_ones(batch_size, 1), text_out), dim=1)

        # Tensor fusion: (B, A, 1) x (B, 1, V) -> (B, A, V)
        fusion_tensor = torch.bmm(audio_out.unsqueeze(2), video_out.unsqueeze(1))

        # Flatten the audio-video plane to a column: (B, A*V, 1)
        fusion_tensor = fusion_tensor.view(batch_size, -1, 1)

        # (B, A*V, 1) x (B, 1, T) -> (B, A*V, T), flattened to (B, A*V*T)
        fusion_tensor = torch.bmm(fusion_tensor, text_out.unsqueeze(1)).view(batch_size, -1)

        fc1_out = self.relu(self.fc1(fusion_tensor))
        drop1=self.drop(fc1_out)

        fc2_out = self.relu(self.fc2(drop1))
        drop2=self.drop(fc2_out)

        # Output layer with Sigmoid activation
        output = self.sigmoid(self.output_layer(drop2))

        # Rescale the sigmoid output to lie between -3 and +3
        output=(output*self.output_range)+self.output_shift

        return output

audio_out

torch.Size([32, 33])

then changed to torch.Size([32,33,1])

video_out

torch.Size([32, 33])

then changed to torch.Size([32,1,33])

text_out

torch.Size([32, 129])

fusion tensor 1

torch.Size([32, 33, 33])

fusion tensor 2

torch.Size([32, 1089, 1])

fusion tensor 3

text_out is first reshaped to torch.Size([32, 1, 129]) and then batch-multiplied (bmm) with fusion tensor 2

this gives torch.Size([32, 1089, 129]), which is then flattened to:

torch.Size([32, 140481]) to feed to the fully connected network

i.e. torch.Size([32, 33 x 33 x 129])

**-------------------------------------------------------------------------------------------------------------------------------------**

In [None]:
# Kept equal to the max_seq_len=20 passed to get_dataloader when the data was
# loaded. SubnetModel accepts this value but does not use it, so changing it
# does not alter the model.
max_seq_len=20

audio_params=(35,max_seq_len,32,32,32) # (feature_length,max_seq_len,fc1_size,fc2_size,fc3_size)
video_params=(74,max_seq_len,32,32,32) # (feature_length,max_seq_len,fc1_size,fc2_size,fc3_size)
text_params=(300,128,1,128,128) # (feature_length,LSTM_hidden_size,num_LSTM_layers,fc1_size,fc2_size)

# (fc1_size, fc2_size) of the fusion head that follows the tensor-fusion step.
SIN_params=(128,128)

final_model=TFN(audio_params,video_params,text_params,SIN_params)

loading weights of saved model

In [None]:
# final_model.load_state_dict(torch.load('/content/drive/MyDrive/multi_model_SA/TFN_MAE-1.4939_50epochs.pth'))

<All keys matched successfully>

In [None]:
# Mean-squared-error objective for the regression output.
Loss = torch.nn.MSELoss()

# Optimise only the trainable parameters. Filtering on requires_grad skips the
# frozen output_range / output_shift constants without relying on their
# position in parameters().
trainable_params = [p for p in final_model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params,lr=5e-4,weight_decay=0.01)
num_epochs = 20

# `testdata` is deliberately NOT rebound to `validdata` here, so the final
# evaluation cell measures the held-out test split returned by get_dataloader.

In [None]:
from sklearn.metrics import r2_score
import numpy as np
from sklearn.metrics import mean_absolute_error
import scipy.stats

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Train for num_epochs epochs; after each epoch report MAE and Pearson r on the
# training batches and on the validation set. No early stopping is performed.
for epoch in range(num_epochs):

    print("EPOCH : ",epoch+1)

    # ---- Training ----
    final_model.train()  # Set the model to training mode
    total_train_loss = 0.0
    num_sequences = 0  # number of batches seen this epoch
    train_preds = []
    train_targets = []
    for batch in traindata:
        optimizer.zero_grad()  # Zero the gradients
        labels = batch[-1]  # last element of the batch holds the targets
        outputs = final_model(batch[:-1])
        loss = Loss(outputs, labels)

        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        # Accumulate a plain float and detached tensors so the autograd graph
        # of every batch is released instead of being kept for the whole epoch.
        total_train_loss += loss.item()
        train_preds.append(outputs.detach().cpu())
        train_targets.append(labels.detach().cpu())
        num_sequences += 1

    # Single concatenation + conversion instead of per-sample numpy calls.
    all_predictions = torch.cat(train_preds).numpy()
    all_targets = torch.cat(train_targets).numpy()

    average_train_loss = total_train_loss / num_sequences
    train_mae= mean_absolute_error(all_targets, all_predictions)
    r = scipy.stats.pearsonr(all_targets.ravel(), all_predictions.ravel())
    print("-------------Training----------------")
    print(f'Epoch [{epoch + 1}/{num_epochs}],MAE:{train_mae:.4f} ,r:{r[0]:.4f}')

    # ---- Validation ----
    final_model.eval()  # Set the model to evaluation mode
    total_val_loss = 0.0
    num_sequences = 0
    val_preds = []
    val_targets_all = []
    with torch.no_grad():
        for batch in validdata:
            val_targets = batch[-1]
            val_outputs = final_model(batch[:-1])
            val_loss = Loss(val_outputs, val_targets)
            total_val_loss += val_loss.item()

            val_preds.append(val_outputs.cpu())
            val_targets_all.append(val_targets.cpu())
            num_sequences += 1

    average_val_loss = total_val_loss / num_sequences
    val_all_predictions = torch.cat(val_preds).numpy()
    val_all_targets = torch.cat(val_targets_all).numpy()
    mae = mean_absolute_error(val_all_targets, val_all_predictions)
    val_r = scipy.stats.pearsonr(val_all_targets.ravel(), val_all_predictions.ravel())

    print("--------------Validation----------")
    print(f'Epoch [{epoch + 1}/{num_epochs}],mae: {mae:.4f},r:{val_r[0]:.4f}')

    print("\n \n")


EPOCH :  1
-------------Training----------------
Epoch [1/20],MAE:0.7249 ,r:0.7849
--------------Validation----------
Epoch [1/20],mae: 1.1092,r:0.5312

 

EPOCH :  2
-------------Training----------------
Epoch [2/20],MAE:0.7104 ,r:0.7957
--------------Validation----------
Epoch [2/20],mae: 1.0841,r:0.5393

 

EPOCH :  3


KeyboardInterrupt: 

In [None]:

from sklearn.metrics import mean_absolute_error

# Evaluate the model on whatever loader `testdata` currently refers to and
# report MAE and Pearson r.
final_model.eval()

test_preds = []
test_targets_all = []
with torch.no_grad():
    for batch in testdata:
        test_targets = batch[-1]  # last element of the batch holds the targets
        test_outputs = final_model(batch[:-1])

        test_preds.append(test_outputs.cpu())
        test_targets_all.append(test_targets.cpu())

# Concatenate once and convert to numpy, rather than calling np.array on a
# list of tensors (implicit per-element conversion, CPU-only).
all_predictions = torch.cat(test_preds).numpy()
all_targets = torch.cat(test_targets_all).numpy()
mae = mean_absolute_error(all_targets, all_predictions)
r =  scipy.stats.pearsonr(all_targets.ravel(), all_predictions.ravel())

print(f'Test MAE: {mae:.4f} , r:{r[0]:.4f}')


Test MAE: 1.1014 , r:0.5282


----------------------------------------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------------------------------------------

### **keeping everything according to paper:**

1) lr=0.0005 , weight_decay=0.01 , 100 epochs(50/100) - mae = 1.28 on val and 1.49 on test

### **not keeping everything same as paper:**

added dropout to text subnetwork along with others

1) lr=0.0005 , weight_decay=0.01 , 100 epochs(40/100) - mae = 1.33 on val and 1.54 on test

added dropout to text subnetwork and removed from sentiment

2) lr=0.0005 , weight_decay=0.01 , 100 epochs(40/100) - mae = 1.34 on val and 1.55 on test

# Saving the model:


In [None]:
# Save only the model weights (state_dict) to a hard-coded Google Drive path.
# NOTE(review): this needs Drive mounted at /content/drive and the target
# folder to exist; no mount cell is visible in this notebook — confirm.
torch.save(final_model.state_dict(), '/content/drive/MyDrive/multi_model_SA/TFN_regression')

In [None]:
# Save the same state_dict under a file name that encodes a metric and epoch
# count. The name is fixed text, so it does not track the current run.
# NOTE(review): needs Google Drive mounted at /content/drive — confirm.
torch.save(final_model.state_dict(), '/content/drive/MyDrive/multi_model_SA/TFN_MAE-1.4939_50epochs.pth')

------------------------------------------------------------