In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset root used
# further down lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install wandb

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch import optim

from sklearn.metrics import accuracy_score
import gc
import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime

import warnings
warnings.filterwarnings('ignore')

# Added myself
import torchvision.transforms as tvt
import torchaudio.transforms as tat
import wandb
# Start a Weights & Biases run with default settings; train() sends its
# per-epoch metrics to this run through wandb.log.
wandb.init()

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

In [None]:
class LibriSamples(torch.utils.data.Dataset):
    """Paired audio/text clips with binarised personality-trait labels.

    Files read relative to ``data_path``:

    * ``Mean_Scores.csv`` -- one row per clip, with a ``Clip_ID`` column and
      one score column per entry of ``TRAITS``;
    * ``MFCCs/<Clip_ID>.npy`` -- audio features of the clip;
    * ``Text Embeddings/<Clip_ID>.npy`` -- text features of the clip.

    Args:
        data_path: Root directory of the dataset.
        partition: ``"train"`` keeps the first ``train_size`` CSV rows,
            ``"val"`` keeps the remaining rows; any other value keeps all rows.
        train_size: Number of leading CSV rows that form the training split
            (default 512, the previously hard-coded value).
        max_frames: Length the last axis of every audio tensor is padded to by
            ``collate_fn`` (default 401, the previously hard-coded value).
    """

    # Column order of the label vector returned by __getitem__.
    TRAITS = ('Extraversion', 'Agreeableness', 'Conscientiousness', 'Neuroticism', 'Openness')
    # A score strictly above this threshold becomes label 1.0, otherwise 0.0.
    LABEL_THRESHOLD = 0.5
    # Class-level default so collate_fn still works on an instance whose
    # __init__ was bypassed; __init__ overrides it per instance.
    max_frames = 401

    def __init__(self, data_path, partition= "train", train_size=512, max_frames=401):
        self.X_dir = data_path + "/" + "MFCCs/"
        self.Y_dir = data_path + "/" + "Text Embeddings/"
        self.Z_dir = data_path

        self.partition = partition
        self.train_size = train_size
        self.max_frames = max_frames

        scores = pd.read_csv(self.Z_dir + "/Mean_Scores.csv")
        if partition == 'train':
            scores = scores[:train_size]
        elif partition == 'val':
            scores = scores[train_size:]

        # Renumber rows from 0 so __getitem__ can address them with 0..len-1;
        # the previous row labels are kept in an extra 'index' column.
        self.df_scores = scores.reset_index()

    def __len__(self):
        """Number of clips in this partition."""
        return len(self.df_scores)

    def __getitem__(self, ind):
        """Load one clip.

        Returns:
            (X, Y, z_vector): the audio features and the text features as
            tensors built from the ``.npy`` files, and a float tensor of
            ``len(TRAITS)`` zeros/ones (1.0 where the score exceeds
            ``LABEL_THRESHOLD``).
        """
        clip_id = self.df_scores['Clip_ID'][ind]
        X = torch.from_numpy(np.load(self.X_dir + clip_id + ".npy"))
        Y = torch.from_numpy(np.load(self.Y_dir + clip_id + ".npy"))

        scores = np.array([self.df_scores[trait][ind] for trait in self.TRAITS], dtype=np.float64)
        z_vector = torch.from_numpy(scores > self.LABEL_THRESHOLD).type(torch.float)

        return X, Y, z_vector

    def collate_fn(self, batch):
        """Merge ``(X, Y, z_vector)`` samples into batch tensors.

        Each X is indexed with ``shape[2]``, so it must be 3-D
        (presumably ``(1, n_coeffs, n_frames)`` -- TODO confirm against the
        stored MFCC files). Its last axis is zero-padded on the right up to
        ``max_frames`` and the results are concatenated along axis 0.
        NOTE(review): for a clip longer than ``max_frames`` the pad amount is
        negative, which ``F.pad`` is expected to treat as cropping -- confirm.

        Returns:
            (audio, text, labels): concatenated padded X tensors, stacked Y
            tensors and stacked label vectors.
        """
        padded_x, batch_y, batch_z = [], [], []
        for x, y, z_vector in batch:
            pad_right = self.max_frames - x.shape[2]
            padded_x.append(torch.nn.functional.pad(x, (0, pad_right, 0, 0), value = 0))
            batch_y.append(y)
            batch_z.append(z_vector)

        return torch.cat(padded_x), torch.stack(batch_y), torch.stack(batch_z)

In [None]:
# Dataset root on the mounted Google Drive.
root="/content/drive/MyDrive/IDL Project/Final-Submission/Merged Dataset"
# Mini-batch size shared by the training and validation loaders.
BATCH_SIZE = 8

train_data = LibriSamples(root, 'train')
val_data = LibriSamples(root,'val')

# Only the training loader shuffles; each loader collates with its own
# dataset's collate_fn instead of the validation loader borrowing train_data's.
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True, num_workers = 0, collate_fn = train_data.collate_fn)
val_loader = DataLoader(val_data, batch_size = BATCH_SIZE, shuffle = False, num_workers = 0, collate_fn = val_data.collate_fn)


In [None]:
#Although we designed the architecture ourselves, it was inspired by a homework assignment from another CMU course that every member of the team has taken:
#16-824 Visual Learning and Recognition, which we took in Spring 2022.
#The homework was based on the paper at https://arxiv.org/pdf/1606.00061.pdf

In [None]:
class QuestionFeatureExtractor(nn.Module):
      """
      Placeholder for a question (text) feature extractor.

      NOTE(review): the class body consists of this docstring only -- no
      __init__ or forward is defined, and nothing else in the visible notebook
      references the class.

      Interface described by the original notes:

      Inputs:
          Q: question_encoding in a shape of B x T x word_inp_size
      Outputs:
          qw: word-level feature in a shape of B x T x embedding_size
          qs: phrase-level feature in a shape of B x T x embedding_size
          qt: sentence-level feature in a shape of B x T x embedding_size
      """


class AlternatingCoAttention(nn.Module):
    """
    The Alternating Co-Attention module as in (Lu et al, 2017) paper Sec. 3.3.

    Attention is computed in three passes: first over Q alone, then over V
    guided by the attended Q, and finally over Q again guided by the attended V.

    Args:
        d: feature size of both inputs (last axis of Q and V).
        k: size of the hidden attention space.
        dropout: dropout probability applied to the hidden attention features.
    """
    def __init__(self, d=40, k=40, dropout=0.1):
        super().__init__()
        self.d = d
        self.k = k

        # Pass 1: attend over Q without guidance.
        self.Wx1 = nn.Linear(d, k)
        self.whx1 = nn.Linear(k, 1)

        # Pass 2: attend over V, guided by the attended Q.
        self.Wx2 = nn.Linear(d, k)
        self.Wg2 = nn.Linear(d, k)
        self.whx2 = nn.Linear(k, 1)

        # Pass 3: attend over Q again, guided by the attended V.
        self.Wx3 = nn.Linear(d, k)
        self.Wg3 = nn.Linear(d, k)
        self.whx3 = nn.Linear(k, 1)

        self.dropout = nn.Dropout(p=dropout)

    def _attend(self, feats, pre_activation, scorer):
        """Softmax-weighted sum of ``feats`` over axis 1; returns B x 1 x d."""
        H = self.dropout(torch.tanh(pre_activation))
        weights = F.softmax(scorer(H), dim=1)  # B x L x 1, sums to 1 over L
        return torch.sum(feats * weights, dim=1, keepdim=True)

    def forward(self, Q, V):
        """
        Inputs:
            Q: question feature in a shape of BxTxd
            V: image feature in a shape of BxNxd
        Outputs:
            shat: attended question feature in a shape of Bxd
            vhat: attended image feature in a shape of Bxd
        """
        # 1st step
        shat = self._attend(Q, self.Wx1(Q), self.whx1)

        # 2nd step (the B x 1 x k guidance term broadcasts over the N positions)
        vhat = self._attend(V, self.Wx2(V) + self.Wg2(shat), self.whx2)

        # 3rd step
        shat2 = self._attend(Q, self.Wx3(Q) + self.Wg3(vhat), self.whx3)

        # Drop only the summed-out axis. A bare squeeze() would also remove the
        # batch axis when B == 1 (and the feature axis when d == 1).
        return shat2.squeeze(1), vhat.squeeze(1)

In [None]:
class CoattentionNet(nn.Module):
    """
    Fuses a clip's text embedding with its audio features through alternating
    co-attention and emits one logit per label.

    The design follows the Hierarchical Question-Image Co-Attention for Visual
    Question Answering (Lu et al, 2017) paper, with the text embedding in the
    role of the question and the audio frames in the role of the image regions.

    Args:
        text_dim: size of the incoming text embedding.
        feat_dim: shared feature size; the audio input must have this many
            channels because it is attended jointly with the projected text.
        hidden_dim: size of the layer feeding the classifier.
        num_classes: number of output logits.
        dropout: dropout probability used inside the attention module.
    """
    def __init__(self, text_dim=1024, feat_dim=40, hidden_dim=30, num_classes=5, dropout=0.1):
        super().__init__()
        self.text_feats_layer = nn.Linear(text_dim, feat_dim)

        self.attention_layer = AlternatingCoAttention(d=feat_dim, k=feat_dim, dropout=dropout)

        self.Ww = nn.Linear(feat_dim, feat_dim)
        self.Ws = nn.Linear(feat_dim, hidden_dim)

        # NOTE(review): defined but never applied in forward(); kept so the
        # module's attributes stay as they were.
        self.dropout = nn.Dropout(p = dropout)

        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, audio_feats, text_feats):
        """
        Inputs:
            audio_feats: B x feat_dim x N audio features (permuted to
                B x N x feat_dim before attention)
            text_feats: B x text_dim text embedding
        Outputs:
            logits in a shape of B x num_classes (no sigmoid applied)
        """
        batch_size = audio_feats.shape[0]

        # Treat the text embedding as a length-1 sequence for the attention.
        text_feats = self.text_feats_layer(text_feats.unsqueeze(dim = 1))

        audio_feats = audio_feats.permute(0, 2, 1)
        shat, vhat = self.attention_layer(text_feats, audio_feats)

        # Force B x feat_dim so a single-sample batch cannot lose its batch
        # axis and still yields B x num_classes logits.
        shat = shat.reshape(batch_size, -1)
        vhat = vhat.reshape(batch_size, -1)

        hw = torch.tanh(self.Ww(torch.add(shat, vhat)))
        output = self.classifier(self.Ws(hw))
        return output

In [None]:
# Place the network on the device chosen in the setup cell instead of calling
# .cuda() unconditionally, which raises on a runtime without a GPU.
model = CoattentionNet().to(device)

In [None]:
num_epochs = 40
# optimizer = optim.Adam(model.parameters(), lr = 0.01)
optimizer = torch.optim.RMSprop(model.parameters(), lr = 4e-4, weight_decay = 1e-8)
#loss_fn = nn.CrossEntropyLoss(weight=None, size_average=None, ignore_index=- 100, reduce=None, reduction='mean', label_smoothing=0.1)
# Multi-label objective: an independent binary cross-entropy per output logit,
# averaged over the batch. The deprecated size_average/reduce arguments and the
# other explicit defaults are omitted.
loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
# train() steps the scheduler once per batch, so one cosine cycle spans every
# optimisation step of the run. The deprecated `verbose` argument is omitted.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = len(train_loader) * num_epochs, eta_min=0)

In [None]:
import sklearn.metrics as sm
def validate(val_loader,model):
    """Evaluate ``model`` on ``val_loader`` with a 0.5 decision threshold.

    Args:
        val_loader: iterable of ``(x, y, z_vector)`` batches, where ``x`` and
            ``y`` are the two model inputs and ``z_vector`` holds 0/1 targets.
        model: module called as ``model(x, y)`` that returns logits with the
            same shape as ``z_vector``. It is left in eval mode on return.

    Returns:
        ``(accuracy, recall, precision, f1)``. Accuracy is the fraction of
        individual labels predicted correctly, over the labels actually seen
        (previously a fixed 128*5). The other three come from
        ``sklearn.metrics`` on the flattened labels. All four are 0.0 when the
        loader yields no labels.
    """
    # Run on whatever device the model lives on rather than assuming CUDA.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    gc.collect()
    torch.cuda.empty_cache()
    model.eval()

    num_correct = 0
    num_labels = 0
    pred_chunks = []
    target_chunks = []
    with torch.no_grad():
        for x, y, z_vector in val_loader:
            logits = model(x.to(model_device), y.to(model_device))
            batch_pred = (torch.sigmoid(logits) > 0.5).type(torch.int).cpu()
            batch_true = z_vector.cpu()

            num_correct += int((batch_pred == batch_true).sum())
            num_labels += batch_true.numel()
            pred_chunks.append(batch_pred.numpy().flatten())
            target_chunks.append(batch_true.numpy().flatten())

    if num_labels == 0:
        return (0.0, 0.0, 0.0, 0.0)

    # Concatenate once at the end instead of growing arrays with np.append.
    y_pred = np.concatenate(pred_chunks).astype(np.float64)
    y_true = np.concatenate(target_chunks).astype(np.float64)

    # sklearn's argument order is (y_true, y_pred).
    recall=sm.recall_score(y_true,y_pred)
    precision=sm.precision_score(y_true,y_pred)
    f1=sm.f1_score(y_true,y_pred)
    return (num_correct/num_labels,recall,precision,f1)

In [None]:
def train():
    """Train the notebook-level ``model`` for ``num_epochs`` epochs.

    Relies on these notebook globals: ``model``, ``train_loader``,
    ``val_loader``, ``optimizer``, ``scheduler``, ``loss_fn``, ``num_epochs``,
    ``validate`` and an initialised ``wandb`` run.

    Each epoch performs one pass over ``train_loader`` (the scheduler is
    stepped after every batch), then evaluates on ``val_loader``, prints the
    validation metrics and logs them to Weights & Biases. The logged ``loss``
    is the loss of the last training batch of the epoch, as a Python float
    (NaN if the loader yielded no batch).
    """
    # Move batches to wherever the model lives rather than assuming CUDA.
    model_device = next(model.parameters()).device

    for epoch in range(num_epochs):
        print('Epoch Number',epoch+1)

        # Release cached memory once per epoch; doing so on every batch only
        # slows the loop down.
        gc.collect()
        torch.cuda.empty_cache()

        # validate() leaves the model in eval mode, so re-enable training mode.
        model.train()

        last_loss = float('nan')
        for x, y, z_vector in train_loader:
            optimizer.zero_grad()
            output = model(x.to(model_device), y.to(model_device))
            loss = loss_fn(output, z_vector.to(model_device))
            loss.backward()
            optimizer.step()
            scheduler.step()
            # Keep a plain float so the autograd graph is not kept alive.
            last_loss = loss.item()

        acc,recall,precision,f1=validate(val_loader,model)
        print('Val Accuracy: ',acc)
        print('Val Recall: ',recall)
        print('Val Precision: ',precision)
        print("Val F1-Score: ",f1)
        wandb.log({'epoch':epoch, 'loss': last_loss, 'val_acc':acc,'recall':recall,'precision':precision,'f1-score':f1})

In [None]:
# Run the training loop: num_epochs epochs, each followed by validation and a
# wandb.log call.
train()