In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive inside the Colab VM so the project files under
# /content/drive become reachable from this notebook.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Notebook working directory on Google Drive. Each collaborator keeps their own
# line and uncomments the one matching their Drive layout.
# PATH = '/content/drive/MyDrive/INF634Project/majore/notebooks/' # Martin
PATH = '/content/drive/MyDrive/Colab Notebooks/INF634Project/majore/notebooks/' # Romain

# Root of the How2 dataset. The path is relative, so it is only meaningful
# once the working directory has been changed to PATH (see %cd below).
path_how2 = '../../data/how2-dataset/'

# Move into PATH so the relative dataset path resolves
# (presumably this is also where the `scripts` package lives - confirm).
%cd $PATH

/content/drive/.shortcut-targets-by-id/1QyXXAAL-qu9R6UWrvIbpgSNxkpiMFLWV/INF634Project/majore/notebooks


In [None]:
!pip install kaldiio



In [None]:
import os
import io
import re
import glob
from pprint import pprint, pformat

import numpy as np
import torch
# torch.multiprocessing.set_start_method('spawn')
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from IPython.display import display, Markdown, Latex
from tqdm import tqdm

from scripts.load_audio_v1 import  AudioFeatureDataset
from scripts.load_text import TextDataset
from scripts.load_video import VideoDataset
from scripts.load_multimodal_data import MultimodalDataset
from scripts.position_encoder import PositionalEncoding
from scripts.encoders import AudioEncoder, VideoEncoder, DotProductAttention
from scripts.decoder import MultimodalDecoder

In [None]:
# Pick the compute device once; later cells move the model and batches to it
# with `.to(device)`.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("Running on the CPU")
else:
    # First visible GPU; other indices (cuda:1, cuda:2, ...) are possible.
    device = torch.device("cuda:0")
    # Make torch.cuda.FloatTensor the default tensor type for the whole session.
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
    print("Running on the GPU")

Running on the GPU


In [None]:
# Alternative dataset roots for local (non-Colab) runs; uncomment one to
# override path_how2.
# path_how2 = "/Volumes/LaCie/vision/data/" # Jeremy
# path_how2 = "/Volumes/T7/data/" # Romain

# .npy file handed to VideoDataset for the training split.
video_path = os.path.join(path_how2, "resnext101-action-avgpool-300h", "train.npy")

# Training-split English text file and the word-embedding file, both handed
# to TextDataset.
texts_path = os.path.join(path_how2,"how2-300h-v1/data/train", "text.en")
embeddings_path = os.path.join(path_how2, "how2-release/word_embedding/","cmu_partition.train.vec")

In [None]:
# One dataset object per modality, all built for the "train" split.
video_dataset = VideoDataset(video_path)
audio_dataset = AudioFeatureDataset(path_how2, "train")
text_dataset = TextDataset(texts_path, embeddings_path)

# Print the size of each modality so a mismatch is visible before they are combined.
for _label, _dataset in (
    ("Len Video: ", video_dataset),
    ("Len Audio: ", audio_dataset),
    ("Len Text: ", text_dataset),
):
    print(_label, len(_dataset))

# Combined dataset consumed by the training DataLoader.
multimodal_dataset = MultimodalDataset(video_dataset, audio_dataset, text_dataset)

print("\nLen Multimodal: ", len(multimodal_dataset))

Len Video:  184949
Len Audio:  184949
Len Text:  184949

Len Multimodal:  184949


In [None]:
class Net(nn.Module):
    """Multimodal model: audio encoder + video encoder, attention fusion, text decoder.

    The submodules (AudioEncoder, VideoEncoder, DotProductAttention,
    MultimodalDecoder) come from the project's `scripts` package. Their
    internals are not visible from this notebook, so any statement about
    tensor shapes below is an assumption to confirm against those modules.

    Args:
        vocab_size: forwarded to MultimodalDecoder.
        text_size: forwarded to MultimodalDecoder and used as the size of the
            target mask built in `forward` (default 225).
    """

    def __init__(self, vocab_size, text_size=225):
        super().__init__()

        # Hyper-parameters shared by several submodules.
        model_dim = 240
        drop_prob = 0.2
        num_heads = 3

        # --- audio branch -------------------------------------------------
        audio_feat_size = 43
        audio_len = 1895
        audio_tied_out = 48
        audio_layers = 3
        audio_downsampling = 5
        self.audio_encoder = AudioEncoder(
            audio_feat_size,
            audio_len,
            audio_tied_out,
            num_heads,
            audio_layers,
            d_feedforward=model_dim,
            dropout=drop_prob,
            down_sampling_factor=audio_downsampling,
        )

        # --- video branch -------------------------------------------------
        video_feat_size = 2048
        video_layers = 1
        self.video_encoder = VideoEncoder(
            video_feat_size,
            num_heads,
            video_layers,
            d_model=model_dim,
            d_feedforward=model_dim,
            dropout=drop_prob,
        )

        # --- fusion of the two encodings ------------------------------------
        self.fusion = DotProductAttention(model_dim, model_dim, model_dim)

        # --- text decoder -------------------------------------------------
        self.text_size = text_size
        text_emb_size = 100
        decoder_layers = 3
        decoder_ff_size = 512
        self.decoder = MultimodalDecoder(
            text_emb_size,
            self.text_size,
            vocab_size,
            num_heads,
            decoder_layers,
            model_dim,
            decoder_ff_size,
            drop_prob,
        )

    def forward(self, video, audio, text):
        """Encode both modalities, fuse them and decode against `text`.

        Args:
            video: input passed to the video encoder.
            audio: input passed to the audio encoder.
            text: decoder input (the training loop passes text embeddings).

        Returns:
            Whatever MultimodalDecoder returns; the training loop treats it as
            per-position vocabulary scores.
        """
        encoded_audio = self.audio_encoder(audio)
        encoded_video = self.video_encoder(video)
        # Audio encoding is the first argument, video encoding the second and
        # third (presumably query vs. key/value - confirm in scripts.encoders).
        fused = self.fusion(encoded_audio, encoded_video, encoded_video)
        # The mask size is always self.text_size, regardless of the actual
        # length of `text`.
        tgt_mask = self.decoder.generate_square_subsequent_mask(self.text_size)
        return self.decoder(text, fused, tgt_mask=tgt_mask)

In [None]:
# Vocabulary lookup tables exposed by the text dataset
# (the notebook names them word -> id and id -> word).
word2id, id2word = text_dataset.vocab_id_dict, text_dataset.id_vocab_dict

# The vocabulary size is handed to Net and reused later by the training loop.
vocab_size = len(word2id)

# Build the model, then move its parameters to the selected device.
net = Net(vocab_size=vocab_size)
net = net.to(device)

In [None]:
# Count the trainable parameters (those with requires_grad set) and display the total.
pytorch_total_params = sum(
    param.numel()
    for param in net.parameters()
    if param.requires_grad
)
pytorch_total_params

13073124

In [None]:
import json

# File receiving the training history; it lives in the notebook directory PATH.
path_json = os.path.join(PATH, "history.json")

def save_json(data, path_json):
    """Serialize `data` as JSON into `path_json`, replacing any existing file.

    Args:
        data: JSON-serializable object (the training history dict in this notebook).
        path_json: destination file path; the file is opened in text write mode.
    """
    with open(path_json, mode="w") as handle:
        json.dump(data, handle)

In [None]:
def process_batch(batch):
    """Unpack a collated batch and cast every tensor to the dtype used in training.

    Args:
        batch: mapping with the entries ``batch["audio"]``,
            ``batch["video"]["video"]``, ``batch["text"]["embedding"]`` and
            ``batch["text"]["id_embedding"]``, each a tensor.

    Returns:
        Tuple ``(video, audio, text_emb, text_id)``. ``video`` gets an extra
        size-1 dimension inserted at index 1; ``video``, ``audio`` and
        ``text_emb`` are cast to float, ``text_id`` to long.
    """
    audio_features = batch["audio"].float()
    # Insert a singleton dimension right after the first one before casting.
    video_features = batch["video"]["video"].unsqueeze(1).float()
    text_fields = batch["text"]
    embeddings = text_fields["embedding"].float()
    token_ids = text_fields["id_embedding"].long()

    return video_features, audio_features, embeddings, token_ids

# Adam over every parameter of `net`.
optimizer = torch.optim.Adam(params=net.parameters(), lr=1e-3)
# Cross-entropy on raw scores: the softmax is part of the loss itself.
criterion = torch.nn.CrossEntropyLoss()

# Per-epoch lists of per-batch losses; train() appends one inner list per epoch.
history = {split: [] for split in ("loss_train", "loss_valid")}

def train(net, epochs=1, batch_size=32, save_every=10):
    """Train `net` on `multimodal_dataset` and log the per-batch loss.

    Relies on notebook-level names: `multimodal_dataset`, `optimizer`,
    `criterion`, `history`, `process_batch`, `save_json`, `path_json`,
    `vocab_size` and `device`. `optimizer` must have been built over the
    parameters of the `net` passed in.

    Args:
        net: model called as ``net(video, audio, text_emb)``.
        epochs: number of passes over the dataset (default 1).
        batch_size: DataLoader batch size (default 32).
        save_every: write `history` to `path_json` after every `save_every`
            batches; a falsy value disables the periodic save. The history is
            always written once more at the end of each epoch.
    """
    dataloader = DataLoader(
        multimodal_dataset, batch_size=batch_size, shuffle=True, num_workers=0
    )

    # Make sure dropout behaves as in training even if the model was switched
    # to eval mode earlier in the session.
    net.train()

    for epoch in range(epochs):
        print(f"Epoch {epoch}")

        # Open a fresh per-epoch list under every history key.
        for key in history:
            history[key].append([])

        # Wrap the dataloader itself (not the enumerate) so tqdm knows the
        # number of batches and can show progress and an ETA.
        for i_batch, batch in enumerate(tqdm(dataloader)):
            video, audio, text_emb, text_id = process_batch(batch)

            audio = audio.to(device)
            video = video.to(device)
            text_emb = text_emb.to(device)
            text_id = text_id.to(device)

            net.zero_grad()

            # NOTE(review): the decoder input (text_emb) and the target
            # (text_id) come from the same, unshifted sequence. Confirm that
            # MultimodalDecoder shifts its input internally; otherwise the
            # model can learn to copy the current token.
            output = net(video, audio, text_emb)

            # The output is presumably (seq_len, batch, vocab) - TODO confirm.
            # Swap the first two dims so the flattened rows line up with the
            # flattened text_id below.
            output = torch.transpose(output, 0, 1).reshape(-1, vocab_size)

            target = text_id.view(-1).long()
            loss = criterion(output, target)

            loss.backward()
            optimizer.step()

            history["loss_train"][-1].append(loss.item())

            # Periodic checkpoint of the history. The original condition
            # `i_batch % 10` was truthy on 9 of every 10 batches, so the
            # ever-growing history was rewritten almost every step.
            if save_every and (i_batch + 1) % save_every == 0:
                save_json(history, path_json)

        save_json(history, path_json)

In [28]:
# Launch training of `net`; the progress bar and loss logging happen inside train().
train(net)





0it [00:00, ?it/s][A[A[A[A

Epoch 0


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m



4531it [28:40,  2.61it/s][A[A[A[A



4532it [28:40,  2.63it/s][A[A[A[A



4533it [28:41,  2.64it/s][A[A[A[A



4534it [28:41,  2.66it/s][A[A[A[A



4535it [28:41,  2.68it/s][A[A[A[A



4536it [28:42,  2.67it/s][A[A[A[A



4537it [28:42,  2.59it/s][A[A[A[A



4538it [28:43,  2.59it/s][A[A[A[A



4539it [28:43,  2.59it/s][A[A[A[A



4540it [28:43,  2.59it/s][A[A[A[A



4541it [28:44,  2.63it/s][A[A[A[A



4542it [28:44,  2.64it/s][A[A[A[A



4543it [28:45,  2.67it/s][A[A[A[A



4544it [28:45,  2.65it/s][A[A[A[A



4545it [28:45,  2.62it/s][A[A[A[A



4546it [28:46,  2.63it/s][A[A[A[A



4547it [28:46,  2.60it/s][A[A[A[A



4548it [28:46,  2.58it/s][A[A[A[A



4549it [28:47,  2.38it/s][A[A[A[A



4550it [28:47,  2.46it/s][A[A[A[A



4551it [28:48,  2.54it/s][A[A[A[A



4552it [28:48,  2.51it/s][A[A[A[A



455