In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import io
import re
import glob
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from pprint import pprint, pformat
from IPython.display import display, Markdown, Latex

from load_audio import  AudioFeatureDataset
from load_text import TextDataset
from load_video import VideoDataset
from load_multimodal_data import MultimodalDataset

from position_encoder import PositionalEncoding

from encoders import AudioEncoder, VideoEncoder, DotProductAttention

from decoder import MultimodalDecoder
from tqdm import tqdm

In [3]:
os.getcwd()

'C:\\Users\\marti\\Desktop\\Projects\\majore\\scripts\\data'

In [4]:
# use_cuda = torch.cuda.is_available()
# device = torch.device("cuda" if use_cuda else "cpu")

# if use_cuda:
#     torch.set_default_tensor_type(torch.cuda.FloatTensor)

# device

device = torch.device("cpu")

In [5]:
# path_how2 = "/Volumes/LaCie/vision/data/" # Jeremy
# path_how2 = "/Volumes/T7/data/" # Romain
path_how2 = '../../data/how2-dataset/' # Martin

video_path = os.path.join(path_how2, "resnext101-action-avgpool-300h", "train.npy")

texts_path = os.path.join(path_how2,"how2-300h-v1/data/train", "text.en")
embeddings_path = os.path.join(path_how2, "how2-release/word_embedding/","cmu_partition.train.vec")

In [6]:
video_dataset = VideoDataset(video_path)
audio_dataset = AudioFeatureDataset(path_how2,"train")
text_dataset = TextDataset(texts_path, embeddings_path)

print(len(video_dataset),len(audio_dataset),len(text_dataset))

  return  np.array(text), np.array(splitted), largest_split


184949 184949 184949


In [7]:
vocab_id_dict = text_dataset.vocab_id_dict
id_vocab_dict = text_dataset.id_vocab_dict
vocab_emb_dict = text_dataset.vocab_emb_dict

In [8]:
len(vocab_emb_dict), len(id_vocab_dict), len(vocab_emb_dict)

(36756, 36756, 36756)

In [9]:
multimodal_dataset = MultimodalDataset(video_dataset, audio_dataset, text_dataset)
dataloader = DataLoader(multimodal_dataset, batch_size=4, shuffle=True, num_workers=0)

In [10]:
for batch in dataloader:
    
    print(batch["video"]["video"].shape)
    print(batch["audio"].shape)
    print(batch["text"]["embedding"].shape)
    print(batch["text"]["id_embedding"].shape)
    break

torch.Size([4, 2048])
torch.Size([4, 10810, 43])
torch.Size([4, 225, 100])
torch.Size([4, 225, 1])


In [11]:
d_model = 480
d_feedforward = 1920
dropout = 0.2
nhead = 6
nlayer_audio = 6
nlayer_video = 1

video_dim = 2048
audio_size = 10810
audio_dim = 43
tied = 48
down_sampling_factor = 10

audio_encoder = AudioEncoder(audio_dim,
                             audio_size,
                             tied,
                             nhead,
                             nlayer_audio,
                             d_model,
                             d_feedforward,
                             dropout,
                             down_sampling_factor)

video_encoder = VideoEncoder(video_dim,
                             nhead,
                             nlayer_video,
                             d_model,
                             d_feedforward,
                             dropout)

dot_product_attention = DotProductAttention(d_model, 480, 480).to(device)

In [12]:
batch1 = next(iter(dataloader))
audio = batch1["audio"].float()
video = batch1["video"]["video"].unsqueeze(1).float()
text = batch1["text"]["embedding"].float()

In [13]:
out_audio = audio_encoder(audio)
out_video = video_encoder(video)
output_encoder = dot_product_attention.forward(out_audio, out_video, out_video)

print(out_audio.shape)
print(out_video.shape)
print(output_encoder.shape)

torch.Size([4, 1081, 480])
torch.Size([4, 1, 480])
torch.Size([4, 1081, 480])


In [14]:
print("VIDEO")
print(" IN:  ", tuple(video.size()))
print(" OUT: ", tuple(out_video.size()))
print()
print("AUDIO")
print(" IN:  ", tuple(audio.size()))
print(" OUT: ", tuple(out_audio.size()))
print()
print("FUSION")
print(" OUT: ", tuple(output_encoder.size()))

VIDEO
 IN:   (4, 1, 2048)
 OUT:  (4, 1, 480)

AUDIO
 IN:   (4, 10810, 43)
 OUT:  (4, 1081, 480)

FUSION
 OUT:  (4, 1081, 480)


In [15]:
text_dim = 100
text_size = 225
vocab_size = len(vocab_id_dict)
n_layer = 4
d_model = 480
d_feedforward = 1920
dropout = 0.2
nhead = 6

decoder = MultimodalDecoder(text_dim, text_size, vocab_size, nhead,  n_layer, d_model, d_feedforward, dropout).to(device)

In [16]:
mask = decoder.generate_square_subsequent_mask(text_size)
decoded = decoder(text, output_encoder, tgt_mask = mask)

In [17]:
mask

tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [18]:
print("Mask Size: ", mask.size())
print("Text Size: ", text.size())
print("Output Encoder Size: ", output_encoder.size())

Mask Size:  torch.Size([225, 225])
Text Size:  torch.Size([4, 225, 100])
Output Encoder Size:  torch.Size([4, 1081, 480])


In [19]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        
        d_model = 480
        d_feedforward = 1920
        dropout = 0.2
        nhead = 6
        nlayer_audio = 6
        nlayer_video = 1

        video_dim = 2048
        audio_size = 10810
        audio_dim = 43
        tied = 48
        down_sampling_factor = 10

        self.audio_encoder = AudioEncoder(
            audio_dim,
            audio_size,
            tied,
            nhead,
            nlayer_audio,
            d_model,
            d_feedforward,
            dropout,
            down_sampling_factor
        )

        self.video_encoder = VideoEncoder(
            video_dim,
            nhead,
            nlayer_video,
            d_model,
            d_feedforward,
            dropout
        )

        self.fusion = DotProductAttention(d_model, 480, 480)
        
        text_dim = 100
        text_size = 225
        vocab_size = len(vocab_id_dict)
        n_layer = 4
        d_model = 480
        d_feedforward = 1920
        dropout = 0.2
        nhead = 6

        self.decoder = MultimodalDecoder(
            text_dim, text_size, vocab_size, nhead,  n_layer, d_model, d_feedforward, dropout
        )
        
    def forward(self, video, audio, text):
        audio_encoding = self.audio_encoder(audio)
        video_encoding = self.video_encoder(video)
        merge_encoding = self.fusion(audio_encoding, video_encoding, video_encoding)
        mask = decoder.generate_square_subsequent_mask(text_size)
        decoded = decoder(text, output_encoder, tgt_mask=mask)
        return decoded

In [20]:
batch1 = next(iter(dataloader))
audio = batch1["audio"].float()
video = batch1["video"]["video"].unsqueeze(1).float()
text = batch1["text"]["embedding"].float()

In [21]:
net = Net().to(device)

In [22]:
output = net(video, audio, text)

In [23]:
output.size()

torch.Size([225, 4, 36756])

In [24]:
text.size()

torch.Size([4, 225, 100])

In [25]:
# if torch.cuda.is_available():
#     device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc. 
#     print("Running on the GPU")
# else:
#     device = torch.device("cpu")
#     print("Running on the CPU")

In [26]:
loss_function = torch.nn.CrossEntropyLoss() # includes softmax

def train(net):
    BATCH_SIZE = 100
    EPOCHS = 10
    
    dataloader = DataLoader(multimodal_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    for epoch in range(EPOCHS):
        for batch in tqdm(dataloader):
            
            audio = batch["audio"].float()
            video = batch["video"]["video"].unsqueeze(1).float()
            text = batch["text"]["embedding"].float()
            
            audio = audio.to(device)
            video = video.to(device)
            text = text.to(device)
            

            net.zero_grad()
            outputs = net(video, audio, text)
            loss = loss_function(outputs, batch_y)
            loss.backward()
            optimizer.step()
        print(loss)

In [27]:
train(net)

  0%|                                                                                         | 0/1850 [01:05<?, ?it/s]


RuntimeError: shape '[-1, 600, 80]' is invalid for input of size 2075520