In [None]:
# Enable IPython's autoreload extension so edits to the local project modules
# imported by this notebook are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import os
import io
import re
import glob
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from pprint import pprint, pformat
from IPython.display import display, Markdown, Latex

from load_audio import  AudioFeatureDataset
from load_text import TextDataset
from load_video import VideoDataset
from load_multimodal_data import MultimodalDataset

from position_encoder import PositionalEncoding

from encoders import AudioEncoder, VideoEncoder, DotProductAttention, DownSampling

from decoder import MultimodalDecoder


In [None]:
# Root directory of the How2 data. Set the HOW2_DATA_DIR environment variable
# to point at your own copy instead of editing this cell; when it is unset the
# previously hard-coded location is used, so existing setups keep working.
# Known locations:
#   Jeremy: "/Volumes/LaCie/vision/data/"
#   Romain: "/Volumes/T7/data/"
path_how2 = os.environ.get("HOW2_DATA_DIR", "/Volumes/T7/data/")

# Video features file for the training split.
video_path = os.path.join(path_how2, "resnext101-action-avgpool-300h", "train.npy")

# Training-split English text and the word-embedding file used by TextDataset.
texts_path = os.path.join(path_how2, "how2-300h-v1/data/train", "text.en")
embeddings_path = os.path.join(path_how2, "how2-release/word_embedding/", "cmu_partition.train.vec")

In [None]:
# Build one dataset per modality from the paths configured earlier.
video_dataset = VideoDataset(video_path)
audio_dataset = AudioFeatureDataset(path_how2, "train")
text_dataset = TextDataset(texts_path, embeddings_path)

# Print the three dataset lengths side by side (video, audio, text).
# NOTE(review): presumably these should all match — TODO confirm.
print(*(len(dataset) for dataset in (video_dataset, audio_dataset, text_dataset)))

In [None]:
# Combine the three per-modality datasets into a single dataset object.
multimodal_dataset = MultimodalDataset(video_dataset, audio_dataset, text_dataset)

# Shuffled mini-batches of 4 samples, loaded in the main process (no workers).
dataloader = DataLoader(
    multimodal_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=0,
)

In [None]:
# Peek at the first batch only and show the shape of each modality's tensor
# (video, then audio, then text embedding).
for batch in dataloader:
    for feature in (batch["video"]["video"], batch["audio"], batch["text"]["embedding"]):
        print(feature.shape)
    break

In [None]:
# Transformer hyper-parameters shared by both encoders.
d_model = 480
nhead = 6
d_feedforward = 1920
dropout = 0.2

# Number of layers for each modality's encoder.
nlayer_audio = 6
nlayer_video = 1

# Modality-specific sizes handed to the encoder constructors.
audio_dim = 43
audio_size = 10807
video_dim = 2048
down_sampling_factor = 101

audio_encoder = AudioEncoder(
    audio_dim, audio_size, nhead, nlayer_audio,
    d_model, d_feedforward, dropout, down_sampling_factor,
)

video_encoder = VideoEncoder(
    video_dim, nhead, nlayer_video,
    d_model, d_feedforward, dropout,
)

# NOTE(review): the two literal 480s equal d_model; presumably they should
# track it — confirm against DotProductAttention's signature in encoders.py.
dot_product_attention = DotProductAttention(d_model, 480, 480)

In [None]:
# Draw a single batch from the loader and cast every modality to float32.
batch1 = next(iter(dataloader))

# unsqueeze(1) inserts a size-1 dimension at index 1 of the video tensor.
video = batch1["video"]["video"].unsqueeze(1).float()
audio = batch1["audio"].float()
text = batch1["text"]["embedding"].float()

In [None]:
# Encode each modality independently.
out_audio = audio_encoder(audio)
out_video = video_encoder(video)

# Fuse the two encodings: the audio encoding is passed as the first argument
# and the video encoding as both the second and third.
# Call the module instance rather than .forward() directly so that
# nn.Module.__call__ runs any registered hooks, consistent with how the two
# encoders are invoked above.
# NOTE(review): assumes DotProductAttention subclasses nn.Module — confirm.
output_encoder = dot_product_attention(out_audio, out_video, out_video)

print(out_audio.shape)
print(out_video.shape)
print(output_encoder.shape)

In [None]:
# Summarise input/output tensor sizes per stage: a heading for each stage,
# then one line per tensor, with a blank line between stages.
report = (
    ("VIDEO", ((" IN:  ", video), (" OUT: ", out_video))),
    ("AUDIO", ((" IN:  ", audio), (" OUT: ", out_audio))),
    ("FUSION", ((" OUT: ", output_encoder),)),
)
for position, (heading, entries) in enumerate(report):
    if position:
        # Blank separator before every stage except the first.
        print()
    print(heading)
    for label, tensor in entries:
        print(label, tuple(tensor.size()))

In [None]:
# Text-side sizes handed to the decoder constructor.
text_dim = 100
text_size = 225
# NOTE(review): same value as audio_size (10807) in the encoder cell — is that
# a coincidence or a copy-paste? TODO confirm the real vocabulary size.
vocab_size = 10807

# Transformer hyper-parameters for the decoder (same values as the encoders).
n_layer = 4
nhead = 6
d_model = 480
d_feedforward = 1920
dropout = 0.2

decoder = MultimodalDecoder(
    text_dim,
    text_size,
    vocab_size,
    nhead,
    n_layer,
    d_model,
    d_feedforward,
    dropout,
)

In [None]:
# Build a text_size x text_size target mask with the decoder's own helper.
# NOTE(review): presumably a causal (look-ahead) mask — confirm in decoder.py.
mask = decoder.generate_square_subsequent_mask(text_size)

# Run the decoder on the text embeddings, attending to the fused encoder output.
decoded = decoder(text, output_encoder, tgt_mask=mask)

In [None]:
# Display a small (size 10) mask for visual inspection. The helper is only
# reachable as a method on `decoder`; no bare function of that name is imported
# or defined in this notebook, so the unqualified call raised NameError.
decoder.generate_square_subsequent_mask(10)