In [2]:
import torch
import numpy as np
import os
import json
import joblib
from torch.utils.data import Dataset,DataLoader
from itertools import repeat
import pandas as pd
import math
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

In [13]:
m = nn.Sigmoid()
loss = nn.BCELoss()
input = torch.randn(3, requires_grad=True)
target = torch.empty(3).random_(2)
output = loss(m(input), target)
output.backward()

In [14]:
input

tensor([ 0.4280,  2.1671, -0.4850], requires_grad=True)

In [15]:
target

tensor([1., 1., 0.])

In [9]:
class tdset(Dataset):
    def __init__(self,ids):
        self.ids = ids
        self.sz = np.random.randint(5,10)
    def __len__(self):
        return len(self.ids)
    def __getitem__(self,idx):
        return np.random.randn(self.sz)


In [10]:
dset = tdset([0,9,8,6,8,4])
dset2 = tdset([0,9,8,6,8,4])

In [14]:
# 50 percentile length is 25 frames.Thus we will have the sequence of video-frames = 25

class TOYMODEL(pl.LightningModule):
    def __init__(self,hparams):
        super().__init__()
        self.save_hyperparameters(hparams)
        self.lin = nn.Linear(15,1)

    def forward(self,x):
        return self.lin(x)

    def training_step(self,batch,batch_idx):
        out = self.lin(x)
        

    


In [28]:
#utils

def get_vids(base_dir,split):
    trn_split = base_dir+split
    trn_idlst = []
    trn_vidlst = []

    f = open(trn_split,'r')
    for line in f:
        id_,vid = line.split('/')
        vid = vid.strip('\n')
        trn_idlst.append(id_)
        trn_vidlst.append(vid)
        #print(vid)
        #break
    f.close()
    return trn_idlst,trn_vidlst

    
def get_features(data_dir,split='val',feat_dir='/common/users/vk405/feat_csv/'):
    #feat_dir = data_dir
    splits_dir = data_dir+'splits/'
    if split == 'val':
        feat_split_dir = feat_dir+'val_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'val_list.txt')  
    elif split == 'train':
        feat_split_dir = feat_dir+'train_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'train_list.txt') 
    elif split == 'test':
        feat_split_dir = feat_dir+'test_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'test_list.txt')
    else:
        raise NotImplementedError(f'unknown split: {split}')     
    feat_list = {}
    vid_dtls = []
    for num,name in zip(vid_num,vid_name):
        feat_loc = os.path.join(feat_split_dir, f'{num}/{name}/0001/')
        #import pdb;pdb.set_trace()
        if os.path.isdir(feat_loc):
            feat_files = feat_loc + os.listdir(feat_loc)[0]
            feat_list[name] = feat_files
            #feat_list.append(feat_files)
            vid_dtls.append((num,name))
        else:
            print(f"video : {num}/{name} not found")
    assert len(feat_list) == len(vid_dtls),"get-features is giving incorrect features"
    return feat_list,vid_dtls






def get_raw_labels(ids,annotns_file):

    label_info = {}
    with open(annotns_file) as json_file:
        annotns = json.load(json_file)
        print(annotns.keys())
        for _,vidname in ids:
            #import pdb;pdb.set_trace()
            if vidname in annotns['database']:
                #import pdb;pdb.set_trace()
                duration = annotns['database'][vidname]['duration']
                annot = annotns['database'][vidname]['annotations']
                labels = []
                #import pdb;pdb.set_trace()
                for segment_info in annot:
                    interval = segment_info['segment']
                    sent = segment_info['sentence']
                    labels.append((interval,sent,duration))

                label_info[vidname] = labels
            else:
                print(f"label for {vidname} not present")
    return label_info

def regress_labels(raw_labels):
    regress_labels = {}
    for key in raw_labels:
        new_labels = []
        for item in raw_labels[key]:
            rng,sent,vidlen = item
            mid = sum(rng)/2
            duration = rng[-1]-rng[0]
            mid_pred = (1/vidlen)*mid # location of mid-point w.r.t video length
            duration_pred = (1/vidlen)*duration
            new_labels.append(([mid_pred,duration_pred],sent))
        regress_labels[key] = new_labels
    return regress_labels
            
            
    
    
    


In [29]:
from sklearn.base import BaseEstimator, TransformerMixin

class LabelEncoder2(BaseEstimator, TransformerMixin):
    def __init__(self,max_len=499):
        self.vidlens = []
        self.truebounds = []
        self.max_len = max_len

    def fit(self,raw_labels):
        l = []
        for key in raw_labels:
            vid_len = raw_labels[key][0][-1]
            sz = len(raw_labels[key])
            for i in range(sz):l.append(vid_len)
        self.vidlens = np.array(l)
        return self
        
    def transform(self,raw_labels):
        regress_labels = self._regress_labels(raw_labels)
        return regress_labels

    def decode(self,outputs):
        return np.round(outputs*self.max_len)


    def _regress_labels(self,raw_labels):
        regress_labels = {}
        bounds = []
        for key in raw_labels:
            new_labels = []
            for item in raw_labels[key]:
                #import pdb;pdb.set_trace()
                rng,sent,vidlen = item
                new_rng = [rng[0]/self.max_len,rng[-1]/self.max_len]
                bounds.append(rng)
                new_labels.append((new_rng,sent))
            regress_labels[key] = new_labels
        self.truebounds = np.array(bounds)
        return regress_labels


In [30]:

def get_labels(ids,annotns_file):

    label_info = {}
    with open(annotns_file) as json_file:
        annotns = json.load(json_file)
        #print(annotns.keys())
        for _,vidname in ids:
            #import pdb;pdb.set_trace()
            if vidname in annotns:
                #import pdb;pdb.set_trace()
                duration = annotns[vidname]['duration']
                annot = annotns[vidname]['annotations']
                labels = []
                #import pdb;pdb.set_trace()
                for segment_info in annot:
                    interval = segment_info['segment']
                    st_end = [interval[0],interval[-1]]
                    sent = segment_info['sentence']
                    labels.append((st_end,sent,duration))

                label_info[vidname] = labels
            else:
                print(f"label for {vidname} not present")
    return label_info

In [31]:
#dataset
# Dataset/loader
# This is newer version
class YoucookDset2(Dataset):
    def __init__(self,data_dir='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/'\
        ,split='train',use_precomp_emb=True,seqlen=25,framecnt=500,id=0):
        self.id = id
        self.feat_locs = {}
        self.split = split
        self.data_dir = data_dir
        self.use_precomp_emb = use_precomp_emb
        self.text_emb = None
        self.seqlen = seqlen
        self.framecnt = framecnt
        if self.split != 'test':
            self.annotns_file = data_dir+'annotations/segment_youcookii_annotations_trainval.json'
        else:
            raise NotImplementedError(f"Split:{self.split},not yet correctly implemented")
        if self.use_precomp_emb:
            self.txt_emb = joblib.load(os.path.join(self.data_dir,'emb.joblib'))

        self.feat_locs,vids = get_features(self.data_dir,split=self.split)
        assert len(vids) == len(self.feat_locs),"features are wrong"
        #import pdb;pdb.set_trace()
        label_info = get_labels(vids,self.annotns_file)
        #self.labelencoder = LabelEncoder2()
        self.final_labels = label_info
        #self.labelencoder.fit_transform(label_info)
        
        #regress_labels(label_info)
        #(vid_id,seg_id)
        self.update_data()

                
            
    def __len__(self):
        return len(self.data)

    def update_data(self):
        self.data = []
        #self.vid_len = []
        starting_pnt = np.arange(self.id,self.framecnt,self.seqlen)

        for key in self.final_labels:
            annot_len = len(self.final_labels[key])
            if key in self.feat_locs:
                file_loc = self.feat_locs[key]
                #for stpnt in starting_pnt:
                segments = list(zip(repeat(key,annot_len),repeat(file_loc,annot_len),\
                        range(annot_len)))
                for seg in segments:
                    for stpnt in starting_pnt:
                        if stpnt+self.seqlen<=self.framecnt:
                            datapnt = seg[:-1]+(stpnt,)+seg[-1:]
                            self.data.append(datapnt)
                    
                #self.data.extend(segments)
            else:
                print(f"video:{key} not found")


    def getclass_prob(self,lbl_rng,frame_rng):
        lbl_ids = set(np.arange(lbl_rng[0],lbl_rng[-1]+1))
        frame_ids = set(np.arange(frame_rng[0],frame_rng[-1]+1))
        inter = lbl_ids.intersection(frame_ids)
        if len(inter) == 0:
            return 0.0
        else:
            return len(inter)/len(lbl_ids.union(frame_ids))

    def __getitem__(self,idx):
        if self.use_precomp_emb:
            vidname,file_loc,stid,seg_ind = self.data[idx]
            #import pdb;pdb.set_trace()
            #self.txt_emb[vidname][seg_ind],
            txt_info = self.final_labels[vidname][seg_ind]
            label_value = self.getclass_prob(txt_info[0],(stid,stid+self.seqlen-1))
            return pd.read_csv(file_loc).values.astype(np.float32)[stid:stid+self.seqlen,:],(self.txt_emb[vidname][seg_ind]).astype(np.float32),\
                label_value
            #np.array(self.final_labels[vidname][seg_ind][0],dtype=np.float32)
        else:
            raise NotImplementedError("not yet correctly implemented")

        

           



        

In [40]:

data_dir = '/common/home/vk405/Projects/Crossmdl/Data/YouCookII/'



youcookdata = YoucookDset2(split='val')

In [47]:
#load dataset and dataloader once every epoch, makes stuff a bit slow but this is a simple approach.

In [48]:
youcookdata.data

[('sdB8qBlLS2E',
  '/common/users/vk405/feat_csv/val_frame_feat_csv/405/sdB8qBlLS2E/0001/resnet_34_feat_mscoco.csv',
  0,
  0),
 ('sdB8qBlLS2E',
  '/common/users/vk405/feat_csv/val_frame_feat_csv/405/sdB8qBlLS2E/0001/resnet_34_feat_mscoco.csv',
  25,
  0),
 ('sdB8qBlLS2E',
  '/common/users/vk405/feat_csv/val_frame_feat_csv/405/sdB8qBlLS2E/0001/resnet_34_feat_mscoco.csv',
  50,
  0),
 ('sdB8qBlLS2E',
  '/common/users/vk405/feat_csv/val_frame_feat_csv/405/sdB8qBlLS2E/0001/resnet_34_feat_mscoco.csv',
  75,
  0),
 ('sdB8qBlLS2E',
  '/common/users/vk405/feat_csv/val_frame_feat_csv/405/sdB8qBlLS2E/0001/resnet_34_feat_mscoco.csv',
  100,
  0),
 ('sdB8qBlLS2E',
  '/common/users/vk405/feat_csv/val_frame_feat_csv/405/sdB8qBlLS2E/0001/resnet_34_feat_mscoco.csv',
  125,
  0),
 ('sdB8qBlLS2E',
  '/common/users/vk405/feat_csv/val_frame_feat_csv/405/sdB8qBlLS2E/0001/resnet_34_feat_mscoco.csv',
  150,
  0),
 ('sdB8qBlLS2E',
  '/common/users/vk405/feat_csv/val_frame_feat_csv/405/sdB8qBlLS2E/0001/resnet

In [49]:
youcookdata.final_labels['sdB8qBlLS2E']

[([18, 29], 'add cardamom seeds to a pan of hot oil and shake', 358.72),
 ([32, 53], 'add chili flakes to the pan and shake', 358.72),
 ([58, 78],
  'add turmeric cumin and coriander powder to the pan and shake',
  358.72),
 ([79, 111], 'add garlic ginger paste and onion  to the pan and stir', 358.72),
 ([139, 184], 'add lamb to the pan and stir', 358.72),
 ([222, 251], 'add chili powder to the pan and mix', 358.72),
 ([278, 285], 'add chopped tomato to the pan', 358.72),
 ([377, 406],
  'add green chilis and garam masala powder to the pan and stir',
  358.72),
 ([455, 465], 'add coriander leaves to the pan', 358.72)]

In [51]:
youcookdata[0][-1]

0.23333333333333334

In [52]:
youcookdata[1][-1]

0.15625

In [16]:
#model utils

#!pip install transformers

def init_parameters_xavier_uniform(model):
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

def scaled_dot(query, key, mask_key=None):  
    score = torch.matmul(query, key.transpose(-2, -1))
    score /= math.sqrt(query.size(-1))
    if mask_key is not None:
        score = score.masked_fill(mask_key, -1e18)  # Represents negative infinity
    return score      
            
def attend(query, key, value, mask_key=None, dropout=None):
    # TODO: Implement
    # Use scaled_dot, be sure to mask key
    #smax = nn.Softmax(-1)
    #import pdb;pdb.set_trace()
    score = scaled_dot(query,key,mask_key)  
    attention = F.softmax(score,dim=-1)
    if dropout is not None:#do = nn.Dropout(dropout)
        attention = dropout(attention)
    answer = torch.matmul(attention,value) 
    # Convexly combine value embeddings using attention, this should be just a matrix-matrix multiplication.
    return answer, attention



def split_heads(batch, num_heads):  
    (batch_size, length, dim) = batch.size()  # These are the expected batch dimensions.
    assert dim % num_heads == 0  # Assert that dimension is divisible by the number of heads.
    dim_head = dim // num_heads

    # No new memory allocation
    splitted = batch.view(batch_size, -1, num_heads, dim_head).transpose(1, 2)  
    return splitted  # (batch_size, num_heads, length, dim_head), note that now the last two dimensions are compatible with our attention functions. 




def merge_heads(batch):  
    (batch_size, num_heads, length, dim_head) = batch.size()  # These are the expected batch dimensions.

    # New memory allocation (reshape), can't avoid.
    merged = batch.transpose(1, 2).reshape(batch_size, -1, num_heads * dim_head)
    return merged  # (batch_size, length, dim)


class MultiHeadAttention(nn.Module):
    def __init__(self, dim, num_heads, dropout_rate=0.1):
        super().__init__()
        assert dim % num_heads == 0

        self.linear_query = nn.Linear(dim, dim)
        self.linear_key = nn.Linear(dim, dim)
        self.linear_value = nn.Linear(dim, dim)
        self.linear_final = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout_rate)

        self.num_heads = num_heads

    def forward(self, query, key, value, mask_key=None, layer_cache=None,
              memory_attention=False):
        """
        INPUT
          query: (batch_size, length_query, dim)
          key: (batch_size, length_key, dim)
          value: (batch_size, length_key, dim_value)
          mask_key: (*, 1, length_key) if queries share the same mask, else
                    (*, length_query, length_key)
          layer_cache: if not None, stepwise decoding (cache of key/value)
          memory_attention: doing memory attention in stepwise decoding?
        OUTPUT
          answer: (batch_size, length_query, dim_value)
          attention: (batch_size, num_heads, length_query, length_key) else
        """
        batch_size = query.size(0)

        query = self.linear_query(query)
        query = split_heads(query, self.num_heads)  # (batch_size, num_heads, -1, dim_head)

        def process_key_value(key, value):  # Only called when necessary.
            key = self.linear_key(key)
            key = split_heads(key, self.num_heads)
            value = self.linear_value(value)
            value = split_heads(value, self.num_heads)
            return key, value

        #import pdb;pdb.set_trace()
        if layer_cache is None:
            key, value = process_key_value(key, value)
        else:
            assert query.size(2) == 1  # Stepwise decoding
            
            if memory_attention:
                if layer_cache['memory_key'] is None:  # One-time calculation
                    key, value = process_key_value(key, value)
                    # (batch_size, num_heads, length_memory, dim)
                    layer_cache['memory_key'] = key
                    layer_cache['memory_value'] = value

                key = layer_cache['memory_key']
                value = layer_cache['memory_value']

            else:  # Self-attention during decoding
                key, value = process_key_value(key, value)
                assert key.size(2) == 1 and value.size(2) == 1
                
                # Append to previous.
                if layer_cache['self_key'] is not None:
                    key = torch.cat((layer_cache['self_key'], key), dim=2)
                    value = torch.cat((layer_cache['self_value'], value), dim=2)
                    
                 # (batch_size, num_heads, length_decoded, dim)
                layer_cache['self_key'] = key  # Recache.
                layer_cache['self_value'] = value
        # Because we've splitted embeddings into heads, we must also split the mask. 
        # And because each query uses the same mask for all heads (we don't use different masking for different heads), 
        # we can specify length 1 for the head dimension.
        if mask_key is not None:  
            mask_key = mask_key.unsqueeze(1)  # (batch_size, 1, -1, length_key)

        answer, attention = attend(query, key, value, mask_key, self.dropout)

        answer = merge_heads(answer)  # (batch_size, length_key, dim)
        answer = self.linear_final(answer)

        return answer, attention

class PositionwiseFeedForward(nn.Module):
    def __init__(self, dim, dim_hidden, drop_rate=0.1):
        super().__init__()
        self.w1 = nn.Linear(dim, dim_hidden)
        self.w2 = nn.Linear(dim_hidden, dim)
        self.layer_norm = nn.LayerNorm(dim, eps=1e-6)
        self.drop1 = nn.Dropout(drop_rate)
        self.relu = nn.ReLU()
        self.drop2 = nn.Dropout(drop_rate)
    def forward(self, x):
        inter = self.drop1(self.relu(self.w1(self.layer_norm(x))))
        output = self.drop2(self.w2(inter))
        return output + x




class SinusoidalPositioner(nn.Module):
    def __init__(self, dim, drop_rate=0.1, length_max=5000):
        super().__init__()
        frequency = torch.exp(torch.arange(0, dim, 2) * -(math.log(10000.) / dim))  # Using different frequency for each dim
        positions = torch.arange(0, length_max).unsqueeze(1)
        wave = torch.zeros(length_max, dim)
        wave[:, 0::2] = torch.sin(frequency * positions)
        wave[:, 1::2] = torch.cos(frequency * positions)
        self.register_buffer('wave', wave.unsqueeze(0))  # (1, length_max, dim)
        self.dropout = nn.Dropout(drop_rate)
        self.dim = dim
        self.length_max = length_max
    def forward(self, x, step=-1):
        assert x.size(-2) <= self.length_max

        if step < 0:  # Take the corresponding leftmost embeddings.
            position_encoding = self.wave[:, :x.size(-2), :]
        else:  # Take the embedding at the step.
            position_encoding = self.wave[:, step, :]

        x = x * math.sqrt(self.dim)
        return self.dropout(x + position_encoding)




class TransformerEncoderLayer(nn.Module):

  def __init__(self, dim, num_heads, dim_hidden, drop_rate):
    super().__init__()
    self.layer_norm = nn.LayerNorm(dim, eps=1e-6)
    self.self_attention = MultiHeadAttention(dim, num_heads, drop_rate)
    self.drop = nn.Dropout(drop_rate)
    self.feedforward = PositionwiseFeedForward(dim, dim_hidden, drop_rate)

  def forward(self, source, mask_source=None):
    # TODO: Implement
    #print(source.shape)
    normed = self.layer_norm(source)  
    # Apply layer norm on source

    attended, attention = self.self_attention(normed,normed,normed,mask_source)
    #None, None  # Apply self-attention on normed (be sure to use mask_source).
    attended = self.drop(attended) + source  
    # Re-write attended by applying dropout and adding a residual connection to source.
    return self.feedforward(attended), attention




class CrossAttentionLayer(nn.Module):
    def __init__(self,dim,num_heads,dim_hidden,drop_rate):
        super().__init__()
        self.layer_norm = nn.LayerNorm(dim, eps=1e-6)
        self.context_attention = MultiHeadAttention(dim, num_heads, drop_rate)
        self.drop = nn.Dropout(drop_rate)
        self.feedforward = PositionwiseFeedForward(dim, dim_hidden, drop_rate)
        
    def forward(self,target,memory,layer_cache=None):
        
        cross_attn_target = self.layer_norm(target)
        attended, attention = self.context_attention(cross_attn_target,memory,memory,layer_cache=layer_cache,memory_attention=True)
        
        attended = target + self.drop(attended)
        
        return self.feedforward(attended),attention



layer_cache = {'memory_key': None, 'memory_value': None, 'self_key': None, 'self_value': None}

In [41]:
#model 






class CrossattnModel(pl.LightningModule):
    def __init__(self,hparams,dset):
        super().__init__()
        self.save_hyperparameters(hparams)
        #self.hparams = hparams
        #import pdb;pdb.set_trace()
        #self.net= Model(hparams)
        #self.hparams  = hparams
        self.positioner = SinusoidalPositioner(self.hparams.edim, drop_rate=0., length_max=1000)
        self.attn = CrossAttentionLayer(self.hparams.edim,self.hparams.nheads,\
                           self.hparams.attnhdim,self.hparams.dropoutp)
        self.wrdcnn =  nn.Conv1d(self.hparams.wrdim, self.hparams.edim, 1, stride=1)
        self.vidcnn =  nn.Conv1d(self.hparams.vidim, self.hparams.edim, 1, stride=1)
        self.hid_layer = nn.Linear(self.hparams.edim,self.hparams.hdim)
        self.out_layer = nn.Linear(self.hparams.hdim,1)
        self.init_parameters_xavier_uniform()
        self.dset = dset

    def forward(self,x):
        #keep this for inference
        out = self.net(x)
        return out
        
    def net(self,x):
        vid_x,wrd_x = x
        #import pdb;pdb.set_trace()
        wrd_x = wrd_x.unsqueeze(1).transpose(1,2)
        vid_x = vid_x.transpose(1,2)
        #print(f"inside model, wrd_x:{wrd_x.shape},vi")
        tgt = self.wrdcnn(wrd_x.float()).transpose(1,2)
        src = self.vidcnn(vid_x.float()).transpose(1,2)
        src_posencode = self.positioner(src)
        #for i in range(self.hparams.lyrs):
        attended,attn_score = self.attn(tgt,src_posencode)
            #tgt = 
        out = F.sigmoid(self.out_layer(F.relu(self.hid_layer(F.relu(attended)))))
        return out



    def training_step(self,batch,batch_idx):
        #for tranining
        vid_feat,wrd_feat,labels = batch
        x_hat = self.net((vid_feat.float(),wrd_feat.float()))
        #import pdb;pdb.set_trace()
        #loss = nn.BCELoss()
        loss = F.binary_cross_entropy(x_hat.squeeze().float(), labels.squeeze().float())
        #print(f"inside train step, loss:{loss}")
        self.log("train_loss",loss,on_step=True)
        return loss

    def validation_step(self,batch,batch_idx):
        #for validation
        vid_feat,wrd_feat,labels = batch
        x_hat = self.net((vid_feat.float(),wrd_feat.float()))
        #import pdb;pdb.set_trace()
        loss = F.binary_cross_entropy(x_hat.squeeze().float(), labels.squeeze().float())
        #print(f"inside train step, loss:{loss}")
        self.log("val_loss",loss,on_step=False, on_epoch=True)
        return loss


    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    
    def init_parameters_xavier_uniform(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def train_dataloader(self):
        ind = self.current_epoch % self.hparams.seqlen
        self.dset.id = ind
        self.dset.update_data()
        print(f"train dataloader called with ind:{ind}")
        return DataLoader(self.dset,self.hparams.batch_sz,shuffle=True,num_workers=10)
        



In [35]:
from argparse import Namespace
hparams = Namespace(
    edim = 100,
    attnhdim = 50,
    nheads = 10,
    wrdim = 768,
    vidim = 512,
    hdim = 30,
    dropoutp=0.1,
    seqlen=25,
    framecnt=500,
    batch_sz=64


)

In [21]:
#seqlen=25,framecnt=500,id=0

0

In [44]:
log_dir = '/common/home/vk405/Projects/Crossmdl/nbs/lightning_logs/'

from pytorch_lightning.loggers import CSVLogger
version = '2'
csvlogger = CSVLogger(log_dir,version)
trainer = pl.Trainer(logger= csvlogger,\
    gpus=1,max_epochs=15,reload_dataloaders_every_epoch=True)
#youcoodvld_dl = DataLoader(youcookvld2,batch_size=64,shuffle=False,num_workers=10)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [50]:
#trainer.fit(model,youcookdl)
#model = CrossattnModel(hparams)

youcookdata = YoucookDset2()


import torch.utils.data as data_utils
#indices = torch.arange(1000)
#youcookdatasubset = data_utils.Subset(youcookdata, indices)
youcookdl = DataLoader(youcookdata,batch_size=64,shuffle=True,num_workers=10)



model = CrossattnModel(hparams,youcookdata)


In [49]:
#sanity checking

for batch in youcookdl:
    batch[0].shape
    break

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/common/home/vk405/miniconda3/envs/Crossmdl/lib/python3.8/site-packages/pytorch_lightning/utilities/auto_restart.py", line 474, in _capture_metadata_collate
    data = default_collate(samples)
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 84, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 84, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 64, in default_collate
    return default_collate([torch.as_tensor(b) for b in batch])
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [25, 512] at entry 0 and [24, 512] at entry 31


In [47]:
trainer.fit(model,youcookdl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name       | Type                 | Params
----------------------------------------------------
0 | positioner | SinusoidalPositioner | 0     
1 | attn       | CrossAttentionLayer  | 51.0 K
2 | wrdcnn     | Conv1d               | 76.9 K
3 | vidcnn     | Conv1d               | 51.3 K
4 | hid_layer  | Linear               | 3.0 K 
5 | out_layer  | Linear               | 31    
----------------------------------------------------
182 K     Trainable params
0         Non-trainable params
182 K     Total params
0.729     Total estimated model params size (MB)


Epoch 0:   0%|          | 0/16 [00:00<?, ?it/s] 

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/common/home/vk405/miniconda3/envs/Crossmdl/lib/python3.8/site-packages/pytorch_lightning/utilities/auto_restart.py", line 474, in _capture_metadata_collate
    data = default_collate(samples)
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 84, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 84, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 64, in default_collate
    return default_collate([torch.as_tensor(b) for b in batch])
  File "/common/home/vk405/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [25, 512] at entry 0 and [24, 512] at entry 40
