In [2]:
# pad your sequences
import torch
from torch.nn.utils.rnn import pad_sequence
target = torch.randn(3,4)
query = torch.randn(1,4)



out = pad_sequence([target,query],batch_first=True)
out.shape

torch.Size([2, 3, 4])

In [3]:
# get a list of data in your dataloader
# len(data) is not divisible by batch_size on purpose to verify consistency across batch sizes
#data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False,\
#  collate_fn=lambda x: x)

In [21]:

batch = [torch.rand(23,4),torch.rand(10,4),torch.rand(2,4)]

sq_lens = list(map(lambda x:x.size(0),batch))
ln_key = batch[0].size(-1)
# mask = (batch_size, 1, length_key)(all queries have same mask)
mask = torch.ones(len(sq_lens),1,max(sq_lens))



In [22]:
mask.shape

torch.Size([3, 1, 23])

In [24]:
for ind,ele in enumerate(sq_lens):
    mask[ind,:,:ele] = 0.0

In [26]:
mask[2]

tensor([[0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1.]])

In [27]:
mask.shape

torch.Size([3, 1, 23])

In [5]:
# #sample masking

# target = torch.randn(2,3,4)
# query = torch.randn(2,1,4)
# mask = torch.tensor(np.stack([np.array([[True,True,False]]),np.array([[True,False,False]])]))

# attn_dec_layer = MultiHeadAttention(4, 1, dropout_rate=0) 






[23, 10, 2]

## Making the dataset of balanced size

In [29]:
import torch
import numpy as np
import os
import json
import joblib
from torch.utils.data import Dataset,DataLoader
from itertools import repeat
import pandas as pd
import math
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger

In [31]:
#utils

def get_vids(base_dir,split):
    trn_split = base_dir+split
    trn_idlst = []
    trn_vidlst = []

    f = open(trn_split,'r')
    for line in f:
        id_,vid = line.split('/')
        vid = vid.strip('\n')
        trn_idlst.append(id_)
        trn_vidlst.append(vid)
        #print(vid)
        #break
    f.close()
    return trn_idlst,trn_vidlst

    
def get_features(data_dir,split='val',feat_dir='/common/users/vk405/feat_csv/'):
    #feat_dir = data_dir
    splits_dir = data_dir+'splits/'
    if split == 'val':
        feat_split_dir = feat_dir+'val_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'val_list.txt')  
    elif split == 'train':
        feat_split_dir = feat_dir+'train_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'train_list.txt') 
    elif split == 'test':
        feat_split_dir = feat_dir+'test_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'test_list.txt')
    else:
        raise NotImplementedError(f'unknown split: {split}')     
    feat_list = {}
    vid_dtls = []
    for num,name in zip(vid_num,vid_name):
        feat_loc = os.path.join(feat_split_dir, f'{num}/{name}/0001/')
        #import pdb;pdb.set_trace()
        if os.path.isdir(feat_loc):
            feat_files = feat_loc + os.listdir(feat_loc)[0]
            feat_list[name] = feat_files
            #feat_list.append(feat_files)
            vid_dtls.append((num,name))
        else:
            print(f"video : {num}/{name} not found")
    assert len(feat_list) == len(vid_dtls),"get-features is giving incorrect features"
    return feat_list,vid_dtls






def get_raw_labels(ids,annotns_file):

    label_info = {}
    with open(annotns_file) as json_file:
        annotns = json.load(json_file)
        print(annotns.keys())
        for _,vidname in ids:
            #import pdb;pdb.set_trace()
            if vidname in annotns['database']:
                #import pdb;pdb.set_trace()
                duration = annotns['database'][vidname]['duration']
                annot = annotns['database'][vidname]['annotations']
                labels = []
                #import pdb;pdb.set_trace()
                for segment_info in annot:
                    interval = segment_info['segment']
                    sent = segment_info['sentence']
                    labels.append((interval,sent,duration))

                label_info[vidname] = labels
            else:
                print(f"label for {vidname} not present")
    return label_info

def regress_labels(raw_labels):
    regress_labels = {}
    for key in raw_labels:
        new_labels = []
        for item in raw_labels[key]:
            rng,sent,vidlen = item
            mid = sum(rng)/2
            duration = rng[-1]-rng[0]
            mid_pred = (1/vidlen)*mid # location of mid-point w.r.t video length
            duration_pred = (1/vidlen)*duration
            new_labels.append(([mid_pred,duration_pred],sent))
        regress_labels[key] = new_labels
    return regress_labels
            
            
    
    

def get_labels(ids,annotns_file):

    label_info = {}
    with open(annotns_file) as json_file:
        annotns = json.load(json_file)
        #print(annotns.keys())
        for _,vidname in ids:
            #import pdb;pdb.set_trace()
            if vidname in annotns:
                #import pdb;pdb.set_trace()
                duration = annotns[vidname]['duration']
                annot = annotns[vidname]['annotations']
                labels = []
                #import pdb;pdb.set_trace()
                for segment_info in annot:
                    interval = segment_info['segment']
                    st_end = [interval[0],interval[-1]]
                    sent = segment_info['sentence']
                    labels.append((st_end,sent,duration))

                label_info[vidname] = labels
            else:
                print(f"label for {vidname} not present")
    return label_info
    


In [32]:


#dataset
# Dataset/loader
# This is newer version
class YoucookDset2(Dataset):
    def __init__(self,data_dir='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/'\
        ,split='train',use_precomp_emb=True,seqlen=26,framecnt=499,id=0):
        self.id = id
        self.feat_locs = {}
        self.split = split
        self.data_dir = data_dir
        self.use_precomp_emb = use_precomp_emb
        self.text_emb = None
        self.seqlen = seqlen
        self.framecnt = framecnt
        if self.split != 'test':
            self.annotns_file = data_dir+'annotations/segment_youcookii_annotations_trainval.json'
        else:
            raise NotImplementedError(f"Split:{self.split},not yet correctly implemented")
        if self.use_precomp_emb:
            self.txt_emb = joblib.load(os.path.join(self.data_dir,'emb.joblib'))
        #feat_locs = {'Ysh60eirChU': location of the video}
        self.feat_locs,vids = get_features(self.data_dir,split=self.split)
        assert len(vids) == len(self.feat_locs),"features are wrong"
        #import pdb;pdb.set_trace()
        label_info = get_labels(vids,self.annotns_file)
        #self.labelencoder = LabelEncoder2()
        self.final_labels = label_info
        #self.labelencoder.fit_transform(label_info)
        
        #regress_labels(label_info)
        #(vid_id,seg_id)
        self.update_data()

                
            
    def __len__(self):
        return len(self.data)

    def update_data(self,id=None):
        self.data = []
        #self.vid_len = []
        if not id:
            id = self.id
            
        starting_pnt = np.arange(id,self.framecnt,self.seqlen)

        for key in self.final_labels:
            annot_len = len(self.final_labels[key])
            if key in self.feat_locs:
                file_loc = self.feat_locs[key]
                #for stpnt in starting_pnt:
                segments = list(zip(repeat(key,annot_len),repeat(file_loc,annot_len),\
                        range(annot_len)))
                for seg in segments:
                    for stpnt in starting_pnt:
                        if stpnt+self.seqlen<=self.framecnt:
                            datapnt = seg[:-1]+(stpnt,)+seg[-1:]
                            self.data.append(datapnt)
                    
                #self.data.extend(segments)
            else:
                print(f"video:{key} not found")


    def getclass_prob(self,lbl_rng,frame_rng):
        lbl_ids = set(np.arange(lbl_rng[0],lbl_rng[-1]+1))
        frame_ids = set(np.arange(frame_rng[0],frame_rng[-1]+1))
        inter = lbl_ids.intersection(frame_ids)
        if len(inter) == 0:
            return 0.0
        else:
            return len(inter)/len(lbl_ids.union(frame_ids))

    def __getitem__(self,idx):
        if self.use_precomp_emb:
            vidname,file_loc,stid,seg_ind = self.data[idx]
            #import pdb;pdb.set_trace()
            #self.txt_emb[vidname][seg_ind],
            txt_info = self.final_labels[vidname][seg_ind]
            label_value = self.getclass_prob(txt_info[0],(stid,stid+self.seqlen-1))
            #import pdb;pdb.set_trace()
            return pd.read_csv(file_loc).values.astype(np.float32)[stid:stid+self.seqlen,:],(self.txt_emb[vidname][seg_ind]).astype(np.float32),\
                label_value
            #np.array(self.final_labels[vidname][seg_ind][0],dtype=np.float32)
        else:
            raise NotImplementedError("not yet correctly implemented")

        

           



        

In [33]:
youcookdata = YoucookDset2()

In [37]:
youcookdata.final_labels['Ysh60eirChU']

[([98, 102], 'heat 2 tbsp ghee in a pan', 609.97),
 ([105, 162],
  'add cinnamon bay leaves green cardamoms black cardamoms green chillies and saute',
  609.97),
 ([182, 200], 'add onions and saute for 3-4 minutes', 609.97),
 ([215, 245], 'add mutton and saute for 2 minutes', 609.97),
 ([247, 252], 'add ginger-garlic paste and mix well', 609.97),
 ([253, 281],
  'add salt 2 cup water and cover to pressure cook on high heat for 5 minutes',
  609.97),
 ([323, 350],
  'heat crushed peppercorns and cashew nut paste with remaining ghee in a pan',
  609.97),
 ([389, 409],
  'add the cooked mutton with stock and spices and mix everything well',
  609.97),
 ([416, 433], 'add garam masala powder cream and stir to mix', 609.97),
 ([466, 469], 'sprinkle crushed peppercorns on top and serve', 609.97)]

In [45]:
youcookdata.data[0]

('Ysh60eirChU',
 '/common/users/vk405/feat_csv/train_frame_feat_csv/405/Ysh60eirChU/0001/resnet_34_feat_mscoco.csv',
 0,
 0)

In [47]:
youcookdata.feat_locs['Ysh60eirChU']

'/common/users/vk405/feat_csv/train_frame_feat_csv/405/Ysh60eirChU/0001/resnet_34_feat_mscoco.csv'

In [40]:
import pandas as pd

spl = pd.read_csv('/common/users/vk405/feat_csv/train_frame_feat_csv/405/Ysh60eirChU/0001/resnet_34_feat_mscoco.csv')

In [42]:
spl.shape

(499, 512)

In [44]:
youcookdata.data[1]

('Ysh60eirChU',
 '/common/users/vk405/feat_csv/train_frame_feat_csv/405/Ysh60eirChU/0001/resnet_34_feat_mscoco.csv',
 26,
 0)

In [49]:
st = 10
end = 50


In [64]:
from collections import namedtuple
#Point = namedtuple("vid_id", "vid_loc","start","end","segid","label")

def overlap_frac(base_rng,tst_rng):
    #1.Returns the fraction of frames that are overlapping in tst_rng with base_rng
    #2.both ends inclusive
    sz = tst_rng[-1]-tst_rng[0]+1
    lbl_ids = set(np.arange(base_rng[0],base_rng[-1]+1))
    frame_ids = set(np.arange(tst_rng[0],tst_rng[-1]+1))
    inter = frame_ids.intersection(lbl_ids)
    assert sz != 0,"base frame rng is zero"
    return len(inter)/sz
    

data = []
max_cnt = 50
for key in youcookdata.final_labels:
    segments = youcookdata.final_labels[key]
    for ind,seg in enumerate(segments):
        #trn_points = []
        st_end,txt,vid_len = seg
        main_seg = (key,youcookdata.feat_locs[key],st_end[0],st_end[-1],ind,1.0)
        data.append(main_seg)
        frame_width = st_end[-1]-st_end[0] + 1
        extra_frames = []
        for cnt,new_st in enumerate(range(st_end[0]+1,st_end[-1]+1)):
            #forward sliding
            new_end = new_st+frame_width
            if (cnt<max_cnt)and (0<=new_st<youcookdata.framecnt and 0<=new_st<youcookdata.framecnt):
                extra_frames.append((new_st,new_end))
        for cnt,new_end in enumerate(range(st_end[-1],st_end[0],-1)):
            #backward sliding
            new_st = new_end-frame_width
            if (cnt<max_cnt)and (0<=new_st<youcookdata.framecnt and 0<=new_st<youcookdata.framecnt):
                extra_frames.append((new_st,new_end))
        #import pdb;pdb.set_trace()
        for ex_seg in extra_frames:
            label = overlap_frac(st_end,ex_seg)
            data.append((key,youcookdata.feat_locs[key],ex_seg[0],ex_seg[-1],ind,label))


        

        

        
        

    

In [68]:
len(data)

573897

In [70]:
label_dist = [ele[-1] for ele in data]

In [71]:

df = pd.DataFrame({'percentile_ind':np.linspace(0,100,25),
 'percentile':[np.percentile(label_dist,p) for p in np.linspace(0,100,25)]})

In [73]:
# all values are equally distributed
#df

In [103]:


#dataset
# Dataset/loader
# This is newer version
class YoucookDset2(Dataset):
    def __init__(self,data_dir='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/'\
        ,split='train',use_precomp_emb=True,seqlen=26,framecnt=499,id=0):
        self.id = id
        self.feat_locs = {}
        self.split = split
        self.data_dir = data_dir
        self.use_precomp_emb = use_precomp_emb
        self.text_emb = None
        self.seqlen = seqlen
        self.framecnt = framecnt
        if self.split != 'test':
            self.annotns_file = data_dir+'annotations/segment_youcookii_annotations_trainval.json'
        else:
            raise NotImplementedError(f"Split:{self.split},not yet correctly implemented")
        if self.use_precomp_emb:
            self.txt_emb = joblib.load(os.path.join(self.data_dir,'emb.joblib'))
        #feat_locs = {'Ysh60eirChU': location of the video}
        self.feat_locs,vids = get_features(self.data_dir,split=self.split)
        assert len(vids) == len(self.feat_locs),"features are wrong"
        #import pdb;pdb.set_trace()
        #label_info = get_labels(vids,self.annotns_file)
        #self.labelencoder = LabelEncoder2()
        self.final_labels = get_labels(vids,self.annotns_file)
        #self.labelencoder.fit_transform(label_info)
        
        #regress_labels(label_info)
        #(vid_id,seg_id)
        self.data = self.update_data()

                
            
    def __len__(self):
        return len(self.data)

    def overlap_frac(self,base_rng,tst_rng):
        #1.Returns the fraction of frames that are overlapping in tst_rng with base_rng
        #2.both ends inclusive
        sz = tst_rng[-1]-tst_rng[0]+1
        lbl_ids = set(np.arange(base_rng[0],base_rng[-1]+1))
        frame_ids = set(np.arange(tst_rng[0],tst_rng[-1]+1))
        inter = frame_ids.intersection(lbl_ids)
        assert sz != 0,"base frame rng is zero"
        return len(inter)/sz


    def update_data(self):
        data = []
        max_cnt = 50
        for key in self.final_labels:
            segments = self.final_labels[key]
            for ind,seg in enumerate(segments):
                #trn_points = []
                st_end,txt,vid_len = seg
                main_seg = (key,self.feat_locs[key],st_end[0],st_end[-1],ind,1.0)
                data.append(main_seg)
                frame_width = st_end[-1]-st_end[0] + 1
                extra_frames = []
                for cnt,new_st in enumerate(range(st_end[0]+1,st_end[-1]+1)):
                    #forward sliding
                    new_end = new_st+frame_width
                    if (cnt<max_cnt)and (0<=new_st<self.framecnt and 0<=new_st<self.framecnt):
                        extra_frames.append((new_st,new_end))
                for cnt,new_end in enumerate(range(st_end[-1],st_end[0],-1)):
                    #backward sliding
                    new_st = new_end-frame_width
                    if (cnt<max_cnt)and (0<=new_st<self.framecnt and 0<=new_st<self.framecnt):
                        extra_frames.append((new_st,new_end))
                #import pdb;pdb.set_trace()
                for ex_seg in extra_frames:
                    label = self.overlap_frac(st_end,ex_seg)
                    data.append((key,self.feat_locs[key],ex_seg[0],ex_seg[-1],ind,label))
        return data

    def __getitem__(self,idx):
        return self.data[idx]
        

           



        

In [104]:
ydata = YoucookDset2()

In [105]:
ydata[0]

('Ysh60eirChU',
 '/common/users/vk405/feat_csv/train_frame_feat_csv/405/Ysh60eirChU/0001/resnet_34_feat_mscoco.csv',
 98,
 102,
 0,
 1.0)

In [75]:
from torch.utils.data import Dataset,TensorDataset
inps = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
tgts = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
dataset = TensorDataset(inps, tgts)

In [99]:
class ToyDataset(Dataset):
    def __init__(self):
        self.data = ['h','l','e','o']
    def __len__(self):
        return len(self.data)
    def __getitem__(self,idx):
        return self.data[idx]

dataset = ToyDataset()

In [86]:
len(dataset)

4

In [168]:
from collections import defaultdict
data_dir='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/'
global_txt = joblib.load(os.path.join(data_dir,'emb.joblib'))
def collate_wrapper(data):
    labels = []
    vid_embs = []
    txt_embs = []
    #namedtuple("vid_id", "vid_loc","start","end","segid","label")
    batched_data = pd.DataFrame(data,columns=["vid_id", "vid_loc","start","end","segid","label"])
    unique_locs = batched_data['vid_loc'].unique()
    for loc in unique_locs:
        locwise = batched_data[batched_data['vid_loc']==loc]
        tot_vid = pd.read_csv(loc).values
        txtemb = None
        for ind,ele in locwise.iterrows():
            vid_id,_,st,end,segid,label = ele
            #import pdb;pdb.set_trace()
            if not txtemb:
                txtemb = global_txt[ele['vid_id']]
            vid_embs.append(torch.tensor(tot_vid[ele['start']:ele['end']+1]))
            txt_embs.append(torch.tensor(txtemb[ele['segid']]))
            labels.append(torch.tensor(ele['label']))
    #return (np.stack(vid_embs),np.stack(txt_embs)),np.stack(labels)
    return (vid_embs,txt_embs),labels
        
   

In [169]:
toy = [ydata[i] for i in range(5)]

In [170]:
out = collate_wrapper(toy)

In [166]:
out[0][1][4].shape

(768,)

In [171]:
vid_o = pad_sequence(out[0][0],batch_first=True)

In [172]:
vid_o.shape

torch.Size([5, 6, 512])

In [135]:
len(global_txt['GLd3aX16zBg'])

5

In [101]:
loader = DataLoader(ydata, batch_size=2, collate_fn=collate_wrapper)

In [146]:
d = [(1,'st'),(1,'john'),(3,'st')]

out = pd.DataFrame(d,columns=['cnt','str'])

In [147]:
out

Unnamed: 0,cnt,str
0,1,st
1,1,john
2,3,st


In [152]:
for id,ele in out.iterrows():
    print(ele['str'])

st
john
st


In [131]:
ydata[0][1].split('/')[-3]

'Ysh60eirChU'

In [123]:
a = np.array([0,8,6])

In [125]:
a[[0,0,1,1,2]]

array([0, 0, 8, 8, 6])

In [112]:
out

Unnamed: 0,cnt,str
0,1,st
1,1,john
2,3,st


In [98]:
batch

[None, None]

In [None]:
class SimpleCustomBatch:
    def __init__(self, data):
        transposed_data = list(zip(*data))
        self.inp = torch.stack(transposed_data[0], 0)
        self.tgt = torch.stack(transposed_data[1], 0)

    # custom memory pinning method on custom type
    def pin_memory(self):
        self.inp = self.inp.pin_memory()
        self.tgt = self.tgt.pin_memory()
        return self

def collate_wrapper(batch):
    return SimpleCustomBatch(batch)


loader = DataLoader(dataset, batch_size=2, collate_fn=collate_wrapper,
                    pin_memory=True)

In [52]:
def overlap_frac(base_rng,tst_rng):
    #1.Returns the fraction of frames that are overlapping in tst_rng with base_rng
    #2.both ends inclusive
    sz = tst_rng[-1]-tst_rng[0]+1
    lbl_ids = set(np.arange(base_rng[0],base_rng[-1]+1))
    frame_ids = set(np.arange(tst_rng[0],tst_rng[-1]+1))
    inter = frame_ids.intersection(lbl_ids)
    assert sz != 0,"base frame rng is zero"
    return len(inter)/sz
    

In [53]:
overlap_frac((10,20),(15,25))

{15, 16, 17, 18, 19, 20}


0.5454545454545454