In [2]:
import torch
import numpy as np
import os
import json
import joblib
from torch.utils.data import Dataset,DataLoader
from itertools import repeat
import pandas as pd
import math
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# testing gpu
x_big = torch.randn(100000)
x_big_gpu = x_big.cuda()

In [3]:
x_big_gpu

tensor([-5.3268e-01,  8.9664e-01, -1.9666e-04,  ...,  1.0013e+00,
        -7.8356e-01, -4.9873e-01], device='cuda:0')

In [3]:
#utils

def get_vids(base_dir,split):
    trn_split = base_dir+split
    trn_idlst = []
    trn_vidlst = []

    f = open(trn_split,'r')
    for line in f:
        id_,vid = line.split('/')
        vid = vid.strip('\n')
        trn_idlst.append(id_)
        trn_vidlst.append(vid)
        #print(vid)
        #break
    f.close()
    return trn_idlst,trn_vidlst

    
def get_features(data_dir,split='val',feat_dir='/common/users/vk405/feat_csv/'):
    #feat_dir = data_dir
    splits_dir = data_dir+'splits/'
    if split == 'val':
        feat_split_dir = feat_dir+'val_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'val_list.txt')  
    elif split == 'train':
        feat_split_dir = feat_dir+'train_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'train_list.txt') 
    elif split == 'test':
        feat_split_dir = feat_dir+'test_frame_feat_csv/'  
        vid_num,vid_name = get_vids(splits_dir,'test_list.txt')
    else:
        raise NotImplementedError(f'unknown split: {split}')     
    feat_list = {}
    vid_dtls = []
    for num,name in zip(vid_num,vid_name):
        feat_loc = os.path.join(feat_split_dir, f'{num}/{name}/0001/')
        #import pdb;pdb.set_trace()
        if os.path.isdir(feat_loc):
            feat_files = feat_loc + os.listdir(feat_loc)[0]
            feat_list[name] = feat_files
            #feat_list.append(feat_files)
            vid_dtls.append((num,name))
        else:
            print(f"video : {num}/{name} not found")
    assert len(feat_list) == len(vid_dtls),"get-features is giving incorrect features"
    return feat_list,vid_dtls






def get_raw_labels(ids,annotns_file):

    label_info = {}
    with open(annotns_file) as json_file:
        annotns = json.load(json_file)
        print(annotns.keys())
        for _,vidname in ids:
            #import pdb;pdb.set_trace()
            if vidname in annotns['database']:
                #import pdb;pdb.set_trace()
                duration = annotns['database'][vidname]['duration']
                annot = annotns['database'][vidname]['annotations']
                labels = []
                #import pdb;pdb.set_trace()
                for segment_info in annot:
                    interval = segment_info['segment']
                    sent = segment_info['sentence']
                    labels.append((interval,sent,duration))

                label_info[vidname] = labels
            else:
                print(f"label for {vidname} not present")
    return label_info

def regress_labels(raw_labels):
    regress_labels = {}
    for key in raw_labels:
        new_labels = []
        for item in raw_labels[key]:
            rng,sent,vidlen = item
            mid = sum(rng)/2
            duration = rng[-1]-rng[0]
            mid_pred = (1/vidlen)*mid # location of mid-point w.r.t video length
            duration_pred = (1/vidlen)*duration
            new_labels.append(([mid_pred,duration_pred],sent))
        regress_labels[key] = new_labels
    return regress_labels
            
            
    
    
    


In [152]:
from sklearn.base import BaseEstimator, TransformerMixin

class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.vidlens = []
        self.truebounds = []

    def fit(self,raw_labels):
        l = []
        for key in raw_labels:
            vid_len = raw_labels[key][0][-1]
            sz = len(raw_labels[key])
            for i in range(sz):l.append(vid_len)
        self.vidlens = np.array(l)
        return self
        
    def transform(self,raw_labels):
        regress_labels = self._regress_labels(raw_labels)
        return regress_labels

    def decode(self,outputs):
        cent_wid = np.expand_dims(self.vidlens,1)*outputs
        width = cent_wid[:,-1]
        center = cent_wid[:,0]
        left = center - width//2
        right = center + (width-(width//2))
        return np.concatenate([np.expand_dims(left,1),np.expand_dims(right,1)],1)


    def _regress_labels(self,raw_labels):
        regress_labels = {}
        bounds = []
        for key in raw_labels:
            new_labels = []
            for item in raw_labels[key]:
                rng,sent,vidlen = item
                bounds.append(rng)
                mid = sum(rng)/2
                duration = rng[-1]-rng[0]
                mid_pred = (1/vidlen)*mid # location of mid-point w.r.t video length
                duration_pred = (1/vidlen)*duration
                new_labels.append(([mid_pred,duration_pred],sent))
            regress_labels[key] = new_labels
        self.truebounds = np.array(bounds)
        return regress_labels


In [7]:
lenc = LabelEncoder()


In [64]:
out_labels = lenc.fit_transform(label_info)

In [65]:
sam = np.random.randn(10337,2)
dec = lenc.decode(sam)

> [0;32m<ipython-input-62-b6331d5c019f>[0m(27)[0;36mdecode[0;34m()[0m
[0;32m     25 [0;31m        [0mright[0m [0;34m=[0m [0mcenter[0m [0;34m+[0m [0;34m([0m[0mwidth[0m[0;34m-[0m[0;34m([0m[0mwidth[0m[0;34m//[0m[0;36m2[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     26 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m[0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 27 [0;31m        [0;32mreturn[0m [0mnp[0m[0;34m.[0m[0mconcatenate[0m[0;34m([0m[0;34m[[0m[0mnp[0m[0;34m.[0m[0mexpand_dims[0m[0;34m([0m[0mleft[0m[0;34m,[0m[0;36m1[0m[0;34m)[0m[0;34m,[0m[0mnp[0m[0;34m.[0m[0mexpand_dims[0m[0;34m([0m[0mright[0m[0;34m,[0m[0;36m1[0m[0;34m)[0m[0;34m][0m[0;34m,[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     28 [0;31m[0;34m[0m[0m
[0m[0;32m     29 [0;31m[0;34m[0m[0m
[0m


In [66]:
dec.shape

(10337, 2)

## Models

In [4]:
#!pip install transformers

def init_parameters_xavier_uniform(model):
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

def scaled_dot(query, key, mask_key=None):  
    score = torch.matmul(query, key.transpose(-2, -1))
    score /= math.sqrt(query.size(-1))
    if mask_key is not None:
        score = score.masked_fill(mask_key, -1e18)  # Represents negative infinity
    return score      
            
def attend(query, key, value, mask_key=None, dropout=None):
    # TODO: Implement
    # Use scaled_dot, be sure to mask key
    #smax = nn.Softmax(-1)
    #import pdb;pdb.set_trace()
    score = scaled_dot(query,key,mask_key)  
    attention = F.softmax(score,dim=-1)
    if dropout is not None:#do = nn.Dropout(dropout)
        attention = dropout(attention)
    answer = torch.matmul(attention,value) 
    # Convexly combine value embeddings using attention, this should be just a matrix-matrix multiplication.
    return answer, attention



def split_heads(batch, num_heads):  
    (batch_size, length, dim) = batch.size()  # These are the expected batch dimensions.
    assert dim % num_heads == 0  # Assert that dimension is divisible by the number of heads.
    dim_head = dim // num_heads

    # No new memory allocation
    splitted = batch.view(batch_size, -1, num_heads, dim_head).transpose(1, 2)  
    return splitted  # (batch_size, num_heads, length, dim_head), note that now the last two dimensions are compatible with our attention functions. 




def merge_heads(batch):  
    (batch_size, num_heads, length, dim_head) = batch.size()  # These are the expected batch dimensions.

    # New memory allocation (reshape), can't avoid.
    merged = batch.transpose(1, 2).reshape(batch_size, -1, num_heads * dim_head)
    return merged  # (batch_size, length, dim)


class MultiHeadAttention(nn.Module):
    def __init__(self, dim, num_heads, dropout_rate=0.1):
        super().__init__()
        assert dim % num_heads == 0

        self.linear_query = nn.Linear(dim, dim)
        self.linear_key = nn.Linear(dim, dim)
        self.linear_value = nn.Linear(dim, dim)
        self.linear_final = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout_rate)

        self.num_heads = num_heads

    def forward(self, query, key, value, mask_key=None, layer_cache=None,
              memory_attention=False):
        """
        INPUT
          query: (batch_size, length_query, dim)
          key: (batch_size, length_key, dim)
          value: (batch_size, length_key, dim_value)
          mask_key: (*, 1, length_key) if queries share the same mask, else
                    (*, length_query, length_key)
          layer_cache: if not None, stepwise decoding (cache of key/value)
          memory_attention: doing memory attention in stepwise decoding?
        OUTPUT
          answer: (batch_size, length_query, dim_value)
          attention: (batch_size, num_heads, length_query, length_key) else
        """
        batch_size = query.size(0)

        query = self.linear_query(query)
        query = split_heads(query, self.num_heads)  # (batch_size, num_heads, -1, dim_head)

        def process_key_value(key, value):  # Only called when necessary.
            key = self.linear_key(key)
            key = split_heads(key, self.num_heads)
            value = self.linear_value(value)
            value = split_heads(value, self.num_heads)
            return key, value

        #import pdb;pdb.set_trace()
        if layer_cache is None:
            key, value = process_key_value(key, value)
        else:
            assert query.size(2) == 1  # Stepwise decoding
            
            if memory_attention:
                if layer_cache['memory_key'] is None:  # One-time calculation
                    key, value = process_key_value(key, value)
                    # (batch_size, num_heads, length_memory, dim)
                    layer_cache['memory_key'] = key
                    layer_cache['memory_value'] = value

                key = layer_cache['memory_key']
                value = layer_cache['memory_value']

            else:  # Self-attention during decoding
                key, value = process_key_value(key, value)
                assert key.size(2) == 1 and value.size(2) == 1
                
                # Append to previous.
                if layer_cache['self_key'] is not None:
                    key = torch.cat((layer_cache['self_key'], key), dim=2)
                    value = torch.cat((layer_cache['self_value'], value), dim=2)
                    
                 # (batch_size, num_heads, length_decoded, dim)
                layer_cache['self_key'] = key  # Recache.
                layer_cache['self_value'] = value
        # Because we've splitted embeddings into heads, we must also split the mask. 
        # And because each query uses the same mask for all heads (we don't use different masking for different heads), 
        # we can specify length 1 for the head dimension.
        if mask_key is not None:  
            mask_key = mask_key.unsqueeze(1)  # (batch_size, 1, -1, length_key)

        answer, attention = attend(query, key, value, mask_key, self.dropout)

        answer = merge_heads(answer)  # (batch_size, length_key, dim)
        answer = self.linear_final(answer)

        return answer, attention

class PositionwiseFeedForward(nn.Module):
    def __init__(self, dim, dim_hidden, drop_rate=0.1):
        super().__init__()
        self.w1 = nn.Linear(dim, dim_hidden)
        self.w2 = nn.Linear(dim_hidden, dim)
        self.layer_norm = nn.LayerNorm(dim, eps=1e-6)
        self.drop1 = nn.Dropout(drop_rate)
        self.relu = nn.ReLU()
        self.drop2 = nn.Dropout(drop_rate)
    def forward(self, x):
        inter = self.drop1(self.relu(self.w1(self.layer_norm(x))))
        output = self.drop2(self.w2(inter))
        return output + x




class SinusoidalPositioner(nn.Module):
    def __init__(self, dim, drop_rate=0.1, length_max=5000):
        super().__init__()
        frequency = torch.exp(torch.arange(0, dim, 2) * -(math.log(10000.) / dim))  # Using different frequency for each dim
        positions = torch.arange(0, length_max).unsqueeze(1)
        wave = torch.zeros(length_max, dim)
        wave[:, 0::2] = torch.sin(frequency * positions)
        wave[:, 1::2] = torch.cos(frequency * positions)
        self.register_buffer('wave', wave.unsqueeze(0))  # (1, length_max, dim)
        self.dropout = nn.Dropout(drop_rate)
        self.dim = dim
        self.length_max = length_max
    def forward(self, x, step=-1):
        assert x.size(-2) <= self.length_max

        if step < 0:  # Take the corresponding leftmost embeddings.
            position_encoding = self.wave[:, :x.size(-2), :]
        else:  # Take the embedding at the step.
            position_encoding = self.wave[:, step, :]

        x = x * math.sqrt(self.dim)
        return self.dropout(x + position_encoding)




class TransformerEncoderLayer(nn.Module):

  def __init__(self, dim, num_heads, dim_hidden, drop_rate):
    super().__init__()
    self.layer_norm = nn.LayerNorm(dim, eps=1e-6)
    self.self_attention = MultiHeadAttention(dim, num_heads, drop_rate)
    self.drop = nn.Dropout(drop_rate)
    self.feedforward = PositionwiseFeedForward(dim, dim_hidden, drop_rate)

  def forward(self, source, mask_source=None):
    # TODO: Implement
    #print(source.shape)
    normed = self.layer_norm(source)  
    # Apply layer norm on source

    attended, attention = self.self_attention(normed,normed,normed,mask_source)
    #None, None  # Apply self-attention on normed (be sure to use mask_source).
    attended = self.drop(attended) + source  
    # Re-write attended by applying dropout and adding a residual connection to source.
    return self.feedforward(attended), attention




class CrossAttentionLayer(nn.Module):
    def __init__(self,dim,num_heads,dim_hidden,drop_rate):
        super().__init__()
        self.layer_norm = nn.LayerNorm(dim, eps=1e-6)
        self.context_attention = MultiHeadAttention(dim, num_heads, drop_rate)
        self.drop = nn.Dropout(drop_rate)
        self.feedforward = PositionwiseFeedForward(dim, dim_hidden, drop_rate)
        
    def forward(self,target,memory,layer_cache=None):
        
        cross_attn_target = self.layer_norm(target)
        attended, attention = self.context_attention(cross_attn_target,memory,memory,layer_cache=layer_cache,memory_attention=True)
        
        attended = target + self.drop(attended)
        
        return self.feedforward(attended),attention



layer_cache = {'memory_key': None, 'memory_value': None, 'self_key': None, 'self_value': None}

In [48]:
positioner = SinusoidalPositioner(4, drop_rate=0., length_max=5)

In [49]:
positioner

SinusoidalPositioner(
  (dropout): Dropout(p=0.0, inplace=False)
)

In [147]:
mh_crossatn = CrossAttentionLayer(500,2,50,0.0)

source = torch.randn(1,500,500)
tgt = torch.randn(1,1,500)

In [149]:
output,_ = mh_crossatn(tgt,source)

In [150]:
output.shape

torch.Size([1, 1, 500])

In [79]:
total_params = lambda model : sum(p.numel() for p in model.parameters() if p.requires_grad)
#toomany params for a single layer.
total_params(mh_crossatn)

1002000

# Data-loading

In [5]:
#testing functions
data_dir = '/common/home/vk405/Projects/Crossmdl/Data/YouCookII/'
annotns_file = data_dir+'annotations/youcookii_annotations_trainval.json'

splits_dir = data_dir+'splits/'
splits = ['test_list.txt','train_list.txt','val_list.txt']
trn_feats,trn_vids = get_features(data_dir,split='train')
label_info = get_raw_labels(trn_vids,annotns_file)
final_labels = regress_labels(label_info)

dict_keys(['database'])


In [6]:
segannotns_file = data_dir+'annotations/segment_youcookii_annotations_trainval.json'
def get_labels(ids,annotns_file):

    label_info = {}
    with open(annotns_file) as json_file:
        annotns = json.load(json_file)
        #print(annotns.keys())
        for _,vidname in ids:
            #import pdb;pdb.set_trace()
            if vidname in annotns:
                #import pdb;pdb.set_trace()
                duration = annotns[vidname]['duration']
                annot = annotns[vidname]['annotations']
                labels = []
                #import pdb;pdb.set_trace()
                for segment_info in annot:
                    interval = segment_info['segment']
                    st_end = [interval[0],interval[-1]]
                    sent = segment_info['sentence']
                    labels.append((st_end,sent,duration))

                label_info[vidname] = labels
            else:
                print(f"label for {vidname} not present")
    return label_info

In [156]:
# This returns the labels after using video-frame as label instead of time.
out = get_labels(trn_vids,segannotns_file)

In [157]:
out['Ysh60eirChU']

[([98, 102], 'heat 2 tbsp ghee in a pan', 609.97),
 ([105, 162],
  'add cinnamon bay leaves green cardamoms black cardamoms green chillies and saute',
  609.97),
 ([182, 200], 'add onions and saute for 3-4 minutes', 609.97),
 ([215, 245], 'add mutton and saute for 2 minutes', 609.97),
 ([247, 252], 'add ginger-garlic paste and mix well', 609.97),
 ([253, 281],
  'add salt 2 cup water and cover to pressure cook on high heat for 5 minutes',
  609.97),
 ([323, 350],
  'heat crushed peppercorns and cashew nut paste with remaining ghee in a pan',
  609.97),
 ([389, 409],
  'add the cooked mutton with stock and spices and mix everything well',
  609.97),
 ([416, 433], 'add garam masala powder cream and stir to mix', 609.97),
 ([466, 469], 'sprinkle crushed peppercorns on top and serve', 609.97)]

In [18]:
pd.read_csv(trn_feats['Ysh60eirChU']).values.shape

(499, 512)

In [7]:
# Dataset/loader
# This is older version

class YoucookDset(Dataset):
    def __init__(self,data_dir='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/'\
        ,split='train',use_precomp_emb=True):
        self.feat_locs = {}
        self.split = split
        self.data_dir = data_dir
        self.use_precomp_emb = use_precomp_emb
        self.text_emb = None
        if self.split != 'test':
            self.annotns_file = data_dir+'annotations/youcookii_annotations_trainval.json'
        else:
            raise NotImplementedError(f"Split:{self.split},not yet correctly implemented")
        if self.use_precomp_emb:
            self.txt_emb = joblib.load(os.path.join(self.data_dir,'emb.joblib'))

        self.feat_locs,vids = get_features(self.data_dir,split=self.split)
        assert len(vids) == len(self.feat_locs),"features are wrong"
        #import pdb;pdb.set_trace()
        label_info = get_raw_labels(vids,self.annotns_file)
        self.labelencoder = LabelEncoder()
        self.final_labels = self.labelencoder.fit_transform(label_info)
        #regress_labels(label_info)
        #(vid_id,seg_id)
        self.data = []
        #self.vid_len = []
        for key in self.final_labels:
            annot_len = len(self.final_labels[key])
            if key in self.feat_locs:
                file_loc = self.feat_locs[key]
                segments = list(zip(repeat(key,annot_len),repeat(file_loc,annot_len),\
                    range(annot_len)))
                self.data.extend(segments)
            else:
                print(f"video:{key} not found")
                
            
    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        if self.use_precomp_emb:
            vidname,file_loc,seg_ind = self.data[idx]
            #import pdb;pdb.set_trace()
            #self.txt_emb[vidname][seg_ind],
            return pd.read_csv(file_loc).values.astype(np.float32),(self.txt_emb[vidname][seg_ind]).astype(np.float32),\
                np.array(self.final_labels[vidname][seg_ind][0],dtype=np.float32)
        else:
            raise NotImplementedError("not yet correctly implemented")


In [40]:
youcookdata = YoucookDset()

dict_keys(['database'])


In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class LabelEncoder2(BaseEstimator, TransformerMixin):
    def __init__(self,max_len=499):
        self.vidlens = []
        self.truebounds = []
        self.max_len = max_len

    def fit(self,raw_labels):
        l = []
        for key in raw_labels:
            vid_len = raw_labels[key][0][-1]
            sz = len(raw_labels[key])
            for i in range(sz):l.append(vid_len)
        self.vidlens = np.array(l)
        return self
        
    def transform(self,raw_labels):
        regress_labels = self._regress_labels(raw_labels)
        return regress_labels

    def decode(self,outputs):
        return np.round(outputs*self.max_len)


    def _regress_labels(self,raw_labels):
        regress_labels = {}
        bounds = []
        for key in raw_labels:
            new_labels = []
            for item in raw_labels[key]:
                #import pdb;pdb.set_trace()
                rng,sent,vidlen = item
                new_rng = [rng[0]/self.max_len,rng[-1]/self.max_len]
                bounds.append(rng)
                new_labels.append((new_rng,sent))
            regress_labels[key] = new_labels
        self.truebounds = np.array(bounds)
        return regress_labels


In [108]:
test_enc = LabelEncoder2()

In [109]:
out_trnsfmd = test_enc.fit_transform(out)

In [110]:
out_trnsfmd

{'Ysh60eirChU': [([0.1963927855711423, 0.20440881763527055],
   'heat 2 tbsp ghee in a pan'),
  ([0.21042084168336672, 0.3246492985971944],
   'add cinnamon bay leaves green cardamoms black cardamoms green chillies and saute'),
  ([0.36472945891783565, 0.40080160320641284],
   'add onions and saute for 3-4 minutes'),
  ([0.4308617234468938, 0.4909819639278557],
   'add mutton and saute for 2 minutes'),
  ([0.49498997995991983, 0.5050100200400801],
   'add ginger-garlic paste and mix well'),
  ([0.5070140280561122, 0.56312625250501],
   'add salt 2 cup water and cover to pressure cook on high heat for 5 minutes'),
  ([0.6472945891783567, 0.7014028056112225],
   'heat crushed peppercorns and cashew nut paste with remaining ghee in a pan'),
  ([0.779559118236473, 0.8196392785571143],
   'add the cooked mutton with stock and spices and mix everything well'),
  ([0.8336673346693386, 0.8677354709418837],
   'add garam masala powder cream and stir to mix'),
  ([0.9338677354709419, 0.939879759

In [79]:
test_enc.vidlens

array([609.97, 609.97, 609.97, ..., 156.22, 156.22, 156.22])

In [9]:
# Dataset/loader
# This is newer version
class YoucookDset2(Dataset):
    def __init__(self,data_dir='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/'\
        ,split='train',use_precomp_emb=True):
        self.feat_locs = {}
        self.split = split
        self.data_dir = data_dir
        self.use_precomp_emb = use_precomp_emb
        self.text_emb = None
        if self.split != 'test':
            self.annotns_file = data_dir+'annotations/segment_youcookii_annotations_trainval.json'
        else:
            raise NotImplementedError(f"Split:{self.split},not yet correctly implemented")
        if self.use_precomp_emb:
            self.txt_emb = joblib.load(os.path.join(self.data_dir,'emb.joblib'))

        self.feat_locs,vids = get_features(self.data_dir,split=self.split)
        assert len(vids) == len(self.feat_locs),"features are wrong"
        #import pdb;pdb.set_trace()
        label_info = get_labels(vids,self.annotns_file)
        self.labelencoder = LabelEncoder2()
        self.final_labels = self.labelencoder.fit_transform(label_info)
        
        #regress_labels(label_info)
        #(vid_id,seg_id)
        self.data = []
        #self.vid_len = []
        for key in self.final_labels:
            annot_len = len(self.final_labels[key])
            if key in self.feat_locs:
                file_loc = self.feat_locs[key]
                segments = list(zip(repeat(key,annot_len),repeat(file_loc,annot_len),\
                    range(annot_len)))
                self.data.extend(segments)
            else:
                print(f"video:{key} not found")
                
            
    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        if self.use_precomp_emb:
            vidname,file_loc,seg_ind = self.data[idx]
            #import pdb;pdb.set_trace()
            #self.txt_emb[vidname][seg_ind],
            return pd.read_csv(file_loc).values.astype(np.float32),(self.txt_emb[vidname][seg_ind]).astype(np.float32),\
                np.array(self.final_labels[vidname][seg_ind][0],dtype=np.float32)
        else:
            raise NotImplementedError("not yet correctly implemented")

        

           



        

In [112]:
youcookdata2 = YoucookDset2()

In [41]:
len(youcookdata2)

10337

In [44]:
youcookdata2[0][0].shape,youcookdata2[0][1].shape

((499, 512), (768,))

In [45]:
youcookdata[0][0].shape,youcookdata[0][1].shape

((499, 512), (768,))

In [104]:
vid_emb,txt_emb,label = youcookdata[0]

In [57]:
txt_emb.shape,label.shape

((768,), (2,))

In [34]:
youcookvld = YoucookDset(split='val')

dict_keys(['database'])


In [89]:
len(youcookvld)

3492

In [92]:
youcookdl = DataLoader(youcookdata,batch_size=32,shuffle=True)

youcookvld_dl = DataLoader(youcookvld,batch_size=64,shuffle=False)

In [106]:
trn_feat,trn_wemb,trn_labels = next(iter(youcookdl))

In [None]:
trn_feat.shape,trn_wemb.shape,trn_labels.shape
trn_wemb.unsqueeze_(1)


In [None]:
trn_feat.transpose_(1,2)

In [None]:
trn_wemb.transpose_(1,2)

In [110]:
trn_feat.shape,trn_wemb.shape

(torch.Size([32, 512, 499]), torch.Size([32, 768, 1]))

In [111]:
wrdcnn.weight.dtype,trn_feat.dtype

(torch.float32, torch.float64)

In [112]:
wrdcnn = nn.Conv1d(768, 100, 1, stride=1)
vidcnn = nn.Conv1d(512,100,1,stride=1)

trn_feat_red = vidcnn(trn_feat.float())
trn_wemb_red = wrdcnn(trn_wemb.float())

In [None]:
trn_feat_red.transpose_(1,2),trn_wemb_red.transpose_(1,2)

In [118]:
trn_feat_red.shape

torch.Size([32, 499, 100])

In [117]:
trn_wemb_red.shape

torch.Size([32, 1, 100])

In [120]:
mlhattn = MultiHeadAttention(100,10,dropout_rate=0.0)

In [121]:
out,attn = mlhattn(trn_wemb_red,trn_feat_red,trn_feat_red)

In [122]:
out.shape

torch.Size([32, 1, 100])

In [125]:
total_params(mlhattn)

40400

In [12]:
from argparse import Namespace
hparams = Namespace(
    edim = 100,
    attnhdim = 50,
    nheads = 10,
    wrdim = 768,
    vidim = 512,
    hdim = 30,
    dropoutp=0.1

)

In [30]:
class CrossattnModel(pl.LightningModule):
    def __init__(self,hparams):
        super().__init__()
        self.save_hyperparameters(hparams)
        #self.hparams = hparams
        #import pdb;pdb.set_trace()
        self.positioner = SinusoidalPositioner(self.hparams.edim, drop_rate=0., length_max=1000)
        self.attn = CrossAttentionLayer(self.hparams.edim,self.hparams.nheads,\
                           self.hparams.attnhdim,self.hparams.dropoutp)
        self.wrdcnn =  nn.Conv1d(self.hparams.wrdim, self.hparams.edim, 1, stride=1)
        self.vidcnn =  nn.Conv1d(self.hparams.vidim, self.hparams.edim, 1, stride=1)
        self.hid_layer = nn.Linear(self.hparams.edim,self.hparams.hdim)
        self.out_layer = nn.Linear(self.hparams.hdim,2)
        self.init_parameters_xavier_uniform()

    def forward(self,x):
        #keep this for inference
        out = self._model(x)
        return out
        
    def _model(self,x):
        vid_x,wrd_x = x
        #import pdb;pdb.set_trace()
        wrd_x = wrd_x.unsqueeze(1).transpose(1,2)
        vid_x = vid_x.transpose(1,2)
        #print(f"inside model, wrd_x:{wrd_x.shape},vi")
        tgt = self.wrdcnn(wrd_x.float()).transpose(1,2)
        src = self.vidcnn(vid_x.float()).transpose(1,2)
        src_posencode = self.positioner(src)
        attended,attn_score = self.attn(tgt,src_posencode)
        out = self.out_layer(F.relu(self.hid_layer(F.relu(attended))))
        return out


    def training_step(self,batch,batch_idx):
        #for tranining
        vid_feat,wrd_feat,labels = batch
        x_hat = self._model((vid_feat.float(),wrd_feat.float()))
        #import pdb;pdb.set_trace()
        loss = F.mse_loss(x_hat.squeeze().float(), labels.squeeze().float())
        #print(f"inside train step, loss:{loss}")
        self.log("train_loss",loss,on_step=True)
        return loss

    def validation_step(self,batch,batch_idx):
        #for validation
        vid_feat,wrd_feat,labels = batch
        x_hat = self._model((vid_feat.float(),wrd_feat.float()))
        #import pdb;pdb.set_trace()
        loss = F.mse_loss(x_hat.squeeze().float(), labels.squeeze().float())
        #print(f"inside train step, loss:{loss}")
        self.log("val_loss",loss,on_step=False, on_epoch=True)
        return loss


    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    
    def init_parameters_xavier_uniform(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)




In [13]:
model = CrossattnModel(hparams)

In [14]:
log_dir = '/common/home/vk405/Projects/Crossmdl/nbs/lightning_logs/'

In [15]:
youcookdata2 = YoucookDset2()
youcookvld2 = YoucookDset2(split='val')

In [241]:
len(youcookdata2)

10337

In [22]:
import torch.utils.data as data_utils
indices = torch.arange(1000)
youcookdata2subset = data_utils.Subset(youcookdata2, indices)
youcookdl = DataLoader(youcookdata2subset,batch_size=64,shuffle=True,num_workers=10)


In [23]:

with torch.no_grad():
    out = []
    labels = []
    for batch in youcookdl:
        x_vid,x_wemb,label = batch
        labels.append(label)
        predictions = model((x_vid,x_wemb))
        p = predictions.squeeze().detach().cpu().numpy()
        out.append(p)


In [26]:
# initial error

preds = np.concatenate(out,0)
true_labels = np.concatenate(labels,0)


In [29]:
#
np.mean(np.sum((preds-true_labels)**2,1))

34.361347

In [31]:
from pytorch_lightning.loggers import CSVLogger
version = '4'
csvlogger = CSVLogger(log_dir,version)
trainer = pl.Trainer(logger= csvlogger,\
    gpus=1,max_epochs=15)
#youcoodvld_dl = DataLoader(youcookvld2,batch_size=64,shuffle=False,num_workers=10)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [32]:
trainer.fit(model,youcookdl)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name       | Type                 | Params
----------------------------------------------------
0 | positioner | SinusoidalPositioner | 0     
1 | attn       | CrossAttentionLayer  | 51.0 K
2 | wrdcnn     | Conv1d               | 76.9 K
3 | vidcnn     | Conv1d               | 51.3 K
4 | hid_layer  | Linear               | 3.0 K 
5 | out_layer  | Linear               | 62    
----------------------------------------------------
182 K     Trainable params
0         Non-trainable params
182 K     Total params
0.729     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 14: 100%|██████████| 16/16 [00:19<00:00,  1.21s/it, loss=0.0672, v_num=0]


In [33]:
!ls /common/home/vk405/Projects/Crossmdl/nbs/lightning_logs/4/version_0/checkpoints

'epoch=14-step=239.ckpt'


In [34]:
ckpt = torch.load("/common/home/vk405/Projects/Crossmdl/nbs/lightning_logs/4/version_0/checkpoints/epoch=14-step=239.ckpt")
model.load_state_dict(ckpt["state_dict"])

<All keys matched successfully>

In [35]:

with torch.no_grad():
    out = []
    labels = []
    for batch in youcookdl:
        x_vid,x_wemb,label = batch
        labels.append(label)
        predictions = model((x_vid,x_wemb))
        p = predictions.squeeze().detach().cpu().numpy()
        out.append(p)


In [36]:

# final error

preds = np.concatenate(out,0)
true_labels = np.concatenate(labels,0)


In [38]:
np.sum((preds-true_labels)**2)/(2*len(preds))

0.06767554473876954

In [41]:
true_labels

array([[0.78957915, 0.82965934],
       [0.45691383, 0.47895792],
       [0.70541084, 0.7875751 ],
       ...,
       [0.4228457 , 0.45891783],
       [0.81963927, 0.83967936],
       [0.58717436, 0.6192385 ]], dtype=float32)

In [42]:
preds

array([[0.4876294 , 0.55848485],
       [0.8712342 , 0.55577457],
       [0.24616435, 0.48584783],
       ...,
       [0.5521641 , 0.5670044 ],
       [0.5802253 , 0.3969652 ],
       [0.70115817, 0.5429062 ]], dtype=float32)

In [43]:
def compare(true_bounds,pred_bounds):
    matches = []
    for ind,(tbnds,pbnds) in enumerate(zip(true_bounds,pred_bounds)):
        tst,tend = tbnds
        if tst<=np.mean(pbnds)<tend:
            matches.append(ind)
    return matches

In [44]:
get_bounds = lambda x : np.round(x*499)

In [46]:
matches = compare(get_bounds(true_labels),get_bounds(preds))

In [57]:
np.mean(np.mean(true_labels,axis=1))

0.51897794

In [89]:
import math
def get_acc(true_bounds,pred_bounds):
    accs = []
    def get_preds(x):
        x = np.clip(x,0,499)
        if x[1]<x[0]:
            return None
        else:
            return np.concatenate([np.zeros(x[0]),np.ones(x[1]-x[0]+1),np.zeros(499-x[1])])
    for ind,(tbnds,pbnds) in enumerate(zip(true_bounds,pred_bounds)):
        tbnds = tbnds.astype('int')
        pbnds = pbnds.astype('int')
        #import pdb;pdb.set_trace()
    
        pcls = get_preds(pbnds)
        tcls = get_preds(tbnds)
        if isinstance(pcls,np.ndarray) and isinstance(tcls,np.ndarray):
            accs.append(np.mean(tcls==pcls))
    return np.mean(accs)
        
    

In [90]:
get_acc(get_bounds(true_labels),get_bounds(preds))

0.8094162348877375

In [62]:
np.zeros(4),np.ones(1)

array([1., 1., 1., 1.])

In [51]:
preds[17]

array([0.5242556, 0.5321824], dtype=float32)

In [168]:
import pandas as pd
logfile = pd.read_csv('/common/home/vk405/Projects/Crossmdl/nbs/lightning_logs/2/version_0/metrics.csv')

In [233]:
logfile['train_loss'].dropna()

0     0.324028
1     0.135907
2     0.130756
4     0.079497
5     0.096190
6     0.058250
8     0.079140
9     0.070054
10    0.051419
12    0.061788
13    0.046083
14    0.059169
16    0.073177
17    0.056551
18    0.066284
19    0.057991
21    0.054277
22    0.047977
23    0.042487
25    0.054313
26    0.050810
27    0.056804
29    0.056412
30    0.050896
31    0.058703
33    0.056551
34    0.048747
35    0.047593
36    0.049425
38    0.052489
39    0.049198
40    0.047924
Name: train_loss, dtype: float64

In [170]:
logfile['val_loss'].dropna()

3     0.077299
7     0.061836
11    0.056311
15    0.055417
20    0.054449
24    0.052903
28    0.051294
32    0.049883
37    0.047721
41    0.048531
Name: val_loss, dtype: float64

In [171]:
logfile['epoch'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [92]:

with torch.no_grad():
    out = []
    for batch in youcoodvld_dl:
        x_vid,x_wemb,label = batch
        predictions = model((x_vid,x_wemb))
        p = predictions.squeeze().detach().cpu().numpy()
        out.append(p)


In [93]:
preds = np.concatenate(out,0)

In [129]:
preds

array([[0.37862328, 0.42747056],
       [0.47637266, 0.5185708 ],
       [0.4351844 , 0.48621053],
       ...,
       [0.2904613 , 0.33066776],
       [0.53125274, 0.5993044 ],
       [0.59103453, 0.6241372 ]], dtype=float32)

In [130]:
pred_bounds = youcookvld2.labelencoder.decode(preds)

In [133]:
len(pred_bounds.mean(axis=1))

3492

In [134]:
true_bounds = youcookvld2.labelencoder.truebounds

In [141]:
def compare(true_bounds,pred_bounds):
    matches = []
    for ind,(tbnds,pbnds) in enumerate(zip(true_bounds,pred_bounds)):
        tst,tend = tbnds
        if tst<=np.mean(pbnds)<tend:
            matches.append(ind)
    return matches

In [143]:
out = compare(true_bounds,pred_bounds)

In [210]:
youcookdata2 = YoucookDset2()
youcookdltrn = DataLoader(youcookdata2,batch_size=64,shuffle=False,num_workers=10)



In [199]:
youcookdata2val = YoucookDset2(split='val')
youcookdlval = DataLoader(youcookdata2val,batch_size=64,shuffle=False,num_workers=10)



In [None]:
youcookdata.labelencoder.decode(np.array[[]])

In [234]:
with torch.no_grad():
    out = []
    labels = []
    for batch in youcookdltrn:
        x_vid,x_wemb,label = batch
        labels.append(label)
        predictions = model((x_vid,x_wemb))
        p = predictions.squeeze().detach().cpu().numpy()
        out.append(p)

preds = np.concatenate(out,0)



pred_bounds = youcookdata2.labelencoder.decode(preds)


In [212]:
true_bounds = youcookdata2.labelencoder.decode(np.concatenate(labels,axis=0))


In [213]:
pred_bounds

array([[148., 180.],
       [178., 209.],
       [207., 229.],
       ...,
       [214., 268.],
       [177., 234.],
       [234., 270.]], dtype=float32)

In [214]:
true_bounds

array([[ 98., 102.],
       [105., 162.],
       [182., 200.],
       ...,
       [290., 317.],
       [318., 384.],
       [424., 451.]], dtype=float32)

In [217]:
labels

[tensor([[0.1964, 0.2044],
         [0.2104, 0.3246],
         [0.3647, 0.4008],
         [0.4309, 0.4910],
         [0.4950, 0.5050],
         [0.5070, 0.5631],
         [0.6473, 0.7014],
         [0.7796, 0.8196],
         [0.8337, 0.8677],
         [0.9339, 0.9399],
         [0.0521, 0.1543],
         [0.1563, 0.2144],
         [0.2505, 0.3587],
         [0.3607, 0.4369],
         [0.4449, 0.5150],
         [0.5230, 0.5571],
         [0.5651, 0.5952],
         [0.6954, 0.7635],
         [0.7996, 0.8236],
         [0.3026, 0.3447],
         [0.3467, 0.3808],
         [0.3848, 0.4088],
         [0.4108, 0.4509],
         [0.4649, 0.5130],
         [0.5271, 0.5852],
         [0.6313, 0.6653],
         [0.6733, 0.7335],
         [0.7475, 0.7976],
         [0.8056, 0.8136],
         [0.0862, 0.2585],
         [0.3066, 0.3367],
         [0.3667, 0.4088],
         [0.4489, 0.5030],
         [0.5531, 0.6072],
         [0.2585, 0.2665],
         [0.2786, 0.2946],
         [0.3287, 0.3567],
 

In [223]:
preds.shape

(10337, 2)

In [235]:
labels = np.concatenate(labels,axis=0)

In [237]:
np.mean(np.abs(np.sum(preds-labels**2,axis=1)))

0.4673907

In [209]:
len(out)

378

In [187]:
tbnds

array([[ 98., 180.],
       [144., 180.],
       [203., 254.],
       ...,
       [211., 252.],
       [260., 261.],
       [244., 265.]], dtype=float32)

In [189]:
pred_bounds

array([[ 98., 180.],
       [144., 180.],
       [203., 254.],
       ...,
       [211., 252.],
       [260., 261.],
       [244., 265.]], dtype=float32)

In [None]:
out = compare(true_bounds,pred_bounds)

In [149]:
len(true_bounds)

10337

In [24]:
batch = next(iter(youcookdl))

In [25]:
#trainer.fit(model,youcookdl)
x_vid,x_wemb,label = batch

In [27]:
predictions = model((x_vid,x_wemb))

In [29]:
predictions.shape

torch.Size([64, 1, 2])