In [1]:
# pad your sequences

from torch.nn.utils.rnn import pad_sequence
from pytorch_lightning.callbacks import LearningRateMonitor
import torch
import numpy as np
from PIL import Image
import os
from collections import defaultdict
import json
import joblib
from torch.utils.data import Dataset,DataLoader,random_split
from itertools import repeat
import pandas as pd
import math
import torch.nn as nn
import torch.nn.functional as F
from numpy import linalg as LA
from argparse import Namespace
from numpy import genfromtxt
import os
from torch.optim.lr_scheduler import ReduceLROnPlateau

import pytorch_lightning as pl
import wandb
import logging
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger, WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import clip


import wandb
import logging
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger, WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import os

logger = logging.getLogger(__name__)
def wandb_logger(dir, version):
    """Return a WandbLogger named "wandb" that saves under `dir` for `version`."""
    return WandbLogger(name="wandb", save_dir=dir, version=version)


def csvlogger(dir, version):
    """Return a CSVLogger rooted at `dir`, named "csvlogs", for `version`."""
    return CSVLogger(dir, name="csvlogs", version=version)


def tblogger(dir, version):
    """Return a TensorBoardLogger rooted at `dir`, named "tblogs", for `version`."""
    return TensorBoardLogger(dir, name="tblogs", version=version)

def get_loggers(dir, version, lis=None):
    """Instantiate the experiment loggers selected by `lis`.

    Args:
        dir: Save directory handed to every logger.
        version: Version label handed to every logger.
        lis: Collection of logger keys. Recognised keys are "wandb", "csv"
            and "tb"; any other key is ignored. ``None`` (the default)
            selects ``["csv"]``.

    Returns:
        List of logger instances, always ordered wandb, csv, tb regardless
        of the order of the keys in `lis`.
    """
    # A None sentinel replaces the former mutable list default so the default
    # value can never be shared (and accidentally mutated) across calls.
    if lis is None:
        lis = ["csv"]

    lgrs = []
    if "wandb" in lis:
        lgrs.append(wandb_logger(dir, version))
    if "csv" in lis:
        lgrs.append(csvlogger(dir, version))
    if "tb" in lis:
        lgrs.append(tblogger(dir, version))
    return lgrs







def get_vid_ids(split='training',
    annotns_file='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/annotations/youcookii_annotations_trainval.json'):
    """Return the video ids whose annotated subset equals `split`.

    Args:
        split: Subset name to select, e.g. 'training' or 'validation'.
        annotns_file: Path to the annotation JSON; its top-level 'database'
            entry maps each video id to a record with a 'subset' field.

    Returns:
        List of matching video ids, in the order they appear in the file.
    """
    with open(annotns_file) as handle:
        database = json.load(handle)['database']
    return [vid for vid in database if database[vid]['subset'] == split]


def get_split_files(split='training',
    annotns_file='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/annotations/youcookii_annotations_trainval.json',
        data_dir = '/common/users/vk405/Youcook/'):
    """Collect frame directories, segments and sentences for one split.

    Args:
        split: Annotation subset to select ('training' / 'validation').
        annotns_file: Path to the annotation JSON read by `get_vid_ids`.
        data_dir: Directory holding one sub-directory per video id plus the
            ``<id>global_segs.joblib`` / ``<id>global_sents.joblib`` files.

    Returns:
        Tuple ``(vid_locs, segs, sents, incomplete)``:
        ``vid_locs`` is a list of ``<data_dir>/<id>/`` strings (trailing
        slash kept), ``segs`` and ``sents`` map video id to the entry stored
        under that id in the corresponding joblib file, and ``incomplete``
        lists ids whose directory holds fewer than 495 entries.
        A video appears in ``vid_locs`` only if it also appears in both
        ``segs`` and ``sents``.
    """
    total_ids = get_vid_ids(split, annotns_file)
    # Directory entries without 'joblib' in their name are the per-video folders.
    downloaded_ids = {entry for entry in os.listdir(data_dir) if 'joblib' not in entry}

    vid_locs = []
    sents = {}
    segs = {}
    incomplete = []
    for vid_id in total_ids:
        if vid_id not in downloaded_ids:
            continue

        # Trailing '' keeps the final slash that callers rely on when they
        # recover the id with ``loc.split('/')[-2]``.
        vid_loc = os.path.join(data_dir, vid_id, '')
        # NOTE(review): 495 presumably tolerates a few missing frames out of
        # the 500 expected per video — confirm.
        if len(os.listdir(vid_loc)) < 495:
            incomplete.append(vid_id)
            continue

        seg = joblib.load(os.path.join(data_dir, f'{vid_id}global_segs.joblib'))
        sent = joblib.load(os.path.join(data_dir, f'{vid_id}global_sents.joblib'))
        try:
            vid_sent = sent[vid_id]
            vid_seg = seg[vid_id]
        except (LookupError, TypeError):
            # Only lookup failures are tolerated; anything else propagates.
            print(f"{vid_id} is no corresponding global sent/seg")
            continue

        # Record the video only once both lookups succeeded so the three
        # outputs never disagree about which ids are usable.
        vid_locs.append(vid_loc)
        sents[vid_id] = vid_sent
        segs[vid_id] = vid_seg
    return vid_locs, segs, sents, incomplete



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pathlib

# Root of the precomputed feature files; Dset reads
# `<FEAT_DIR>/<split>/vid_<id>.joblib` and `txt_<id>.joblib` from it.
FEAT_DIR = pathlib.Path('/common/users/vk405/CLIP_FEAT')
# Root of the extracted frames; Dset lists `<RAWFRAME_DIR>/<id>` to derive segment labels.
RAWFRAME_DIR = pathlib.Path('/common/users/vk405/Youcook/')

class Dset(Dataset):
    """Sentence-to-segment samples built from precomputed feature files.

    Every item corresponds to one (video, segment index) pair and yields the
    per-frame similarity between the video features and the segment's text
    feature, plus the segment's start and end frame indices divided by 499.

    Args:
        data_dir: Path-like root (must support the ``/`` operator) holding one
            sub-directory of frame files per video id.
        feat_dir: Path-like root holding ``<split>/vid_<id>.joblib`` and
            ``<split>/txt_<id>.joblib``.
        split: Split name; selects both the feature sub-directory and the
            annotation subset.

    Note:
        `load` hard-codes 500 frames per video and 512-element text features
        (see the ``reshape`` calls there).
    """

    # Annotation file used by `get_ids` to enumerate the videos of a split.
    ANNOTNS_FILE = '/common/home/vk405/Projects/Crossmdl/Data/YouCookII/annotations/youcookii_annotations_trainval.json'

    def __init__(self,data_dir,feat_dir,split):
        self.data_dir = data_dir
        self.feat_dir = feat_dir
        self.split = split
        self.vid_ids,self.sents = self.get_ids()
        self.labels = self.getlabels()
        # Must run before getdata(): it prunes self.vid_ids.
        self.sanitycheck()
        self.data = self.getdata()

    def sanitycheck(self):
        """Drop videos whose segment, sentence and text-feature counts differ.

        Prints each offending id and the full list, removes those ids from
        ``self.vid_ids`` and finally releases ``self.sents`` (set to None).
        """
        mis = []
        for key in self.labels:
            txt = joblib.load(self.feat_dir/self.split/f'txt_{key}.joblib')
            counts_agree = len(self.labels[key]) == len(self.sents[key]) == len(txt)
            if not counts_agree:
                print(key)
                mis.append(key)
        print(f"segs are not matching:{mis}")
        for key in mis:
            self.vid_ids.remove(key)
        # The sentences are only needed for the check above.
        self.sents = None

    def __len__(self):
        """Return the number of (video, segment) samples."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return ``(similarity, start/499, end/499)`` for sample `idx`."""
        return self.load(self.data[idx])

    def getdata(self):
        """Return the flat list of ``(video_id, segment_index)`` samples."""
        return [
            (vid_id, seg_idx)
            for vid_id in self.vid_ids
            for seg_idx in range(len(self.labels[vid_id]))
        ]

    def load(self,data):
        """Load one sample.

        Args:
            data: ``(video_id, segment_index)`` tuple.

        Returns:
            ``(out, st/499, end/499)`` where ``out`` is the squeezed product
            of the row-normalized video features with the normalized text
            feature, i.e. one cosine similarity per frame.

        Raises:
            Whatever the label lookup, ``joblib.load`` or the indexing of the
            text features raises; errors are no longer trapped in a debugger,
            which would hang non-interactive runs.
        """
        vid_id,ind = data
        split_dir = self.feat_dir/self.split
        st,end = self.labels[vid_id][ind]
        vid = joblib.load(split_dir/f'vid_{vid_id}.joblib')
        txt = joblib.load(split_dir/f'txt_{vid_id}.joblib')[ind]
        # Unit-normalize each frame feature; the reshape requires exactly 500 rows.
        vid = vid/(LA.norm(vid,axis=-1)).reshape(500,1)
        txt = (txt/LA.norm(txt))
        # The reshape requires a 512-element text feature.
        out = np.squeeze(vid@txt.reshape(512,1))
        # Regression targets: frame indices scaled by the last index (499).
        return out,st/499,end/499

    def getlabels(self):
        """Return ``{video_id: [(first_frame, last_frame), ...]}`` for all ids."""
        return {
            vidid: self.extract_seg(self.data_dir/vidid)
            for vidid in self.vid_ids
        }

    def extract_seg(self,vid_loc):
        """Derive segment boundaries from the file names in `vid_loc`.

        The text before the first '_' of a file name is parsed as the frame
        index; the text after the last '_' carries the segment tag. Files
        whose tag contains 'n.' are skipped (presumably frames outside any
        annotated segment — confirm); otherwise the tag's part before '.' is
        parsed as the integer segment id.

        Returns:
            List of ``(min_frame_index, max_frame_index)``, one per segment
            id, ordered by ascending segment id.
        """
        imgs = sorted(os.listdir(vid_loc),key=lambda x: int(x.split('_')[0]))
        segs = defaultdict(list)
        for img in imgs:
            parts = img.split('_')
            ind,rem = int(parts[0]),parts[-1]
            if 'n.' not in rem:
                seg_id = int(rem.split('.')[0])
                segs[seg_id].append(ind)
        return [(min(segs[segid]),max(segs[segid])) for segid in sorted(segs)]

    def get_ids(self):
        """Return the usable video ids and the sentences of this split.

        Uses ``self.split`` and ``self.data_dir`` (rather than hard-coded
        values) so that a non-training split is looked up consistently.
        Prints the ids that have frames but no ``vid_<id>.joblib`` feature.

        Returns:
            ``(ids, sents)``: ids with a video feature file, and the sentence
            mapping returned by `get_split_files`.
        """
        # get_split_files expects a string directory with a trailing slash.
        data_dir = os.path.join(str(self.data_dir), '')
        vid_locs,_,sents,_ = get_split_files(self.split,self.ANNOTNS_FILE,data_dir)
        ids = [ele.split('/')[-2] for ele in vid_locs]
        files = set(os.listdir(self.feat_dir/self.split))
        finids = []
        missing = []
        for vid_id in ids:
            if f'vid_{vid_id}.joblib' in files:
                finids.append(vid_id)
            else:
                missing.append(vid_id)
        print(f"missing:{missing}")
        return finids,sents

        

In [16]:
# annotns_file='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/annotations/youcookii_annotations_trainval.json'
# data_dir = '/common/users/vk405/Youcook/'
# vid_locs,_,sents,_ = get_split_files('training',annotns_file,data_dir)
# mis = []
# for key in d.labels:
#     if len(d.labels[key]) == len(sents[key]):
#         pass
#     else:
#         print(key)
#         mis.append(key)


In [3]:
# Build the training dataset; construction prints the ids without a video
# feature file and the ids whose segment/sentence counts do not match.
d  = Dset(RAWFRAME_DIR,FEAT_DIR,'training')

missing:['ukfCQQpZ0k4', 'NK2xHVWojgY', 'mixdagZ-fwI']
cwsDQ7M5OTI
uf65nfh6X2U
segs are not matching:['cwsDQ7M5OTI', 'uf65nfh6X2U']


In [9]:
#d.vid_ids

In [4]:
# First sample: (per-frame similarity vector, start/499, end/499).
out,st,end = d[0]

In [8]:
# Display the normalized end position of the first sample.
end

0.42084168336673344

In [30]:
# Random 80/20 train/validation split of the dataset (unseeded).
trn_sz = int(len(d)*0.8)
val_sz = len(d)-trn_sz
trndset,valdset = random_split(d,[trn_sz,val_sz])

In [17]:
class BaselineModel(pl.LightningModule):
    """MLP that regresses a temporal interval from a 500-element input.

    The network outputs two sigmoid-activated values per sample: the interval
    start and the interval length. Training minimizes the negative
    generalized IoU (GIoU) between the predicted and ground-truth intervals.

    Args:
        hparams: Hyper-parameters saved via ``save_hyperparameters``; must
            provide ``lr`` (read by `configure_optimizers`).
    """

    def __init__(self,hparams):
        super().__init__()
        self.save_hyperparameters(hparams)
        self.shared = nn.Sequential(
            nn.Linear(500,250),nn.ReLU(),
            nn.Linear(250,125),nn.ReLU(),
            nn.Linear(125,2),nn.Sigmoid(),
        )

    def forward(self,x):
        """Return the two sigmoid outputs (start, length) for input `x`."""
        return self.shared(x)

    def giou(self,p,g,eps=1e-8):
        """Compute 1-D generalized IoU and IoU between interval batches.

        Args:
            p: Predicted intervals; reduced along dim 1, so the two endpoints
                may come in any order (callers pass shape ``(batch, 2)``).
            g: Ground-truth intervals, same layout as `p`.
            eps: Lower bound applied to the denominators so degenerate
                (zero-length) inputs yield finite values instead of NaN.

        Returns:
            ``(giou, iou)``, one value per sample. ``iou`` lies in [0, 1]:
            the overlap is clamped at zero, so disjoint intervals report 0
            rather than a negative ratio. The GIoU value is unaffected by
            the clamp, because for disjoint intervals the gap is charged
            through the enclosing-interval term instead.
        """
        x1_p,_ = torch.min(p,1)
        x2_p,_ = torch.max(p,1)

        x1_g,_ = torch.min(g,1)
        x2_g,_ = torch.max(g,1)

        # Intersection endpoints: latest start, earliest end.
        x_1_i,_ = torch.max(torch.stack([x1_g,x1_p],1),1)
        x_2_i,_ = torch.min(torch.stack([x2_g,x2_p],1),1)

        # Smallest enclosing interval: earliest start, latest end.
        x_1_c,_ = torch.min(torch.stack([x1_p,x1_g],1),1)
        x_2_c,_ = torch.max(torch.stack([x2_p,x2_g],1),1)

        # A negative raw overlap means the intervals are disjoint.
        I = torch.clamp(x_2_i - x_1_i,min=0)
        U = (x2_p-x1_p) + (x2_g-x1_g) - I
        AC = x_2_c-x_1_c

        iou = I/U.clamp(min=eps)
        giou = iou - (AC-U)/AC.clamp(min=eps)
        return giou,iou

    def _shared_step(self,batch,stage):
        """Run the forward pass, log ``<stage>_loss`` / ``<stage>_iou``, return the loss."""
        feats,st,end = batch
        preds = self(feats)
        st_p = preds[:,0]
        diff = preds[:,-1]
        # Predicted interval is (start, start + length).
        fin_pred = torch.stack([st_p,st_p+diff],-1)
        # reshape(-1) keeps a 1-D tensor even for a batch of one sample, where
        # squeeze() would produce a 0-d tensor and break the stack along dim 1.
        grounds = torch.stack([st.reshape(-1),end.reshape(-1)],1)
        giou,iou = self.giou(fin_pred,grounds)
        loss = torch.mean(-1*giou)

        self.log(f"{stage}_loss",loss,on_step=True)
        self.log(f"{stage}_iou",torch.mean(iou),on_step=True)
        return loss

    def training_step(self,batch,batch_idx):
        """Return the negative mean GIoU of the batch; logs train_loss / train_iou."""
        return self._shared_step(batch,"train")

    def validation_step(self,batch,batch_idx):
        """Return the negative mean GIoU of the batch; logs val_loss / val_iou."""
        return self._shared_step(batch,"val")

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with ``hparams.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

        

In [10]:
# Note: cross-entropy loss is not optimal here because it does not take the
# implicit ordering into account.
# Rebuild the dataset, split it 80/20 at random and wrap both parts in
# DataLoaders with batch size 64 (only the training loader shuffles).
d = Dset(RAWFRAME_DIR,FEAT_DIR,'training')
trn_sz = int(len(d)*0.8)
val_sz = len(d)-trn_sz
trndset,valdset = random_split(d,[trn_sz,val_sz])
trnl = DataLoader(trndset,batch_size=64,shuffle=True)
vall = DataLoader(valdset,batch_size=64)

missing:['ukfCQQpZ0k4', 'NK2xHVWojgY', 'mixdagZ-fwI']
cwsDQ7M5OTI
uf65nfh6X2U
segs are not matching:['cwsDQ7M5OTI', 'uf65nfh6X2U']


In [50]:
#batch = next(iter(vall))
#for data in d:
    #pass

False

In [18]:
# Minimal hyper-parameters for a manual smoke test; BaselineModel only reads `lr`.
hparams = Namespace(
    lr = 1e-4
)

model = BaselineModel(hparams)
# Display the module structure.
model

BaselineModel(
  (shared): Sequential(
    (0): Linear(in_features=500, out_features=250, bias=True)
    (1): ReLU()
    (2): Linear(in_features=250, out_features=125, bias=True)
    (3): ReLU()
    (4): Linear(in_features=125, out_features=2, bias=True)
    (5): Sigmoid()
  )
)

In [12]:
# Pull a single training batch for the manual checks below.
batch = next(iter(trnl))

In [19]:
# Unpack the batch in the order Dset yields it: (similarity vectors, start, end).
feats,st,end = batch

# Forward pass; column 0 and the last column are used as start and length below.
preds = model(feats)


In [20]:
# Manual replay of BaselineModel.training_step on one batch (without logging).
st_p = preds[:,0]
diff = preds[:,-1]
# Predicted interval is (start, start + length).
fin_pred = torch.stack([st_p,st_p+diff],-1)
grounds = torch.stack([torch.squeeze(st),torch.squeeze(end)],1)
giou,iou = model.giou(fin_pred,grounds)
# Loss is the negative mean GIoU.
loss = torch.mean(-1*giou)

In [21]:
# Display the loss computed for the sampled batch.
loss

tensor(0.0925, dtype=torch.float64, grad_fn=<MeanBackward0>)

In [34]:
# first_layer = []
# for name,param in model.named_parameters():
#     print(name)
#     print(first_layer.append(param.cpu().detach().numpy()))
#     break



shared.0.weight
None


In [8]:
def run(cfg):
    """Train `BaselineModel` as described by `cfg`.

    Args:
        cfg: Namespace-like configuration. Always read: ``artifacts_loc``,
            ``version``, ``loggers``, ``cbs`` and ``mode``. Read when the
            matching key is in ``cbs``: ``early_stop`` and ``checkpoint``
            (keyword dicts for the callbacks). Read in train mode:
            ``RAWFRAME_DIR``, ``FEAT_DIR``, ``split``, ``trn_split`` and
            ``trainer`` (extra ``pl.Trainer`` keywords). Optional:
            ``batch_size`` (default 64) and ``devices`` (default ``[0, 4]``),
            which replace the former hard-coded values. `cfg` itself is
            passed to the model as its hyper-parameters.

    Returns:
        The ``pl.Trainer`` after fitting when ``cfg.mode == 'train'``,
        otherwise ``None``.
    """
    #pl.seed_everything(cfg.seed)
    artifacts_dir = cfg.artifacts_loc
    version = str(cfg.version)
    logger_list = get_loggers(artifacts_dir, version, cfg.loggers)

    cbs = []
    if "early_stop" in cfg.cbs:
        # NOTE: per the original author, early stopping does not really work at the moment.
        cbs.append(EarlyStopping(**cfg.early_stop, min_delta=0.00, verbose=False))
    if "checkpoint" in cfg.cbs:
        store_path = os.path.join(artifacts_dir, "ckpts", version, "")
        # exist_ok avoids the check-then-create race when several processes
        # start at the same time.
        os.makedirs(store_path, exist_ok=True)
        fname = "{epoch}-{train_loss:.2f}"
        cbs.append(
            ModelCheckpoint(**cfg.checkpoint, dirpath=store_path, filename=fname, save_top_k=3)
        )

    #wandb.init(project="videoretrieval", config=cfg)
    if cfg.mode != 'train':
        return None

    batch_size = getattr(cfg, 'batch_size', 64)
    devices = getattr(cfg, 'devices', [0, 4])

    dataset = Dset(cfg.RAWFRAME_DIR, cfg.FEAT_DIR, cfg.split)
    trn_sz = int(len(dataset) * cfg.trn_split)
    val_sz = len(dataset) - trn_sz
    trndset, valdset = random_split(dataset, [trn_sz, val_sz])
    trnl = DataLoader(trndset, batch_size=batch_size, shuffle=True)
    vall = DataLoader(valdset, batch_size=batch_size)

    net = BaselineModel(cfg)
    trainer = pl.Trainer(
        logger=logger_list, callbacks=cbs, accelerator='gpu', devices=devices,
        deterministic=True, **cfg.trainer
    )
    trainer.fit(net, trnl, vall)
    return trainer
    

In [9]:
from argparse import Namespace
FEAT_DIR = pathlib.Path('/common/users/vk405/CLIP_FEAT')
RAWFRAME_DIR = pathlib.Path('/common/users/vk405/Youcook/')

# Experiment configuration consumed by run(); run() also hands it to
# BaselineModel as the hyper-parameters.
cfg = Namespace(
    version = 'clip',  # logger version and checkpoint sub-directory name
    id = 0,  # not read by run()
    FEAT_DIR = FEAT_DIR,
    RAWFRAME_DIR = RAWFRAME_DIR,
    artifacts_loc = "/common/home/vk405/Projects/Crossmdl/nbs/",  # logger and checkpoint root
    data_dir = "/common/home/vk405/Projects/Crossmdl/Data/YouCookII/",  # not read by run()
    trn_split = 0.8,  # fraction of samples used for training
    mode = 'train',  # run() only acts on 'train'
    split = 'training',
    loggers = ["csv"],  # keys understood by get_loggers: "wandb", "csv", "tb"
    seed = 0,  # currently unused: the seed_everything call in run() is commented out
    cbs = ["checkpoint","early_stop"],
    trainer = {'log_every_n_steps': 1,
    'max_epochs': 100},  # extra keyword arguments for pl.Trainer
    checkpoint = {"every_n_epochs": 1,
    "monitor": "val_loss"},  # keyword arguments for ModelCheckpoint
    early_stop = {"monitor":"val_loss","mode":"min","patience":5},  # keyword arguments for EarlyStopping
    lr = 1e-4  # Adam learning rate read by BaselineModel

)

In [1]:
#run(cfg)

In [None]:
#inference

class Inference():
    """Load similarity features and raw segment labels for inference.

    Args:
        data_dir: Root of the raw frame directories (stored, not read here).
        feat_dir: Path-like root (must support the ``/`` operator) holding
            ``<split>/vid_<id>.joblib`` and ``<split>/txt_<id>.joblib``.
        split: Name of the feature sub-directory.
        labels: Optional mapping ``video_id -> [(start, end), ...]``. `load`
            needs it; previously the attribute was never set, so `load`
            always failed with ``AttributeError``.
    """

    def __init__(self,data_dir,feat_dir,split,labels=None):
        self.data_dir = data_dir
        self.feat_dir = feat_dir
        self.split = split
        self.labels = labels

    def load(self,data):
        """Load one sample.

        Args:
            data: ``(video_id, segment_index)`` tuple.

        Returns:
            ``(out, st, end)`` where ``out`` is the squeezed product of the
            row-normalized video features with the normalized text feature
            and ``st`` / ``end`` are the label values as stored (not scaled).

        Raises:
            ValueError: If no ``labels`` mapping was supplied.
            Other errors from the label lookup, ``joblib.load`` or the text
            indexing propagate; they are no longer trapped in a debugger.
        """
        if self.labels is None:
            raise ValueError("Inference.load requires `labels`; pass labels=... to the constructor")
        vid_id,ind = data
        split_dir = self.feat_dir/self.split
        st,end = self.labels[vid_id][ind]
        vid = joblib.load(split_dir/f'vid_{vid_id}.joblib')
        txt = joblib.load(split_dir/f'txt_{vid_id}.joblib')[ind]
        # Unit-normalize each frame feature; the reshape requires exactly 500 rows.
        vid = vid/(LA.norm(vid,axis=-1)).reshape(500,1)
        txt = (txt/LA.norm(txt))
        # The reshape requires a 512-element text feature.
        out = np.squeeze(vid@txt.reshape(512,1))
        return out,st,end

In [41]:
# csvlogger = CSVLogger('/common/home/vk405/Projects/Crossmdl/nbs')
# trainer = pl.Trainer(
#     deterministic=True,log_every_n_steps =  1,
#     max_epochs= 10
#         )
# #trainer.fit(model, trnl,vall)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(

  | Name   | Type       | Params
--------------------------------------
0 | shared | Sequential | 125 K 
1 | start  | Linear     | 251   
2 | end    | Linear     | 251   
--------------------------------------
125 K     Trainable params
0         Non-trainable params
125 K     Total params
0.503     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


Validation sanity check:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


                                                                      

  rank_zero_warn(


Epoch 0:   0%|          | 0/144 [00:00<?, ?it/s] > [0;32m<ipython-input-39-d214a13063c5>[0m(20)[0;36mtraining_step[0;34m()[0m
[0;32m     18 [0;31m        [0;31m#loss_end = nn.CrossEntropyLoss()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m[0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 20 [0;31m        [0mst_l[0m [0;34m=[0m [0mloss_st[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0msqueeze[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mstart[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mshared[0m[0;34m([0m[0minput[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mfloat[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0mst[0m[0;34m.[0m[0mfloat[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     21 [0;31m        [0;31m#end_l = loss_end(self.end(input),end)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;3

BdbQuit: 

In [38]:
# #inference
# class Inference():
#     def __init__(self,data_dir,feat_dir,split):
#         self.data_dir = data_dir
#         self.feat_dir = feat_dir
#         self.split = split
        
#     def load(self,data):
#         vid_id,ind = data
#         vid_frames_loc = self.feat_dir/self.split/f'vid_{vid_id}.joblib'
#         txt_loc = self.feat_dir/self.split/f'txt_{vid_id}.joblib'
#         st,end = self.labels[vid_id][ind]
#         vid = joblib.load(vid_frames_loc)
#         try:
#             txt = joblib.load(txt_loc)[ind]
#         except:
#             import pdb;pdb.set_trace()
#         #normalize data
#         #import pdb;pdb.set_trace()
#         vid = vid/(LA.norm(vid,axis=-1)).reshape(500,1)
#         txt = (txt/LA.norm(txt))
#         out = np.squeeze(vid@txt.reshape(512,1))
#         #regression outputs
#         return out,st,end

In [None]:
#clip.load(#)

In [7]:
# annotns_file='/common/home/vk405/Projects/Crossmdl/Data/YouCookII/annotations/youcookii_annotations_trainval.json'
# data_dir = '/common/users/vk405/Youcook/'
# model_name = 'ViT-B/32'
# missing = ['ukfCQQpZ0k4', 'NK2xHVWojgY', 'mixdagZ-fwI']
# #useful_vids = missing
# from tqdm import tqdm
# model,preprocess = clip.load(model_name)
# model.eval().cuda()
# error_cnt = {}

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 15.74 GiB total capacity; 75.01 MiB already allocated; 15.69 MiB free; 82.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:

# vid_locs,_,sents,_ = get_split_files(split,annotns_file,data_dir)
# useful_vids  = [data_dir+ele+'/' for ele in missing]
# for vidloc in tqdm(useful_vids):
#     vid_id = vidloc.split('/')[-2]
#     save_loc_vid = store_dir+split+'/'+f'vid_{vid_id}.joblib'
#     save_loc_text = store_dir+split+'/'+f'txt_{vid_id}.joblib'
#     print(save_loc_text)
#     print(os.path.exists(save_loc_vid))
#     #import pdb;pdb.set_trace()
#     if not os.path.exists(save_loc_vid):
#         text_tokens = clip.tokenize(sents[vid_id]).cuda()

#         # with torch.no_grad():
#         #     text_features = model.encode_text(text_tokens).float()
#         #     joblib.dump(text_features.detach().cpu().numpy(),save_loc_text)
#         files = sorted(os.listdir(vidloc),key=lambda x:int(x.split('_')[0]))
#         if len(files) == 500:
#             imgs  = []
#             cnt = 0
#             for file in files:
#                 try:
#                     im = Image.open(vidloc+file)
#                     imgs.append(preprocess(im))
#                 except:
#                     # hoping here it wont be the first one
#                     cnt += 1
#                     imgs.append(imgs[-1])
#                     #import pdb;pdb.set_trace()
#             error_cnt[vid_id]  = cnt
                
#             image_input = torch.tensor(np.stack(imgs)).cuda()
#             #import pdb;pdb.set_trace()
#             im_emb = []
#             with torch.no_grad():
#                 # else can throw memory error
#                 text_features = model.encode_text(text_tokens).float()
#                 joblib.dump(text_features.detach().cpu().numpy(),save_loc_text)
#                 out1 = model.encode_image(image_input[:250]).float()
#                 out2 = model.encode_image(image_input[250:]).float()
#                 im_emb = torch.concat([out1,out2],dim=0)
#                 joblib.dump(im_emb.detach().cpu().numpy(),save_loc_vid)
#                 print(error_cnt)

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 15.74 GiB total capacity; 203.54 MiB already allocated; 4.69 MiB free; 224.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF