In [26]:
import pandas as pd
import numpy as np

df1 = pd.read_csv("train_split_1.csv")
df2 = pd.read_csv("train_split_2.csv")
df3 = pd.read_csv("train_split_3.csv")
df4 = pd.read_csv("train_split_4.csv")
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
df["video_id"] = df["video_id"].astype(np.int64)
train = pd.read_csv("train_with_captions.csv")
val = pd.read_csv("val_with_captions.csv")
test = pd.read_csv("test_with_captions.csv")

In [27]:
df_train = pd.merge(train, df, on='video_id')
df_val = pd.merge(val, df, on='video_id')
df_test = pd.merge(test, df, on='video_id')
len(train), len(df_train), len(val), len(df_val), len(test), len(df_test)

(34132, 34132, 4996, 4996, 8564, 8564)

In [28]:
df_train.to_csv("train_with_captions_actions.csv", index=False)
df_val.to_csv("val_with_captions_actions.csv", index=False)
df_test.to_csv("test_with_captions_actions.csv", index=False)

In [30]:
# sub_train = train[["video_id", "video_path", "caption"]]
df_train.columns

Index(['video_id', 'frame_count', 'width', 'height', 'question', 'answer',
       'qid', 'type', 'a0', 'a1', 'a2', 'a3', 'a4', 'a0_cand0', 'a0_cand1',
       'a0_cand2', 'a0_cand3', 'a0_cand4', 'a1_cand0', 'a1_cand1', 'a1_cand2',
       'a1_cand3', 'a1_cand4', 'a2_cand0', 'a2_cand1', 'a2_cand2', 'a2_cand3',
       'a2_cand4', 'a3_cand0', 'a3_cand1', 'a3_cand2', 'a3_cand3', 'a3_cand4',
       'a4_cand0', 'a4_cand1', 'a4_cand2', 'a4_cand3', 'a4_cand4', 'caption',
       'caption_confidence', 'video_path', 'result0', 'conf0', 'result1',
       'conf1', 'result2', 'conf2', 'result3', 'conf3', 'result4', 'conf4'],
      dtype='object')

In [15]:
import pandas as pd

train = pd.read_csv("train_with_captions_actions.csv")
val = pd.read_csv("val_with_captions_actions.csv")
test = pd.read_csv("test_with_captions_actions.csv")

train = train.drop_duplicates(subset=['video_id'], keep='first')
val = val.drop_duplicates(subset=['video_id'], keep='first')
test = test.drop_duplicates(subset=['video_id'], keep='first')

sub_train = train[["video_id", 'video_path', "caption", "result1"]]
sub_val = val[["video_id", 'video_path', "caption", "result1"]]
sub_test = test[["video_id", 'video_path', "caption", "result1"]]

sub_train["action_caption"] = sub_train["result1"]+". "+sub_train["caption"]
sub_val["action_caption"] = sub_val["result1"]+". "+sub_val["caption"]
sub_test["action_caption"] = sub_test["result1"]+". "+sub_test["caption"]

sub_train.to_csv("subtrain_with_captions_actions.csv", index=False)
sub_val.to_csv("subval_with_captions_actions.csv", index=False)
sub_test.to_csv("subtest_with_captions_actions.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_train["action_caption"] = sub_train["result1"]+". "+sub_train["caption"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_val["action_caption"] = sub_val["result1"]+". "+sub_val["caption"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_test["action_caption"] = sub_test["result1"]+". "+sub

In [16]:
len(sub_train), len(sub_val), len(sub_test)

(3870, 570, 1000)

In [5]:
import pandas as pd
import torch
from torch.utils import data
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel

class TextLoader(data.Dataset):
    def __init__(
        self,
        path: str
    ):
        self.path = path
        self.csv = pd.read_csv(self.path)
        self.num_answers = 5
        self.action_key = 'result0'
        self.description_key = 'caption'
        self.target_length = 60
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        example = self.csv.iloc[index]

        question = example["question"]

        answers = []

        for i in range(self.num_answers):
            answers.append(example[f'a{i}'])
        
        caption = example[self.description_key]
        action = example[self.action_key]

        text_reps = []
        for answer in answers:
            text_rep = [action, caption, question, answer]
            text_rep_test = " ".join(text_rep)
            text_rep_test = text_rep_test.split(" ")
            text_rep = " [SEP] ".join(text_rep)
            text_rep = "[CLS]" + " " + text_rep

            inputs = self.tokenizer(text_rep, return_tensors="pt")

            curr_tokens = inputs.input_ids.size()[-1]
            
            if  curr_tokens < self.target_length:
                diff = self.target_length - curr_tokens

                pad = ["[PAD]"]
                padding = " ".join(pad * diff)
                text_rep = text_rep + " " + padding
                text_reps.append(text_rep)
        
        return len(text_rep_test)

In [7]:
dataset = TextLoader('val_with_captions_actions.csv') 
loader = data.DataLoader(dataset, shuffle=False)

_max = 0
for i, batch in enumerate(tqdm(loader)):
    _max = max(_max, batch.item())
print(_max)

100%|██████████| 4996/4996 [00:13<00:00, 368.41it/s]

48





In [8]:
from tqdm import tqdm
import numpy as np

dataset = TextLoader('train_with_captions_actions.csv') 
loader = data.DataLoader(dataset, shuffle=False)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased") # device_map="auto", max_memory=max_memory_mapping
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

feat1 = []
feat2 = []

for i, batch in enumerate(tqdm(loader)):
    batch = map(lambda x: x[0], batch)
    batch = list(batch)
    inputs = tokenizer(batch, return_tensors="pt")
    inputs = inputs.to(model.device)
    outputs = model(**inputs)
    cls_token = outputs.pooler_output
    cls_token = cls_token.unsqueeze(0).detach().cpu()
    state = outputs.last_hidden_state
    state = state.unsqueeze(0).detach().cpu()
    if not len(feat1):
        feat1 = cls_token
    else:
        feat1 = torch.cat([feat1, cls_token], dim=0)
    if not len(feat2):
        feat2 = state
    else:
        feat2 = torch.cat([feat2, state], dim=0)
    del inputs, state, cls_token
print(feat1.shape)
print(feat2.shape)
torch.save(feat1, "bert_cls_feats.pt")
torch.save(feat2, "bert_all_feats.pt")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  2%|▏         | 731/34132 [00:37<28:40, 19.41it/s]


KeyboardInterrupt: 

: 

In [2]:
import torch
import pandas as pd
train, val, test = pd.read_csv("/mnt/adithya/moment_detr/csvs/train_with_captions_actions.csv"), pd.read_csv("/mnt/adithya/moment_detr/csvs/val_with_captions_actions.csv"), pd.read_csv("/mnt/adithya/moment_detr/csvs/test_with_captions_actions.csv")
train_ids = torch.from_numpy(train["video_id"].to_numpy())
val_ids = torch.from_numpy(val["video_id"].to_numpy())
test_ids = torch.from_numpy(test["video_id"].to_numpy())
torch.save(train_ids, "/mnt/adithya/action_caption_dataset/bert_feats/train_video_ids.pt")
torch.save(val_ids, "/mnt/adithya/action_caption_dataset/bert_feats/val_video_ids.pt")
torch.save(test_ids, "/mnt/adithya/action_caption_dataset/bert_feats/test_video_ids.pt")
print(train_ids.shape, val_ids.shape, test_ids.shape)

torch.Size([34132]) torch.Size([4996]) torch.Size([8564])
