In [None]:
import os, sys

# Pretend to be in the root dir. The root is recognised by the presence of the
# `VideoLLaMA` directory (the same relative path that is added to sys.path below).
# The guard keeps the cell idempotent: re-running it no longer climbs one more
# directory level each time.
if not os.path.isdir("VideoLLaMA"):
    os.chdir("..")
# add VideoLLaMA to path (for imports in VideoLLaMA); skip if a previous run of
# this cell already added it
if 'VideoLLaMA' not in sys.path:
    sys.path.append('VideoLLaMA')

import torch
import numpy as np
from tqdm import tqdm
from models.VideoLlamaAL import VideoLlamaAL
from models.language_decoder_utils import load_qformer, save_qformer, get_imagebind_embeds
from torch.utils.data import DataLoader
from dataset.ImagebindEmbedsDataset import ImagebindEmbedsDataset

device = torch.device("cuda")

model = VideoLlamaAL()
model = model.to(device)

# load AL checkpoint
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
ckpt = torch.load("../Video-LLaMA-2-7B-Finetuned/AL_LLaMA_2_7B_Finetuned.pth", map_location="cpu")
# strict=False tolerates key mismatches between checkpoint and model; report how
# many there were instead of discarding that information silently.
load_result = model.load_state_dict(ckpt['model'], strict=False)
print(
    f"Checkpoint load: {len(load_result.missing_keys)} model keys not in checkpoint, "
    f"{len(load_result.unexpected_keys)} checkpoint keys not in model"
)

# Quick smoke test of text-only generation.
print(model.generate_text_only("Hello?"))

In [None]:
# Restore previously saved weights into `model` with load_qformer, reading checkpoint "19.pth".
# NOTE(review): presumably this only touches the Q-Former part of the model — confirm in
# models/language_decoder_utils.
load_qformer(model, "model_saves/language_decoder/ckpts/19.pth")

## Training on sensor embeddings (ActionSense)

In [None]:
from torch import optim

batch_size = 1
num_epochs = 20
num_iters = 0           # optimisation steps taken so far, counted across all epochs
num_iters_to_eval = 50  # report the running loss every this many steps
optimizer = optim.Adam(model.parameters(), lr=2e-6) # VideoLLaMA uses 1e-5 with batch size of 4, 1e-5/4 = 2.5e-6

# Build the dataset and loader once; with shuffle=True the loader draws a fresh
# permutation every time it is iterated, so it can be reused for every epoch.
train_dataset = ImagebindEmbedsDataset("data/sensor_embeddings_random_8_new", "data/actionsense_processed", "data/train_random_8.csv")
dataloader = DataLoader(train_dataset, batch_size, shuffle=True)

# Loss accumulated since the last report. It is deliberately NOT reset at the
# start of each epoch: `num_iters` keeps counting across epochs, so a reporting
# window can straddle an epoch boundary and must still cover exactly
# `num_iters_to_eval` steps for the printed average to be correct.
total_loss = 0
for i in range(num_epochs):
    pbar = tqdm(dataloader)
    for imagebind_embeds, captions in pbar:
        imagebind_embeds = imagebind_embeds.to(device)
        # imagebind_embeds += torch.normal(0, 0.01, size=imagebind_embeds.shape, device=device)
        optimizer.zero_grad()
        loss = model(imagebind_embeds, captions)['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        num_iters += 1
        if num_iters % num_iters_to_eval == 0:
            # The sample generation is only for the progress bar; no gradients are needed.
            with torch.no_grad():
                sample_text = model.generate(imagebind_embeds)
            pbar.set_description(f"Loss: {total_loss / num_iters_to_eval}, {sample_text}")
            total_loss = 0

    # One checkpoint per epoch, named after the epoch index.
    save_qformer(model, f"model_saves/language_decoder/nonoise_ckpts/{i}.pth")

## Training on the LLaVA-Instruct-150K dataset

In [None]:
from torch import optim
from dataset.LlavaDataset import LlavaDataset

batch_size = 1
# The training step below forwards only `prompts[0]` to the model, so any other
# batch size would silently reuse the first prompt for the whole batch.
assert batch_size == 1, "this loop passes prompts[0] only; it requires batch_size == 1"

num_epochs = 1
num_iters = 0           # optimisation steps taken so far
num_iters_to_eval = 50  # report the running loss every this many steps

# Loader and optimizer are created once, outside the epoch loop, so optimizer
# state is not thrown away if `num_epochs` is ever raised above 1.
dataloader = DataLoader(LlavaDataset("data/llava_instruct_150k.json", "data/train2017_embeds"), batch_size, shuffle=True)
optimizer = optim.Adam(model.parameters(), lr=3e-6)

# Loss accumulated since the last report (window of `num_iters_to_eval` steps).
total_loss = 0
for i in range(num_epochs):
    pbar = tqdm(dataloader)
    for imagebind_embeds, prompts, targets in pbar:
        imagebind_embeds = imagebind_embeds.to(device)
        optimizer.zero_grad()
        loss = model(imagebind_embeds, targets, prompt=prompts[0], num_patch_tokens=1)['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_iters += 1

        if num_iters % num_iters_to_eval == 0:
            pbar.set_description(f"Loss: {total_loss / num_iters_to_eval}")
            total_loss = 0

## Evaluation

In [None]:
import json

# Each entry of the prompt file supplies a subject id, a start index and a question.
with open("evaluation/eval_prompts.json", "r") as file:
    json_data = json.load(file)

for entry in json_data:
    subject, start, prompt = entry["subject"], entry["start"], entry["prompt"]
    print(subject, start, prompt)

    # Disabled alternative: condition on the 8-step window [start, start+7]
    # with eight <ImageHere> placeholders.
    # print(model.generate(
    #     get_imagebind_embeds(f"data/sensor_embeddings_random_8/{subject}", start, start+7),
    #     prompt=f"<s>###\nUser:\nOpen your eyes and imagine you see: <ImageHere><ImageHere><ImageHere><ImageHere><ImageHere><ImageHere><ImageHere><ImageHere>. {prompt}\n###\nAssistant:\n"
    # ))

    # Active variant: the range start..start+0 with a single <ImageHere>
    # placeholder and one patch token.
    single_step_embeds = get_imagebind_embeds(
        f"data/sensor_embeddings_random_8/{subject}", start, start+0
    )
    answer = model.generate(
        single_step_embeds,
        prompt=f"<s>###\nUser:\nOpen your eyes and imagine you see: <ImageHere>. {prompt}\n###\nAssistant:\n",
        num_patch_tokens=1,
    )
    print(answer)
    print("=" * 50)

In [None]:
# One-off check on a single embedding from the "dark" ImageBind targets
# (subject S09, index 1495), described with one patch token.
dark_scene_embeds = get_imagebind_embeds("data/imagebind_targets_dark/S09", 1495, 1495)
dark_scene_prompt = "<s>###\nUser:\nOpen your eyes and imagine you see: <ImageHere>. Describe what is happening in this scene.\n###\nAssistant:\n"
print(model.generate(dark_scene_embeds, prompt=dark_scene_prompt, num_patch_tokens=1))

In [None]:
from itertools import islice
from evaluation.caption_metrics import calculate_metrics_new

def generate_eval_sample(dataset, prompt_type="regular", prompt=None, num=100):
    """Generate text for a shuffled sample of ``dataset`` and pair it with the references.

    Uses the notebook-level ``model`` and ``device``.

    Args:
        dataset: Sized dataset yielding ``(imagebind_embeds, caption)`` pairs.
        prompt_type: Forwarded unchanged to ``model.generate``.
        prompt: Forwarded unchanged to ``model.generate``.
        num: Maximum number of samples to evaluate. ``None`` evaluates the whole
            dataset.

    Returns:
        ``(outputs, captions_list)``: generated texts with ``"<s> "`` and
        ``"</s>"`` removed, and reference captions with ``"</s>"`` removed, in
        matching order.
    """
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    # islice has no length, so give tqdm the expected count explicitly;
    # otherwise the bar cannot show a total or an ETA.
    expected = len(loader) if num is None else min(num, len(loader))

    outputs = []
    captions_list = []
    # Evaluation only: no autograd bookkeeping is needed while generating.
    with torch.no_grad():
        for imagebind_embeds, captions in tqdm(islice(loader, num), total=expected):
            imagebind_embeds = imagebind_embeds.to(device)
            outputs.append(model.generate(imagebind_embeds, prompt_type=prompt_type, prompt=prompt))
            # outputs.append(model.generate(imagebind_embeds[:,0:1,:], prompt_type=prompt_type, num_patch_tokens=1))
            # batch_size is 1, so the batch holds exactly one caption.
            captions_list.append(captions[0])

    outputs = [output.replace("<s> ", "").replace("</s>", "") for output in outputs]
    captions_list = [caption.replace("</s>", "") for caption in captions_list]
    return outputs, captions_list

# load_qformer(model, "model_saves/language_decoder/ckpts/17.pth")
load_qformer(model, "model_saves/language_decoder/videollama.pth")

# Build the evaluation dataset once and reuse it for both the sample and its
# length (previously it was constructed twice just to call len()).
# Alternative source of embeddings: "data/sensor_embeddings_random_8_new" instead of "data/imagebind_targets".
eval_dataset = ImagebindEmbedsDataset("data/imagebind_targets", "data/actionsense_processed", "data/test_random_8.csv")
outputs, captions_list = generate_eval_sample(eval_dataset, num=len(eval_dataset))
print(outputs[0])
calculate_metrics_new(outputs, captions_list)

In [None]:
# Test splits shared by the evaluation runs listed below.
random_8_dataset = ImagebindEmbedsDataset(
    "data/sensor_embeddings_random_8",
    "data/actionsense_processed",
    "data/test_random_8.csv",
)
subject_split_dataset = ImagebindEmbedsDataset(
    "data/sensor_embeddings",
    "data/actionsense_processed",
    "data/test.csv",
)

# Each entry is (display name, checkpoint path, dataset, prompt_type).
# Uncomment the runs that should be evaluated.
models_to_evaluate = [
    # ("regular", "model_saves/language_decoder/AL_26048stage1_listlabels_random_8.pth", random_8_dataset, "list labels"),
    # ("regular 2", "model_saves/language_decoder/AL_6512stage1_random_8.pth", random_8_dataset, "regular"),

    # ("unseen users", "model_saves/language_decoder/AL11_6826stage1_noise.pth", subject_split_dataset, "regular"),

    # ("w/o noise", "model_saves/language_decoder/AL10_6826stage1.pth", subject_split_dataset, "regular"),
    # must do special run with 1 patch token only
    # ("w/o temporal embeddings", "model_saves/language_decoder/AL13_6826stage1_onepatch.pth", subject_split_dataset, "regular"),

    # ("muscle only", "model_saves/language_decoder/AL_6826stage1_muscle.pth", ImagebindEmbedsDataset("data/sensor_embeddings_muscle", "data/actionsense_processed", "data/test.csv"), "regular"),
    # ("body only", "model_saves/language_decoder/AL_6826stage1_body.pth", ImagebindEmbedsDataset("data/sensor_embeddings_body", "data/actionsense_processed", "data/test.csv"), "regular"),

    # ("VideoLLaMA", "model_saves/language_decoder/videollama_6826stage1.pth", ImagebindEmbedsDataset("data/imagebind_targets", "data/actionsense_processed", "data/test.csv"), "regular")
]

for name, qformer_path, dataset, prompt_type in models_to_evaluate:
    # Separator line followed by the run name, one per line.
    print("=" * 50, name, sep="\n")
    load_qformer(model, qformer_path)
    outputs, captions_list = generate_eval_sample(dataset, prompt_type=prompt_type)
    print(outputs[0])
    calculate_metrics_new(outputs, captions_list)

#### Non-finetuned VideoLLaMA eval

In [None]:
from evaluation.caption_metrics import calculate_metrics_new
from dataset.ImagebindEmbedsDataset import reworded_captions

# Candidate captions offered to the model, with the end-of-sequence marker removed.
activity_labels = [caption.replace("</s>", "") for caption in reworded_captions.values()]

# The prompt is a system header, one "- <caption>" bullet per candidate, and the
# user instruction with eight <ImageHere> placeholders.
system_header = "<s>[INST] <<SYS>>\nYou are a helpful assistant that can understand videos. The user will give you a video and ask you for a caption. You should choose one of the following captions to output:\n"
label_bullets = "".join(f"- {label}\n" for label in activity_labels)
instruction_footer = f"You should output a single caption and nothing more. You don't need to explain.\n<</SYS>>\n\nOpen your eyes and imagine you see: {'<ImageHere>' * 8}. Give me a caption. Only output the caption; do not output anything else. [/INST]\n"
vanilla_prompt = system_header + label_bullets + instruction_footer
print(vanilla_prompt)

vanilla_dataset = ImagebindEmbedsDataset("data/imagebind_targets", "data/actionsense_processed", "data/test_random_8_old.csv")
outputs, captions_list = generate_eval_sample(vanilla_dataset, prompt=vanilla_prompt, num=20)
# Drop "<s>" and newlines, then keep only the first 30 space-separated words:
# if response is too long, metrics will error.
outputs = [
    " ".join(output.replace("<s>","").replace("\n","").split(" ")[:30])
    for output in outputs
]
print(outputs[:5])
calculate_metrics_new(outputs, captions_list)

## Saving

In [None]:
# Persist the current weights with save_qformer under the name "regular.pth".
# NOTE(review): presumably only the Q-Former part of `model` is written — confirm in
# models/language_decoder_utils.
save_qformer(model, "model_saves/language_decoder/regular.pth")

## Confusion Matrix

In [None]:
from itertools import islice
from pprint import pprint
import csv
from dataset.ImagebindEmbedsDataset import reworded_captions

def _strip_sequence_markers(text):
    """Remove the "<s> " and "</s>" markers from a generated or reference caption."""
    return text.replace("<s> ", "").replace("</s>", "")


outputs, captions_list = [], []
# NOTE(review): `dataset_test` is not defined in any cell visible in this notebook;
# it must already exist in the kernel for this cell to run — confirm where it comes from.
# for imagebind_embeds, captions in tqdm(islice(DataLoader(dataset_test, batch_size=1, shuffle=True), 5)):
test_loader = DataLoader(dataset_test, batch_size=1, shuffle=True)
for imagebind_embeds, captions in tqdm(test_loader):
    imagebind_embeds = imagebind_embeds.to(device)
    outputs.append(_strip_sequence_markers(model.generate(imagebind_embeds)))
    # batch_size is 1, so the batch holds exactly one caption.
    captions_list.append(_strip_sequence_markers(captions[0]))

In [None]:
# Show a few (prediction, reference) pairs as a sanity check.
for x in islice(zip(outputs, captions_list), 5):
    print(x)

# Exact-match accuracy as a fraction (the raw match count alone is not an accuracy).
num_correct = sum(output == caption for output, caption in zip(outputs, captions_list))
accuracy = num_correct / len(outputs) if outputs else 0.0
print(f"Accuracy: {accuracy:.4f} ({num_correct}/{len(outputs)})")

# Class list: every reworded caption plus a catch-all "other" for unrecognised predictions.
recognized_captions = [caption.replace("</s>", "") for caption in reworded_captions.values()] + ["other"]
# Caption -> row/column index. setdefault keeps the first occurrence, matching
# what list.index() returned before, but with constant-time lookups.
caption_to_index = {}
for index, caption in enumerate(recognized_captions):
    caption_to_index.setdefault(caption, index)

adjusted_outputs = [(output if output in caption_to_index else "other") for output in outputs]
print(adjusted_outputs.count("other"))

# Rows are reference captions, columns are (adjusted) predictions.
num_classes = len(recognized_captions)
confusion_matrix = [[0] * num_classes for _ in range(num_classes)]
for output, caption in zip(adjusted_outputs, captions_list):
    if caption not in caption_to_index:
        raise ValueError(f"Reference caption is not one of the recognized captions: {caption!r}")
    confusion_matrix[caption_to_index[caption]][caption_to_index[output]] += 1

pprint(confusion_matrix)

# Make sure the results directory exists so the write cannot fail after the evaluation has run.
os.makedirs('language_decoder/eval_results', exist_ok=True)
with open('language_decoder/eval_results/confusion_matrix_big.csv', 'w', newline='') as file:
    csv.writer(file).writerows(confusion_matrix)

In [None]:
# Coarse grouping of the fine-grained captions. A caption's group id is the
# index of the sub-list that contains it (see find_group); the final
# single-element group holds the catch-all 'other' label.
caption_groups = [
    [
        'A person is cleaning a pan with a sponge.',
        'A person is cleaning a pan with a towel.',
        'A person is cleaning a plate with a sponge.',
        'A person is cleaning a plate with a towel.',
        'A person is clearing the cutting board.',
    ],
    [
        'A person is getting plates, bowls, mugs, glasses, and utensils from the cabinets.',
        'A person is retrieving items from the refrigerator, cabinets, and drawers.',
        'A person is replacing items in the refrigerator, cabinets, and drawers.',
        'A person is loading the dishwasher with plates, bowls, mugs, glasses, and utensils.',
        'A person is unloading the plates, bowls, mugs, glasses, and utensils from the dishwasher.',
    ],
    [
        'A person is putting plates and bowls onto the table.',
        'A person is setting the table with plates, bowls, mugs, glasses, and utensils.',
    ],
    [
        'A person is opening a jar of almond butter.',
        'A person is opening and closing a jar of almond butter.',
    ],
    [
        'A person is peeling a cucumber.',
        'A person is peeling a potato.',
    ],
    [
        'A person is slicing a cucumber.',
        'A person is slicing a potato.',
        'A person is slicing the bread.',
    ],
    [
        'A person is spreading almond butter on a bread slice.',
        'A person is spreading jelly on a bread slice.',
    ],
    [
        'A person is pouring water from a pitcher into a glass.',
    ],
    [
        'other'
    ],
]

In [None]:
def find_group(x, groups=None):
    """Return the index of the first group that contains ``x``.

    Args:
        x: Caption to look up.
        groups: Sequence of caption collections to search. Defaults to the
            notebook-level ``caption_groups``.

    Returns:
        The zero-based index of the first group containing ``x``, or ``None``
        when no group contains it.
    """
    if groups is None:
        groups = caption_groups
    for index, group in enumerate(groups):
        if x in group:
            return index
    # Made explicit: callers must handle captions that belong to no group.
    return None

# Group-level confusion matrix: rows are reference groups, columns are predicted groups.
num_groups = len(caption_groups)
group_confusion_matrix = [[0] * num_groups for _ in range(num_groups)]
for output, caption in zip(adjusted_outputs, captions_list):
    true_group = find_group(caption)
    predicted_group = find_group(output)
    # find_group yields None for a caption that is in no group; fail with a
    # readable message rather than a TypeError from indexing a list with None.
    if true_group is None or predicted_group is None:
        missing = caption if true_group is None else output
        raise ValueError(f"Caption does not belong to any entry of caption_groups: {missing!r}")
    group_confusion_matrix[true_group][predicted_group] += 1
pprint(group_confusion_matrix)

# Make sure the results directory exists so the write cannot fail after the evaluation has run.
os.makedirs('language_decoder/eval_results', exist_ok=True)
with open('language_decoder/eval_results/group_confusion_matrix_big.csv', 'w', newline='') as file:
    csv.writer(file).writerows(group_confusion_matrix)