In [1]:
from modeling.VidCLIP import VidCLIP
from easydict import EasyDict as edict
import torch

In [None]:
import json

def extract_unique_samples(input_file_path, output_file_path):
    """
    Deduplicate a JSONL annotation file by its ``text`` field.

    Every non-blank input line is parsed as a JSON object carrying ``text`` and
    ``clip_id`` keys. For each distinct ``text`` the first ``clip_id``
    encountered is kept, and one ``{"clip_id": ..., "text": ...}`` object per
    distinct text is written to the output file in first-seen order. Any other
    keys present on the input records are not carried over.

    :param input_file_path: Path to the input JSONL file.
    :param output_file_path: Path where the output JSONL file will be saved
        (overwritten if it already exists).
    :raises KeyError: If the first record for a text lacks ``text`` or ``clip_id``.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    unique_descriptions = {}

    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding.
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            # JSONL files commonly contain blank or trailing empty lines;
            # json.loads would raise on them and abort the whole run.
            if not line.strip():
                continue
            item = json.loads(line)
            # Only the first clip_id seen for a description is kept.
            if item['text'] not in unique_descriptions:
                unique_descriptions[item['text']] = item['clip_id']

    # dicts preserve insertion order, so output follows first-seen order.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for text, clip_id in unique_descriptions.items():
            unique_sample = json.dumps({'clip_id': clip_id, 'text': text})
            output_file.write(unique_sample + '\n')

# Example usage: deduplicate one annotation file by its description text.
# NOTE(review): the input path is an absolute, machine-specific location; adjust it before re-running.
input_file_path = '/home/bas06400/Thesis/VIP/src/developmentANDtest/annotations_rgb_comp_CV_testing_120set.jsonl'  # source JSONL annotations
output_file_path = 'unique_samples120test.jsonl'  # relative path, resolved against the current working directory

# Runs when the cell executes; the output file is opened in 'w' mode, so an existing file is overwritten.
extract_unique_samples(input_file_path, output_file_path)


In [2]:
# Create an 'args' object from the provided JSON structure.
# EasyDict gives attribute-style access to the nested configuration.
args = edict({
    "clip_config": "openai/clip-vit-base-patch16",
    "clip_weights": "openai/clip-vit-base-patch16",
    "clip_vision_additional_config": edict({
        "type": "ViP",
        "temporal_size": 12,
        "if_use_temporal_embed": True,
        "logit_scale_init_value": 4.60,
        "add_cls_num": 3
    }),
    # NOTE(review): this is still a placeholder string; the pretrained weights
    # are loaded separately with torch.load / load_state_dict in a later cell.
    "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint"
})

# Initialize the model instance and print its module structure.
model_instance = VidCLIP(args)
print(model_instance)

Some weights of CLIPModel were not initialized from the model checkpoint at openai/clip-vit-base-patch16 and are newly initialized: ['vision_model.embeddings.temporal_embedding', 'vision_model.embeddings.added_cls']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VidCLIP(
  (clipmodel): CLIPModel(
    (text_model): CLIPTextTransformer(
      (embeddings): CLIPTextEmbeddings(
        (token_embedding): Embedding(49408, 512)
        (position_embedding): Embedding(77, 512)
      )
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-11): 12 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=512, out_features=512, bias=True)
              (v_proj): Linear(in_features=512, out_features=512, bias=True)
              (q_proj): Linear(in_features=512, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=512, bias=True)
            )
            (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=512, out_features=2048, bias=True)
              (fc2): Linear(in_features=2048, out_features=512, bias=True)
   

In [3]:
# Load the pretrained CLIP-ViP weights onto the CPU regardless of the device
# they were saved from; load_state_dict then copies them into the existing
# parameters, so the checkpoint's original GPU does not need to be available.
# NOTE(review): torch.load unpickles the file -- only use checkpoints from a
# trusted source (or pass weights_only=True where the installed torch supports it).
ckpt = torch.load('/home/bas06400/Thesis/pretrain_clipvip_base_16.pt', map_location='cpu')
model_instance.load_state_dict(ckpt)

<All keys matched successfully>

In [4]:
from modeling.CLIP_ViP import CLIPVisionModel, CLIPVisionTransformer, CLIPTextModel, CLIPTextTransformer
from transformers.models.clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
from transformers import CLIPPreTrainedModel
from torch import nn


# Given json_config: only the ViP-specific vision settings are spelled out here.
json_config = {
    # ... (rest of your JSON config)
    "additional_vision_config": {
        "type": "ViP",
        "temporal_size": 12,
        "if_use_temporal_embed": 1,
        "logit_scale_init_value": 4.60,
        "add_cls_num": 3,
        # NOTE(review): "hiiden_size" looks like a misspelling of "hidden_size",
        # and no such key appears in the config passed to VidCLIP earlier in
        # this notebook -- confirm whether anything actually reads it.
        "hiiden_size": 12
    },
    # ... (rest of your JSON config)
}
class SimpleNamespace:
    """Minimal attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Write straight into the instance dict, one entry per keyword.
        for attr_name, attr_value in kwargs.items():
            self.__dict__[attr_name] = attr_value

    
# Load the vision-side configuration of the base CLIP checkpoint.
clipconfig = CLIPVisionConfig.from_pretrained("openai/clip-vit-base-patch16")

# Wrap the ViP settings in an attribute-style object and attach them to the
# config, where the CLIPVisionModel defined in this cell reads them back as
# `config.additional_vision_config`.
additional_vision_config_obj = SimpleNamespace(**json_config["additional_vision_config"])
setattr(clipconfig, "additional_vision_config", additional_vision_config_obj)
class CLIPVisionModel(CLIPPreTrainedModel):
    """
    CLIP-ViP vision transformer followed by the linear projection into the
    joint image/text embedding space.

    NOTE: this definition shadows the ``CLIPVisionModel`` name imported from
    ``modeling.CLIP_ViP`` at the top of this cell.

    The config must carry an ``additional_vision_config`` attribute, which is
    forwarded to ``CLIPVisionTransformer``.
    """
    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        # Pass the additional_vision_config to CLIPVisionTransformer
        self.vision_model = CLIPVisionTransformer(config, config.additional_vision_config)
        # Size the projection from the config rather than hard-coding 768 -> 512,
        # so the class also works for other CLIP variants. Falls back to 512 if
        # the config object has no projection_dim attribute.
        self.visual_projection = nn.Linear(
            config.hidden_size,
            getattr(config, "projection_dim", 512),
            bias=False,
        )
        # Initialize weights and apply final processing
        self.post_init()

    def forward(self, pixel_values, output_attentions=None, output_hidden_states=None, return_dict=None):
        """
        Encode ``pixel_values`` and return the projected features.

        The optional flags are forwarded unchanged to the vision transformer.
        Only the projected tensor is returned, never the transformer's full
        output structure.
        """
        vision_output = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )
        # Element 1 of the transformer output is used as the pooled representation.
        pooled_output = vision_output[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

# Build the RGB vision encoder from the augmented config. No checkpoint weights
# are loaded at this point; the trailing bare name displays the module structure.
model = CLIPVisionModel(clipconfig)
model

CLIPVisionModel(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionViPEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
      (position_embedding): Embedding(197, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
    

In [5]:
# Snapshot of every tensor held by the loaded VidCLIP model instance.
vidclip_model_weights = model_instance.state_dict()

# Keep only the vision tower and its projection head, and drop the
# "clipmodel." prefix so the keys line up with the stand-alone vision model.
state_dict = {
    name.replace("clipmodel.", ""): param
    for name, param in vidclip_model_weights.items()
    if "vision_model" in name or "visual_projection" in name
}

# Copy the selected weights into the stand-alone vision model.
model.load_state_dict(state_dict)

# Freeze the RGB encoder: none of its parameters will receive gradients.
for param in model.parameters():
    param.requires_grad_(False)

In [6]:
from multimodal_dataset import MultiModalVideoDataset
from torch.utils.data import random_split
from torch.utils.data import DataLoader

In [7]:
import torch
from torch.utils.data import random_split
import random

# Set the seed for reproducibility
seed = 42
random.seed(seed)  # Seed for Python's random module
torch.manual_seed(seed)  # Seed for PyTorch random number generators


# NOTE(review): both paths are absolute, machine-specific locations.
data_root = '/net/polaris/storage/deeplearning/ntu'
data_list = '/home/bas06400/Thesis/rgb_ir_dataset.txt'
data = MultiModalVideoDataset(data_list, data_root, ['rgb','ir'], use_advanced_processing=True)

# Sanity check: tensor shapes of both modalities and the label of the first sample.
print(data[0][0]['rgb'].shape, data[0][0]['ir'].shape, data[0][1])

# Calculate lengths of splits (80% / 10% / remainder, so the three always sum to total_len)
total_len = len(data)
train_len = int(0.8 * total_len)
val_len = int(0.1 * total_len)
test_len = total_len - train_len - val_len

# Split the dataset.
# NOTE(review): no explicit generator is passed, so the split is drawn from the
# global torch RNG seeded above. Anything that consumes that RNG between
# manual_seed and this call (possibly the dataset accesses in the print above --
# unverified) changes the split. Before switching to
# `generator=torch.Generator().manual_seed(seed)`, confirm it reproduces the
# split the saved encoders were trained on, otherwise train/test may overlap.
train_data, val_data, test_data = random_split(data, [train_len, val_len, test_len])


torch.Size([12, 3, 224, 224]) torch.Size([12, 1, 224, 224]) 1


In [8]:
def custom_collate_fn(batch):
    """
    Collate ``(data, label, index)`` samples into batched tensors.

    Args:
    - batch (list): Samples, each a ``(data, label, idx)`` triple where ``data``
      maps a modality name to a tensor of frames.

    Returns:
    - dict: For every modality of the first sample, the per-sample tensors
      stacked along a new leading batch dimension.
    - tensor: The labels shifted down by one (``label - 1``).
    - list: The per-sample ``idx`` values, in batch order.
    """
    # The first sample decides which modalities are collected.
    frames_by_modality = {modality: [] for modality in batch[0][0].keys()}
    shifted_labels = []
    sample_indices = []

    for sample, label, sample_idx in batch:
        shifted_labels.append(label - 1)
        for modality, frames in sample.items():
            frames_by_modality[modality].append(frames)
        sample_indices.append(sample_idx)

    # Stack each modality's frame tensors into one batch tensor.
    stacked = {
        modality: torch.stack(frame_list)
        for modality, frame_list in frames_by_modality.items()
    }

    return stacked, torch.tensor(shifted_labels), sample_indices


# DataLoader settings
batch_size = 8
shuffle = True
num_workers = 10
pin_memory = True

# Options shared by all three loaders; only the dataset and shuffling differ.
_common_loader_options = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
    collate_fn=custom_collate_fn,
)

# Training batches are shuffled; validation and test batches keep dataset order.
train_loader = DataLoader(train_data, shuffle=shuffle, **_common_loader_options)
val_loader = DataLoader(val_data, shuffle=False, **_common_loader_options)
test_loader = DataLoader(test_data, shuffle=False, **_common_loader_options)
"""
for batch_data, batch_labels in test_loader:
    
    print(batch_data['rgb'].shape,batch_data['ir'].shape)
    break
"""

"\nfor batch_data, batch_labels in test_loader:\n    \n    print(batch_data['rgb'].shape,batch_data['ir'].shape)\n    break\n"

In [9]:
# Build the IR encoder with the same architecture as the RGB encoder and start
# it from the RGB vision weights currently held in `state_dict`.
ir_model = CLIPVisionModel(clipconfig)
ir_model.load_state_dict(state_dict)

# Swap in a patch embedding that accepts single-channel input.
ir_model.vision_model.embeddings.patch_embedding = nn.Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
"""
# Now, average the weights across the RGB channels
old_weights = state_dict['vision_model.embeddings.patch_embedding.weight']
new_weights = old_weights.mean(dim=1, keepdim=True)

# Assign the averaged weights to the new patch_embedding layer
ir_model.vision_model.embeddings.patch_embedding.weight.data = new_weights
"""

# Keep the IR checkpoint under its own name. Rebinding `state_dict` here made
# the cell non-re-runnable: on a second run the load above would push the
# 1-channel IR weights into a fresh 3-channel model and fail on the shape.
# map_location="cpu" lets the checkpoint load without the GPU it was saved on.
# NOTE(review): torch.load unpickles the file -- trusted checkpoints only.
ir_state_dict = torch.load("/home/bas06400/Thesis/best_ir_encoder.pth", map_location="cpu")
# Strip the 'module.' key prefix so the keys match the unwrapped model.
ir_state_dict = {k.replace('module.', ''): v for k, v in ir_state_dict.items()}
ir_model.load_state_dict(ir_state_dict)

ir_model


CLIPVisionModel(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionViPEmbeddings(
      (patch_embedding): Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
      (position_embedding): Embedding(197, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
    

In [10]:
"""
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

model = torch.nn.DataParallel(model, device_ids=[0,1,2, 3]).cuda()  # Assuming GPUs 3 and 4 are available
ir_model = torch.nn.DataParallel(ir_model, device_ids=[0,1, 2, 3]).cuda()
# Hyperparameters
learning_rate = 0.001
num_epochs = 20
temperature = 0.07  # Temperature parameter for InfoNCE loss

# Initialize the optimizer
optimizer = optim.Adam(ir_model.parameters(), lr=learning_rate)

# InfoNCE Loss function
def info_nce_loss(emb1, emb2, temperature=0.07):
    # Compute similarity matrix
    sim_matrix = torch.mm(emb1, emb2.t())
    # Scale similarity by temperature
    sim_matrix = sim_matrix / temperature
    # Calculate loss
    loss = F.cross_entropy(sim_matrix, torch.arange(sim_matrix.size(0)).to(emb1.device))
    return loss

# Placeholder for best validation loss
best_val_loss = float('inf')

# Training loop
for epoch in range(num_epochs):
    epoch_loss = 0.0
    model.train()
    ir_model.train()
    # Wrap dataloader with tqdm for progress bar
    for batch_data, _ in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Move data to GPU
        rgb_data = batch_data['rgb'].cuda() #.to('cuda:3')
        ir_data = batch_data['ir'].cuda() #.to('cuda:3')

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass: Get embeddings or representations from model
        rgb_emb  = model(rgb_data)
        ir_emb  = ir_model(ir_data)

        # Compute the contrastive loss
        loss = info_nce_loss(rgb_emb, ir_emb, temperature)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        epoch_loss += loss.item()
        #break
    print(f"Epoch [{epoch+1}/{num_epochs}], Avg Loss: {epoch_loss / len(train_loader):.4f}")
    # Validation loop
    model.eval()
    ir_model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for batch_data, _ in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}"):
            rgb_data = batch_data['rgb'].cuda() #.to('cuda:3')
            ir_data = batch_data['ir'].cuda() #.to('cuda:3')
            rgb_emb  = model(rgb_data)
            ir_emb  = ir_model(ir_data)
            loss = info_nce_loss(rgb_emb, ir_emb, temperature)
            val_loss += loss.item()
            #break
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_val_loss:.4f}")
        
        # Save the best model (optional)
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_ir_encoder.pth')

print("Training complete!")
"""

'\nimport torch.nn.functional as F\nimport torch.optim as optim\nfrom tqdm import tqdm\n\nmodel = torch.nn.DataParallel(model, device_ids=[0,1,2, 3]).cuda()  # Assuming GPUs 3 and 4 are available\nir_model = torch.nn.DataParallel(ir_model, device_ids=[0,1, 2, 3]).cuda()\n# Hyperparameters\nlearning_rate = 0.001\nnum_epochs = 20\ntemperature = 0.07  # Temperature parameter for InfoNCE loss\n\n# Initialize the optimizer\noptimizer = optim.Adam(ir_model.parameters(), lr=learning_rate)\n\n# InfoNCE Loss function\ndef info_nce_loss(emb1, emb2, temperature=0.07):\n    # Compute similarity matrix\n    sim_matrix = torch.mm(emb1, emb2.t())\n    # Scale similarity by temperature\n    sim_matrix = sim_matrix / temperature\n    # Calculate loss\n    loss = F.cross_entropy(sim_matrix, torch.arange(sim_matrix.size(0)).to(emb1.device))\n    return loss\n\n# Placeholder for best validation loss\nbest_val_loss = float(\'inf\')\n\n# Training loop\nfor epoch in range(num_epochs):\n    epoch_loss = 0.0\n

In [11]:
from typing import Any, Optional, Tuple, Union
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling

# Load the text-side configuration of the base CLIP checkpoint; it is passed to
# CustomCLIPTextModel when the text model is instantiated at the end of this cell.
clip_text_config = CLIPTextConfig.from_pretrained("openai/clip-vit-base-patch16")

class CustomCLIPTextModel(CLIPTextModel):
    """
    CLIP text encoder whose pooled output is projected into the joint
    embedding space.

    ``forward`` returns the projected embedding in place of the raw pooled
    output: element 1 of the returned tuple, or ``pooler_output`` of the
    returned ``BaseModelOutputWithPooling``.
    """

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
        # No additional text config passed here as it's not provided
        self.text_model = CLIPTextTransformer(config)

        # Size the projection from the config rather than hard-coding 512 -> 512.
        # Falls back to 512 if the config object has no projection_dim attribute.
        self.text_projection = nn.Linear(
            in_features=config.hidden_size,
            out_features=getattr(config, "projection_dim", 512),
            bias=False,
        )

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        Run the text encoder and project its pooled output.

        Returns a 4-tuple ``(last_hidden_state, projected_output,
        hidden_states, attentions)`` when ``return_dict`` is falsy -- which
        includes the default ``None`` -- and a ``BaseModelOutputWithPooling``
        with ``pooler_output`` set to the projection when it is truthy.
        """
        # Always ask the parent for a structured output so `.pooler_output`
        # can be read below. Forwarding the caller's value meant that
        # return_dict=False presumably produced a plain tuple (as the Hugging
        # Face implementation does), which has no `.pooler_output` attribute.
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True
        )

        # Apply the text_projection layer to the pooled output.
        projected_output = self.text_projection(outputs.pooler_output)

        if not return_dict:
            # Tuple layout mirrors the structured output, with the projection
            # taking the pooled output's slot; entries may be None.
            outputs_tuple = (
                outputs.last_hidden_state,
                projected_output,
                outputs.hidden_states,
                outputs.attentions
            )
            return outputs_tuple

        # Otherwise, return a structured output carrying the projection.
        return BaseModelOutputWithPooling(
            last_hidden_state=outputs.last_hidden_state,
            pooler_output=projected_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Instantiate the text model with the loaded configuration. No checkpoint
# weights are loaded here; the trailing bare name displays the module structure.
text_model = CustomCLIPTextModel(clip_text_config)
text_model

CustomCLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), 

In [12]:
# Keep only the text tower and its projection head from the VidCLIP weights,
# dropping the "clipmodel." prefix so the keys line up with the text model.
state_dict = {
    name.replace("clipmodel.", ""): param
    for name, param in vidclip_model_weights.items()
    if "text_model" in name or "text_projection" in name
}

# Copy the selected weights into the text model.
text_model.load_state_dict(state_dict)

# Freeze the text encoder: none of its parameters will receive gradients.
for param in text_model.parameters():
    param.requires_grad_(False)

In [13]:
from transformers import CLIPModel, CLIPTokenizer

# Move the text encoder to the GPU used for evaluation.
# NOTE(review): the device 'cuda:3' is hard-coded here and in later cells.
text_model = text_model.to('cuda:3')
clip_model_name = "openai/clip-vit-base-patch16"
# Tokenizer of the base CLIP checkpoint named above.
tokenizer = CLIPTokenizer.from_pretrained(clip_model_name)

text_descriptions = [
    "drink water",
    "eat meal",
    "brush teeth",
    "brush hair",
    "drop",
    "pick up",
    "throw",
    "sit down",
    "stand up",
    "clapping",
    "reading",
    "writing",
    "tear up paper",
    "put on jacket",
    "take off jacket",
    "put on a shoe",
    "take off a shoe",
    "put on glasses",
    "take off glasses",
    "put on a hat/cap",
    "take off a hat/cap",
    "cheer up",
    "hand waving",
    "kicking something",
    "reach into pocket",
    "hopping",
    "jump up",
    "phone call",
    "play with phone/tablet",
    "typing on a keyboard",
    "point to something",
    "taking a selfie",
    "check time (from watch)",
    "rub two hands",
    "nod head/bow",
    "shake head",
    "wipe face",
    "salute",
    "put palms together",
    "cross hands in front",
    "sneeze/cough",
    "staggering",
    "falling down",
    "headache",
    "chest pain",
    "back pain",
    "neck pain",
    "nausea/vomiting",
    "fan self",
    "punch/slapp",
    "kicking",
    "pushing",
    "pat on back",
    "point finger",
    "hugging",
    "giving object",
    "touch pocket",
    "shaking hands",
    "walking towards",
    "walking apart"
]

# Tokenize the text descriptions (padded to a common length) and move them to the GPU.
text_inputs = tokenizer(text_descriptions, return_tensors="pt", padding=True, truncation=True).to('cuda:3')

# Number of text descriptions; no pixel values are created here.
# NOTE(review): this rebinds the module-level `batch_size` that was used when
# the DataLoaders were built -- re-running that cell afterwards would pick up
# this value instead. Consider a distinct name if nothing depends on it.
batch_size = text_inputs['input_ids'].shape[0]


# Obtain text embeddings using the text model of CLIP.
# Only input_ids are passed; the tokenizer's attention_mask is not forwarded.
with torch.no_grad():
    text_outputs = text_model(input_ids=text_inputs['input_ids'])
    text_embeddings = text_outputs

# With return_dict left at its default, CustomCLIPTextModel returns a tuple
# whose element 1 is the projected text embedding.
print(text_embeddings[1].shape)

torch.Size([60, 512])


In [14]:

import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm


def batch_cosine_similarity(x1, x2, eps=1e-8):
    """
    Pairwise cosine similarity between two sets of row vectors.

    :param x1: Tensor of shape (batch_size, embed_dim).
    :param x2: Tensor of shape (num_text_descriptions, embed_dim).
    :param eps: Lower bound applied to the product of norms so that a
        zero-norm row yields similarity 0 instead of NaN (0/0).
    :return: Tensor of shape (batch_size, num_text_descriptions) where entry
        ``[i, j]`` is the cosine similarity of ``x1[i]`` and ``x2[j]``.
    """
    dot = x1 @ x2.T
    norm1 = torch.norm(x1, p=2, dim=1).unsqueeze(1)
    norm2 = torch.norm(x2, p=2, dim=1).unsqueeze(0)
    # The clamp is a no-op for ordinary embeddings and only guards the
    # degenerate all-zero case.
    return dot / (norm1 * norm2).clamp_min(eps)
print(model)
model = model.to('cuda:3')
# Evaluation mode does not change between batches, so set it once up front
# instead of on every iteration.
model.eval()
correct_rgb_predictions = 0
total_samples = 0

with torch.no_grad():
    for batch_data, batch_labels, idx in tqdm(test_loader, desc="Evaluating", ncols=100):
        # Move data to the appropriate device
        rgb_data = batch_data['rgb'].to('cuda:3')
        batch_labels = batch_labels.to('cuda:3')
        # NOTE(review): per-batch debug output; it floods the cell output on a full run.
        print(idx)

        # Extract embeddings from the model
        rgb_emb = model(rgb_data)

        # Cosine similarity of every RGB embedding against every class-text embedding
        similarities_rgb = batch_cosine_similarity(rgb_emb, text_embeddings[1])

        # Predicted class = index of the most similar text description
        predicted_class_rgb = torch.argmax(similarities_rgb, dim=1)
        print(predicted_class_rgb)
        print(batch_labels)
        # Update correct predictions count
        correct_rgb_predictions += (predicted_class_rgb == batch_labels).sum().item()

        # Update total samples count
        total_samples += batch_labels.size(0)

# Compute accuracy; an empty loader reports NaN rather than raising ZeroDivisionError.
accuracy_rgb = correct_rgb_predictions / total_samples if total_samples else float('nan')


print(f"RGB Accuracy: {accuracy_rgb * 100:.2f}%")


CLIPVisionModel(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionViPEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
      (position_embedding): Embedding(197, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
    

Evaluating:   0%|                                                           | 0/711 [00:00<?, ?it/s]

[23416, 42806, 5234, 31987, 53795, 39887, 51540, 10851]


Evaluating:   0%|                                                 | 1/711 [00:17<3:26:14, 17.43s/it]

tensor([53,  8,  8,  8,  8,  8,  2,  8], device='cuda:3')
tensor([16, 26, 14,  7, 35, 47,  0, 51], device='cuda:3')
[29637, 1159, 7476, 16431, 20799, 52087, 52150, 41367]


Evaluating:   0%|▏                                                | 2/711 [00:17<1:26:34,  7.33s/it]

tensor([ 8, 20, 27, 53, 53,  8,  8,  8], device='cuda:3')
tensor([57, 19, 36, 51, 39,  7, 10, 27], device='cuda:3')
[6219, 10279, 8656, 23902, 9015, 16629, 1397, 28602]


Evaluating:   0%|▏                                                  | 3/711 [00:17<48:21,  4.10s/it]

tensor([31,  8,  8, 39,  8, 53,  8,  8], device='cuda:3')
tensor([39, 19, 16, 22, 15,  9, 17, 42], device='cuda:3')
[47820, 44664, 25441, 26051, 32787, 17716, 40133, 45648]


Evaluating:   1%|▎                                                  | 4/711 [00:18<30:24,  2.58s/it]

tensor([ 8, 29, 53, 55, 31, 53, 53,  8], device='cuda:3')
tensor([ 0, 24,  1, 11, 27, 16, 53, 48], device='cuda:3')
[37058, 41083, 2651, 11199, 12412, 43992, 14099, 23274]


Evaluating:   1%|▎                                                  | 5/711 [00:18<20:30,  1.74s/it]

tensor([29,  8,  8, 53, 53, 29,  8, 54], device='cuda:3')
tensor([38, 43, 11, 39, 52, 12, 59, 54], device='cuda:3')
[49571, 3526, 40385, 13576, 39547, 1343, 7133, 8908]


Evaluating:   1%|▍                                                  | 6/711 [00:18<14:32,  1.24s/it]

tensor([29,  8,  8,  8,  8,  8, 53, 31], device='cuda:3')
tensor([11, 46,  5, 16,  7, 23, 53, 28], device='cuda:3')
[41591, 32699, 6608, 38317, 20560, 16444, 49489, 54842]


Evaluating:   1%|▌                                                  | 7/711 [00:18<10:45,  1.09it/s]

tensor([29, 29, 29, 31,  8, 55, 53, 53], device='cuda:3')
tensor([11, 59,  8, 37, 40,  4, 49,  2], device='cuda:3')
[2676, 5147, 14576, 12730, 25412, 37438, 31811, 55466]


Evaluating:   1%|▌                                                  | 8/711 [00:19<08:16,  1.42it/s]

tensor([18,  8, 53,  8, 53, 29, 29,  8], device='cuda:3')
tensor([36, 47, 56, 10, 32, 58, 11, 26], device='cuda:3')
[6440, 12557, 29337, 12734, 36598, 25210, 3755, 31339]


Evaluating:   1%|▋                                                  | 9/711 [00:19<06:37,  1.77it/s]

tensor([53, 31, 57, 14, 29, 53,  8, 19], device='cuda:3')
tensor([20, 17, 57, 14, 58, 10, 35, 19], device='cuda:3')
[2094, 11298, 18856, 53395, 5556, 32682, 9351, 3620]


Evaluating:   1%|▋                                                 | 10/711 [00:19<05:29,  2.12it/s]

tensor([54, 53, 29,  8, 53,  8, 53, 20], device='cuda:3')
tensor([54, 18, 16, 55, 36, 42, 51, 20], device='cuda:3')


In [None]:
text_descriptions60_120 = [
    "put on headphone",
    "take off headphone",
    "shoot at the basket",
    "bounce ball",
    "tennis bat swing",
    "juggling table tennis balls",
    "hush (quite)",
    "flick hair",
    "thumb up",
    "thumb down",
    "make ok sign",
    "make victory sign",
    "staple book",
    "counting money",
    "cutting nails",
    "cutting paper (using scissors)",
    "snapping fingers",
    "open bottle",
    "sniff (smell)",
    "squat down",
    "toss a coin",
    "fold paper",
    "ball up paper",
    "play magic cube",
    "apply cream on face",
    "apply cream on hand back",
    "put on bag",
    "take off bag",
    "put something into a bag",
    "take something out of a bag",
    "open a box",
    "move heavy objects",
    "shake fist",
    "throw up cap/hat",
    "hands up (both hands)",
    "cross arms",
    "arm circles",
    "arm swings",
    "running on the spot",
    "butt kicks (kick backward)",
    "cross toe touch",
    "side kick",
    "yawn",
    "stretch oneself",
    "blow nose",
    "hit other person with something",
    "wield knife towards other person",
    "knock over other person (hit with body)",
    "grab other person’s stuff",
    "shoot at other person with a gun",
    "step on foot",
    "high-five",
    "cheers and drink",
    "carry something with other person",
    "take a photo of other person",
    "follow other person",
    "whisper in other person’s ear",
    "exchange things with other person",
    "support somebody with hand",
    "finger-guessing game (playing rock-paper-scissors)"
]



In [1]:
text_descriptions = [
    "drink water",
    "eat meal",
    "brush teeth",
    "brush hair",
    "drop",
    "pick up",
    "throw",
    "sit down",
    "stand up",
    "clapping",
    "reading",
    "writing",
    "tear up paper",
    "put on jacket",
    "take off jacket",
    "put on a shoe",
    "take off a shoe",
    "put on glasses",
    "take off glasses",
    "put on a hat/cap",
    "take off a hat/cap",
    "cheer up",
    "hand waving",
    "kicking something",
    "reach into pocket",
    "hopping",
    "jump up",
    "phone call",
    "play with phone/tablet",
    "typing on a keyboard",
    "point to something",
    "taking a selfie",
    "check time (from watch)",
    "rub two hands",
    "nod head/bow",
    "shake head",
    "wipe face",
    "salute",
    "put palms together",
    "cross hands in front",
    "sneeze/cough",
    "staggering",
    "falling down",
    "headache",
    "chest pain",
    "back pain",
    "neck pain",
    "nausea/vomiting",
    "fan self",
    "punch/slapp",
    "kicking",
    "pushing",
    "pat on back",
    "point finger",
    "hugging",
    "giving object",
    "touch pocket",
    "shaking hands",
    "walking towards",
    "walking apart"
]

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [None]:
# Sentence-style descriptions for 60 further actions (one string per action).
# The same 60 strings also appear, in the same order, as the second half of the
# text_descriptions120 literal defined in a later cell.
# NOTE(review): the first entry has no trailing period, unlike the others; it is
# left untouched here because the identical string is repeated in
# text_descriptions120, which is used to label annotations.
extended_descriptions120 = [
    "A person putting on headphones in a quiet study room",
    "An individual taking off headphones in a home office with a computer.",
    "Someone shooting a basketball towards a hoop in an outdoor court during sunset.",
    "A person bouncing a ball on a paved driveway with a basketball hoop in the background.",
    "An individual swinging a tennis racket at a yellow ball on a sunny tennis court.",
    "Someone juggling table tennis balls in a bright game room with a ping pong table.",
    "A person gesturing 'hush' with a finger on lips in a library filled with bookshelves.",
    "An individual flicking their long hair back in a mirror reflection in a dance studio.",
    "A person giving a thumbs up in a bright classroom with students and a chalkboard.",
    "Someone giving a thumbs down in a meeting room with a large monitor displaying data.",
    "An individual making an OK sign with their hand in a cozy cafe with coffee cups on tables.",
    "A person making a victory sign with their fingers in front of a scenic viewpoint overlooking mountains.",
    "Someone stapling pages of a book in a crafting room with art supplies on shelves.",
    "An individual counting money on a wooden table in a small business office.",
    "A person cutting nails sitting on a porch with a garden view.",
    "Someone cutting paper with scissors on a cluttered craft table in an art studio.",
    "An individual snapping fingers to music in a brightly lit kitchen while cooking.",
    "A person opening a bottle in a picnic setting with a basket and a blanket.",
    "Someone sniffing a perfume bottle in a boutique with shelves of beauty products.",
    "An individual squatting down to tie shoelaces in a gym with workout equipment.",
    "A person tossing a coin into a fountain outdoors with trees in the background.",
    "Someone folding paper into an airplane in a classroom with kids' drawings on the walls.",
    "An individual balling up paper in frustration in an office with a computer and documents.",
    "A person playing with a magic cube in a cozy living room on a soft rug.",
    "Someone applying cream on their face in a bright bathroom with a large mirror.",
    "An individual applying cream on the back of their hand in a beauty salon with products on display.",
    "A person putting on a backpack in a hostel dormitory with bunk beds and lockers.",
    "Someone taking off a backpack at the entrance of a hiking trail with woods in the background.",
    "An individual putting something into a bag on a kitchen counter with groceries around.",
    "A person taking something out of a bag in a classroom with desks and a projector.",
    "Someone opening a box in a living room on Christmas morning with decorations.",
    "An individual moving heavy objects in a garage filled with tools and storage boxes.",
    "A person shaking their fist in excitement at a sports event with a crowd cheering.",
    "Someone throwing up a cap in celebration during a graduation ceremony in an open field.",
    "An individual with hands up in surrender during a playful water fight in a backyard.",
    "A person crossing their arms while waiting in a coffee shop with a line of customers.",
    "Someone doing arm circles as part of a warm-up in a fitness class at a gym.",
    "An individual performing arm swings in a park with autumn leaves on the ground.",
    "A person running on the spot in a home gym with a treadmill and weights.",
    "Someone doing butt kicks during a track workout on a sunny day at an athletic field.",
    "An individual performing a cross toe touch in a yoga studio with mats and calming decor.",
    "A person executing a side kick in a martial arts dojo with mirrors and training pads.",
    "Someone yawning widely in a cozy bedroom early in the morning with the sun rising.",
    "An individual stretching themselves in an office during a break with a coffee cup on the desk.",
    "A person blowing their nose with a tissue in a bright, airy living room with a large window.",
    "Someone hitting another person with a foam bat in a playful outdoor party setting.",
    "An individual wielding a knife towards another person in a dramatic theater rehearsal scene.",
    "A person knocking over another person during a friendly beach volleyball game.",
    "Someone grabbing another person’s hat in a playful manner at a sunny park.",
    "An individual shooting at another person with a water gun during a summer backyard party.",
    "A person stepping on someone’s foot accidentally in a crowded subway car.",
    "Someone high-fiving in a sports team huddle on a field with goals in the background.",
    "An individual cheering and drinking with friends at a rooftop bar with city lights.",
    "Two people carrying a couch together into a new apartment with boxes around.",
    "A person taking a photo of another person in front of a famous landmark during a trip.",
    "Someone following another person in a vast open room.",
    "An individual whispering in another person’s ear during a secret exchange in a library.",
    "Two people exchanging things in an industrial building.",
    "Someone supporting somebody with a hand during a difficult hiking trail with scenic views.",
    "Two individuals playing rock-paper-scissors in a schoolyard with children watching."]

In [2]:
# Sentence-style descriptions for 60 action classes, one string per class.
# Order matters: a later cell enumerates this list starting at 1 to build
# class_index_to_text_description, so the k-th entry describes action label k.
# NOTE(review): the saved output of that mapping cell shows short labels
# ('drink water', 'eat meal', ...) rather than these sentences, so it was
# apparently executed while `text_descriptions` was bound to a different list.
# Confirm the intended execution order before regenerating annotations.
text_descriptions = [
    "A person drinking water from a clear glass in a kitchen.",
    "An individual eating a meal at a dining table, using a fork and knife.",
    "A person brushing teeth with a toothbrush in a bathroom mirror.",
    "Someone brushing long hair with a hairbrush in a bedroom.",
    "A person dropping a red ball onto a wooden floor in a living room.",
    "An individual picking up a blue book from the floor in a study room.",
    "A person throwing a white paper airplane in an office setting.",
    "Someone sitting down on a green armchair in a cozy room.",
    "An individual standing up from a metal chair in a cafeteria.",
    "A person clapping hands in an auditorium with a stage.",
    "Someone reading a hardcover book in a library with bookshelves.",
    "An individual writing in a notebook at a desk with a lamp.",
    "A person tearing up a sheet of paper over a trash bin in a workspace.",
    "Someone putting on a black jacket in a hallway with coat hangers.",
    "An individual taking off a red jacket in a changing room.",
    "A person putting on a white sneaker in a gym locker room.",
    "Someone taking off a brown shoe in an entryway with a shoe rack.",
    "An individual putting on eyeglasses in an office with a computer.",
    "A person taking off sunglasses in a sunlit atrium.",
    "Someone putting on a baseball cap in a sports store.",
    "An individual taking off a wool hat in a coat room.",
    "A person cheering up, smiling and laughing in a living room with a sofa.",
    "Someone waving hand in a greeting at a hotel lobby.",
    "An individual kicking a small football in an indoor play area.",
    "A person reaching into a pocket of jeans in a bedroom.",
    "Someone hopping on one foot in a fitness studio.",
    "An individual jumping up with arms raised in a dance studio.",
    "A person making a phone call on a smartphone in a home office.",
    "Someone playing with a tablet on a couch in a family room.",
    "An individual typing on a keyboard at a computer desk in a study.",
    "A person pointing to a painting on a wall in an art gallery.",
    "Someone taking a selfie with a phone in a mirror in a dressing room.",
    "An individual checking time on a wristwatch in a conference room.",
    "A person rubbing two hands together in a kitchen.",
    "Someone nodding head in agreement in a meeting room with a whiteboard.",
    "An individual shaking head in disapproval in a classroom.",
    "A person wiping face with a handkerchief in a bathroom.",
    "Someone saluting in a uniform in a military office.",
    "An individual putting palms together in a gesture of prayer in a chapel.",
    "A person crossing arms in front in a casual home setting.",
    "Someone sneezing into a tissue in a doctor's waiting room.",
    "An individual staggering in a hallway as if dizzy.",
    "A person falling down onto a carpet in a living room.",
    "Someone holding head in pain, indicating a headache, in an office.",
    "An individual clutching chest in pain in a home living area.",
    "A person holding lower back in pain in a furniture store.",
    "Someone holding neck in pain in a home study.",
    "An individual feeling nauseous, about to vomit, in a bathroom.",
    "A person fanning self with a magazine in a warm room.",
    "Someone punching the air in a boxing gym.",
    "An individual kicking a pillow in a bedroom.",
    "A person pushing a chair in a dining room.",
    "Someone patting a friend on the back in a coffee shop.",
    "An individual pointing a finger at a computer screen in an office.",
    "A person hugging a friend in a living room.",
    "Someone giving a pen to another person in an office.",
    "An individual touching the pocket of their jeans in a bedroom.",
    "Two people shaking hands in a business meeting room.",
    "A person walking towards a window in a bright room.",
    "Two individuals walking apart in a hallway of an office building."
]

# Sanity check: the list is expected to hold exactly 60 descriptions.
print(len(text_descriptions))

60


In [None]:
# Sentence-style descriptions for 120 action classes, one string per class.
# The first 60 entries repeat the text_descriptions literal and the last 60
# repeat extended_descriptions120; the strings are written out again here
# instead of concatenating those lists.
# Order matters: a later cell enumerates this list starting at 1 to build
# class_index_to_text_description, so the k-th entry describes action label k,
# and that text ends up in the 'text' field of the generated annotations.
# NOTE(review): entry 61 ("... in a quiet study room") has no trailing period;
# changing it would change the label text written to annotation files.
text_descriptions120 = [
    "A person drinking water from a clear glass in a kitchen.",
    "An individual eating a meal at a dining table, using a fork and knife.",
    "A person brushing teeth with a toothbrush in a bathroom mirror.",
    "Someone brushing long hair with a hairbrush in a bedroom.",
    "A person dropping a red ball onto a wooden floor in a living room.",
    "An individual picking up a blue book from the floor in a study room.",
    "A person throwing a white paper airplane in an office setting.",
    "Someone sitting down on a green armchair in a cozy room.",
    "An individual standing up from a metal chair in a cafeteria.",
    "A person clapping hands in an auditorium with a stage.",
    "Someone reading a hardcover book in a library with bookshelves.",
    "An individual writing in a notebook at a desk with a lamp.",
    "A person tearing up a sheet of paper over a trash bin in a workspace.",
    "Someone putting on a black jacket in a hallway with coat hangers.",
    "An individual taking off a red jacket in a changing room.",
    "A person putting on a white sneaker in a gym locker room.",
    "Someone taking off a brown shoe in an entryway with a shoe rack.",
    "An individual putting on eyeglasses in an office with a computer.",
    "A person taking off sunglasses in a sunlit atrium.",
    "Someone putting on a baseball cap in a sports store.",
    "An individual taking off a wool hat in a coat room.",
    "A person cheering up, smiling and laughing in a living room with a sofa.",
    "Someone waving hand in a greeting at a hotel lobby.",
    "An individual kicking a small football in an indoor play area.",
    "A person reaching into a pocket of jeans in a bedroom.",
    "Someone hopping on one foot in a fitness studio.",
    "An individual jumping up with arms raised in a dance studio.",
    "A person making a phone call on a smartphone in a home office.",
    "Someone playing with a tablet on a couch in a family room.",
    "An individual typing on a keyboard at a computer desk in a study.",
    "A person pointing to a painting on a wall in an art gallery.",
    "Someone taking a selfie with a phone in a mirror in a dressing room.",
    "An individual checking time on a wristwatch in a conference room.",
    "A person rubbing two hands together in a kitchen.",
    "Someone nodding head in agreement in a meeting room with a whiteboard.",
    "An individual shaking head in disapproval in a classroom.",
    "A person wiping face with a handkerchief in a bathroom.",
    "Someone saluting in a uniform in a military office.",
    "An individual putting palms together in a gesture of prayer in a chapel.",
    "A person crossing arms in front in a casual home setting.",
    "Someone sneezing into a tissue in a doctor's waiting room.",
    "An individual staggering in a hallway as if dizzy.",
    "A person falling down onto a carpet in a living room.",
    "Someone holding head in pain, indicating a headache, in an office.",
    "An individual clutching chest in pain in a home living area.",
    "A person holding lower back in pain in a furniture store.",
    "Someone holding neck in pain in a home study.",
    "An individual feeling nauseous, about to vomit, in a bathroom.",
    "A person fanning self with a magazine in a warm room.",
    "Someone punching the air in a boxing gym.",
    "An individual kicking a pillow in a bedroom.",
    "A person pushing a chair in a dining room.",
    "Someone patting a friend on the back in a coffee shop.",
    "An individual pointing a finger at a computer screen in an office.",
    "A person hugging a friend in a living room.",
    "Someone giving a pen to another person in an office.",
    "An individual touching the pocket of their jeans in a bedroom.",
    "Two people shaking hands in a business meeting room.",
    "A person walking towards a window in a bright room.",
    "Two individuals walking apart in a hallway of an office building.",
    "A person putting on headphones in a quiet study room",
    "An individual taking off headphones in a home office with a computer.",
    "Someone shooting a basketball towards a hoop in an outdoor court during sunset.",
    "A person bouncing a ball on a paved driveway with a basketball hoop in the background.",
    "An individual swinging a tennis racket at a yellow ball on a sunny tennis court.",
    "Someone juggling table tennis balls in a bright game room with a ping pong table.",
    "A person gesturing 'hush' with a finger on lips in a library filled with bookshelves.",
    "An individual flicking their long hair back in a mirror reflection in a dance studio.",
    "A person giving a thumbs up in a bright classroom with students and a chalkboard.",
    "Someone giving a thumbs down in a meeting room with a large monitor displaying data.",
    "An individual making an OK sign with their hand in a cozy cafe with coffee cups on tables.",
    "A person making a victory sign with their fingers in front of a scenic viewpoint overlooking mountains.",
    "Someone stapling pages of a book in a crafting room with art supplies on shelves.",
    "An individual counting money on a wooden table in a small business office.",
    "A person cutting nails sitting on a porch with a garden view.",
    "Someone cutting paper with scissors on a cluttered craft table in an art studio.",
    "An individual snapping fingers to music in a brightly lit kitchen while cooking.",
    "A person opening a bottle in a picnic setting with a basket and a blanket.",
    "Someone sniffing a perfume bottle in a boutique with shelves of beauty products.",
    "An individual squatting down to tie shoelaces in a gym with workout equipment.",
    "A person tossing a coin into a fountain outdoors with trees in the background.",
    "Someone folding paper into an airplane in a classroom with kids' drawings on the walls.",
    "An individual balling up paper in frustration in an office with a computer and documents.",
    "A person playing with a magic cube in a cozy living room on a soft rug.",
    "Someone applying cream on their face in a bright bathroom with a large mirror.",
    "An individual applying cream on the back of their hand in a beauty salon with products on display.",
    "A person putting on a backpack in a hostel dormitory with bunk beds and lockers.",
    "Someone taking off a backpack at the entrance of a hiking trail with woods in the background.",
    "An individual putting something into a bag on a kitchen counter with groceries around.",
    "A person taking something out of a bag in a classroom with desks and a projector.",
    "Someone opening a box in a living room on Christmas morning with decorations.",
    "An individual moving heavy objects in a garage filled with tools and storage boxes.",
    "A person shaking their fist in excitement at a sports event with a crowd cheering.",
    "Someone throwing up a cap in celebration during a graduation ceremony in an open field.",
    "An individual with hands up in surrender during a playful water fight in a backyard.",
    "A person crossing their arms while waiting in a coffee shop with a line of customers.",
    "Someone doing arm circles as part of a warm-up in a fitness class at a gym.",
    "An individual performing arm swings in a park with autumn leaves on the ground.",
    "A person running on the spot in a home gym with a treadmill and weights.",
    "Someone doing butt kicks during a track workout on a sunny day at an athletic field.",
    "An individual performing a cross toe touch in a yoga studio with mats and calming decor.",
    "A person executing a side kick in a martial arts dojo with mirrors and training pads.",
    "Someone yawning widely in a cozy bedroom early in the morning with the sun rising.",
    "An individual stretching themselves in an office during a break with a coffee cup on the desk.",
    "A person blowing their nose with a tissue in a bright, airy living room with a large window.",
    "Someone hitting another person with a foam bat in a playful outdoor party setting.",
    "An individual wielding a knife towards another person in a dramatic theater rehearsal scene.",
    "A person knocking over another person during a friendly beach volleyball game.",
    "Someone grabbing another person’s hat in a playful manner at a sunny park.",
    "An individual shooting at another person with a water gun during a summer backyard party.",
    "A person stepping on someone’s foot accidentally in a crowded subway car.",
    "Someone high-fiving in a sports team huddle on a field with goals in the background.",
    "An individual cheering and drinking with friends at a rooftop bar with city lights.",
    "Two people carrying a couch together into a new apartment with boxes around.",
    "A person taking a photo of another person in front of a famous landmark during a trip.",
    "Someone following another person in a vast open room.",
    "An individual whispering in another person’s ear during a secret exchange in a library.",
    "Two people exchanging things in an industrial building.",
    "Someone supporting somebody with a hand during a difficult hiking trail with scenic views.",
    "Two individuals playing rock-paper-scissors in a schoolyard with children watching."
]

# Sanity check: the list is expected to hold exactly 120 descriptions.
print(len(text_descriptions120))

120


The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# Number the entries of text_descriptions from 1 so that an integer action
# label can be used directly as the lookup key.
# Note: the next cell assigns the same name from text_descriptions120, so the
# mapping in effect depends on which of the two cells was run last.
class_index_to_text_description = dict(enumerate(text_descriptions, start=1))
print(class_index_to_text_description)

{1: 'drink water', 2: 'eat meal', 3: 'brush teeth', 4: 'brush hair', 5: 'drop', 6: 'pick up', 7: 'throw', 8: 'sit down', 9: 'stand up', 10: 'clapping', 11: 'reading', 12: 'writing', 13: 'tear up paper', 14: 'put on jacket', 15: 'take off jacket', 16: 'put on a shoe', 17: 'take off a shoe', 18: 'put on glasses', 19: 'take off glasses', 20: 'put on a hat/cap', 21: 'take off a hat/cap', 22: 'cheer up', 23: 'hand waving', 24: 'kicking something', 25: 'reach into pocket', 26: 'hopping', 27: 'jump up', 28: 'phone call', 29: 'play with phone/tablet', 30: 'typing on a keyboard', 31: 'point to something', 32: 'taking a selfie', 33: 'check time (from watch)', 34: 'rub two hands', 35: 'nod head/bow', 36: 'shake head', 37: 'wipe face', 38: 'salute', 39: 'put palms together', 40: 'cross hands in front', 41: 'sneeze/cough', 42: 'staggering', 43: 'falling down', 44: 'headache', 45: 'chest pain', 46: 'back pain', 47: 'neck pain', 48: 'nausea/vomiting', 49: 'fan self', 50: 'punch/slapp', 51: 'kicking',

In [4]:
# Number the entries of text_descriptions120 from 1 so that an integer action
# label can be used directly as the lookup key.
# Note: this rebinds the name also assigned by the previous cell (built from
# text_descriptions); whichever cell ran last determines the mapping in use.
class_index_to_text_description = dict(enumerate(text_descriptions120, start=1))
print(class_index_to_text_description)

{1: 'A person drinking water from a clear glass in a kitchen.', 2: 'An individual eating a meal at a dining table, using a fork and knife.', 3: 'A person brushing teeth with a toothbrush in a bathroom mirror.', 4: 'Someone brushing long hair with a hairbrush in a bedroom.', 5: 'A person dropping a red ball onto a wooden floor in a living room.', 6: 'An individual picking up a blue book from the floor in a study room.', 7: 'A person throwing a white paper airplane in an office setting.', 8: 'Someone sitting down on a green armchair in a cozy room.', 9: 'An individual standing up from a metal chair in a cafeteria.', 10: 'A person clapping hands in an auditorium with a stage.', 11: 'Someone reading a hardcover book in a library with bookshelves.', 12: 'An individual writing in a notebook at a desk with a lamp.', 13: 'A person tearing up a sheet of paper over a trash bin in a workspace.', 14: 'Someone putting on a black jacket in a hallway with coat hangers.', 15: 'An individual taking off

In [3]:
import os
import json


# Build one annotation record ({'clip_id', 'text'}) per file in the RGB video
# directory and save the records as JSON Lines.
# Requires class_index_to_text_description (integer action label -> text) to
# have been defined by an earlier cell; the text written here therefore depends
# on which mapping cell was executed last.

# Path to the RGB videos directory
data_root = "/net/polaris/storage/deeplearning/ntu"
rgb_modality = 'nturgb+d_rgb'
annotation_list = []

# Directory containing RGB modality
rgb_path = os.path.join(data_root, rgb_modality)

# os.listdir() returns names in arbitrary order. Iterate in sorted order so the
# annotation file -- and every split later derived from it, including the
# seeded shuffle in split_dataset -- is identical from run to run.
for filename in sorted(os.listdir(rgb_path)):
    # The clip id is the file name without its extension
    prefix, _ = os.path.splitext(filename)
    # The action label is the text after the last 'A', up to the next '_'
    action_label = prefix.split('A')[-1].split('_')[0]

    # Convert to int so leading zeros are dropped and the value matches the
    # integer keys of the mapping. A file name that does not follow the
    # expected pattern makes int() raise ValueError here.
    action_label = int(action_label)
    # Labels missing from the mapping are recorded as "Unknown action"
    text_description = class_index_to_text_description.get(action_label, "Unknown action")

    # Create the annotation entry
    annotation_entry = {
        'clip_id': prefix,  # File name without extension as the clip ID
        'text': text_description
    }

    # Add the annotation entry to the list
    annotation_list.append(annotation_entry)

# Save the annotations to a JSONL file (one JSON object per line)
annotation_file = "/home/bas06400/Thesis/VIP/src/developmentANDtest/annotations_rgb_words.jsonl"
with open(annotation_file, 'w') as f:
    for annotation in annotation_list:
        f.write(json.dumps(annotation) + '\n')

In [10]:
import json

#Cross Subject60 split

# Function to extract the subject ID from the clip id
def get_subject_id(file_path):
    """Return the subject id embedded in a clip id, formatted with at least two digits.

    The id is the run of digits between the first 'P' and the next 'R', e.g.
    'S001C001P001R001A001' -> '01'. The cross-subject split below compares the
    result against two-digit strings such as '01' or '38'.

    The previous implementation skipped a fixed two characters after 'P', which
    silently discarded the first digit: 'P101' became '01' and was mistaken for
    subject 1. The whole number is now parsed, so ids below 100 are returned
    exactly as before ('001' -> '01', '040' -> '40') while larger ids keep all
    their digits ('101' -> '101').

    :param file_path: Clip id (or file name) containing a 'P<digits>R' section.
    :return: The subject number as a string, zero-padded to two digits.
    :raises ValueError: If no 'P<digits>R' section can be found, instead of
        returning an arbitrary slice of the input.
    """
    p_pos = file_path.find('P')
    r_pos = file_path.find('R', p_pos + 1) if p_pos != -1 else -1
    digits = file_path[p_pos + 1:r_pos] if r_pos != -1 else ''

    if not digits.isdecimal():
        raise ValueError("Cannot find a subject id (P<digits>R) in %r" % (file_path,))

    return str(int(digits)).zfill(2)

# Cross-subject split: every entry whose subject id belongs to the training
# subjects is written to the train file; all remaining entries go to the
# validation file. Entries are re-serialized with json.dumps.
with open('/home/bas06400/Thesis/CLIPVIP_Datasets/ntu_all_generated_descriptions.jsonl', 'r') as infile, \
     open('/home/bas06400/Thesis/CLIPVIP_Datasets/generated_descriptions_CS_train.jsonl', 'w') as train_file, \
     open('/home/bas06400/Thesis/CLIPVIP_Datasets/generated_descriptions_CS_val.jsonl', 'w') as val_file:

    for line in infile:
        entry = json.loads(line)
        # The subject id is parsed out of the entry's clip id
        file_paths = entry["clip_id"]
        subject_id = get_subject_id(file_paths)

        # Two-digit ids of the subjects used for training
        (
            train_file
            if subject_id in {'01', '02', '04', '05', '08', '09', '13', '14', '15', '16', '17', '18', '19', '25', '27', '28', '31', '34', '35', '38'}
            else val_file
        ).write(json.dumps(entry) + "\n")


In [12]:
import json

#Cross Subject120 split

# Subject ids whose clips go to the training split of the 120-class
# cross-subject protocol. Ids are three-character, zero-padded strings so they
# compare equal to what get_subject_id() in this cell returns.
training_subjects = set((
    '001', '002', '004', '005', '008', '009',
    '013', '014', '015', '016', '017', '018', '019',
    '025', '027', '028',
    '031', '034', '035', '038',
    '045', '046', '047', '049',
    '050', '052', '053', '054', '055', '056', '057', '058', '059',
    '070', '074', '078',
    '080', '081', '082', '083', '084', '085', '086', '089',
    '091', '092', '093', '094', '095', '097', '098',
    '100', '103',
))

def get_subject_id(file_path):
    """Return the text between the first 'P' in ``file_path`` and the next 'R'.

    For a clip id such as 'S001C001P001R001A001' this is the full subject id
    '001', matching the three-character strings in ``training_subjects``.

    Note: the 60-class split cell defines a function with the same name that
    formats the id differently; whichever cell was run last is the definition
    in effect.

    :param file_path: Clip id (or file name) to parse.
    :return: The substring after the first 'P' up to, but excluding, the next 'R'.
    """
    begin = 1 + file_path.find('P')
    return file_path[begin:file_path.find('R', begin)]

# Cross-subject split for the 120-class annotations: entries whose subject id
# is listed in training_subjects are written to the training file, all other
# entries to the testing file. Entries are re-serialized with json.dumps.
with open('/home/bas06400/annotations_rgb_120.jsonl', 'r') as infile, \
     open('/home/bas06400/Thesis/CLIPVIP_Datasets/CS120_training_set.jsonl', 'w') as train_file, \
     open('/home/bas06400/Thesis/CLIPVIP_Datasets/CS120_testing_set.jsonl', 'w') as val_file:

    for line in infile:
        entry = json.loads(line)
        # The subject id is parsed out of the entry's clip id
        file_paths = entry["clip_id"]
        subject_id = get_subject_id(file_paths)

        belongs_to_training = subject_id in training_subjects
        (train_file if belongs_to_training else val_file).write(json.dumps(entry) + "\n")

In [6]:
import json

def get_camera_id(clip_id):
    """Return the text between the first 'C' in ``clip_id`` and the next 'P'.

    For a clip id such as 'S001C002P003R001A010' this is the camera id '002'.

    :param clip_id: Clip id to parse.
    :return: The substring after the first 'C' up to, but excluding, the next 'P'.
    """
    begin = 1 + clip_id.find('C')
    stop = clip_id.find('P', begin)
    return clip_id[begin:stop]

# Cross-view split: clips from cameras 002 and 003 go to the training file,
# clips from camera 001 go to the testing file. Entries with any other camera
# id are written to neither file.
with open('/home/bas06400/Thesis/VIP/src/developmentANDtest/annotations_rgb_words.jsonl', 'r') as infile, \
     open('annotations_rgb_comp_CV_training_set_words.jsonl', 'w') as train_file, \
     open('annotations_rgb_comp_CV_testing_set_words.jsonl', 'w') as val_file:

    # Camera id -> output file
    destination_by_camera = {'002': train_file, '003': train_file, '001': val_file}

    for line in infile:
        entry = json.loads(line)
        clip_id = entry['clip_id']
        camera_id = get_camera_id(clip_id).zfill(3)  # Zero-pad for consistency

        destination = destination_by_camera.get(camera_id)
        if destination is None:
            continue
        destination.write(json.dumps(entry) + "\n")


In [9]:
import json

def get_setup_id(clip_id):
    """Return the text between the first 'S' in ``clip_id`` and the next 'C'.

    For a clip id such as 'S017C003P040R002A060' this is the setup id '017'
    (not the camera id, which follows the 'C').

    :param clip_id: Clip id to parse.
    :return: The substring after the first 'S' up to, but excluding, the next 'C'.
    """
    begin = 1 + clip_id.find('S')
    stop = clip_id.find('C', begin)
    return clip_id[begin:stop]

# Split the 120-class annotations by setup id parity: even setup ids go to the
# training file, odd setup ids to the testing file. (The output file names say
# "CV", but the criterion used here is the setup id, not the camera id.)
# Input lines are copied verbatim rather than re-serialized.
with open('/home/bas06400/annotations_rgb_120.jsonl', 'r') as infile, \
     open('annotations_rgb_comp_CV_training_120set.jsonl', 'w') as train_file, \
     open('annotations_rgb_comp_CV_testing_120set.jsonl', 'w') as val_file:

    for line in infile:
        entry = json.loads(line)
        clip_id = entry['clip_id']
        setup_id = int(get_setup_id(clip_id).zfill(3))

        # A non-zero remainder means an odd setup id -> testing file
        (val_file if setup_id % 2 else train_file).write(line)

In [None]:
import os
import json
import random
from collections import defaultdict

def split_dataset(annotation_file, train_file, val_file, test_file, train_ratio=0.8, val_ratio=0.1, seed=42):
    """Split a JSONL annotation file into train/val/test files, stratified by class.

    Annotations are grouped by their 'text' field. Each group is shuffled and
    then cut into three consecutive slices: the first int(n * train_ratio)
    items go to train, the next int(n * val_ratio) to val, and whatever is left
    to test. Because both counts are truncated, small groups may contribute
    nothing to val.

    Shuffling uses a private random.Random(seed) instance. This yields the same
    permutation as seeding the module-level generator with the same seed, but
    does not reset the global random state for the rest of the notebook.

    :param annotation_file: Input JSONL path; each non-blank line is a JSON
        object with at least a 'text' key.
    :param train_file: Output JSONL path for the training split.
    :param val_file: Output JSONL path for the validation split.
    :param test_file: Output JSONL path for the test split.
    :param train_ratio: Fraction of each class assigned to train.
    :param val_ratio: Fraction of each class assigned to val.
    :param seed: Seed for the shuffle.
    :raises ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    # Small tolerance so that ratios summing to exactly 1 are not rejected
    # because of floating-point rounding.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            "(got %r and %r)" % (train_ratio, val_ratio)
        )

    # Read annotations, ignoring blank lines (e.g. a trailing newline at EOF)
    with open(annotation_file, 'r') as f:
        annotations = [json.loads(line) for line in f if line.strip()]

    # Group annotations by class
    class_to_annotations = defaultdict(list)
    for annotation in annotations:
        class_to_annotations[annotation['text']].append(annotation)

    rng = random.Random(seed)

    train_annotations = []
    val_annotations = []
    test_annotations = []

    # Shuffle and slice each class in first-seen order
    for class_annotations in class_to_annotations.values():
        rng.shuffle(class_annotations)

        n_total = len(class_annotations)
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)

        train_annotations.extend(class_annotations[:n_train])
        val_annotations.extend(class_annotations[n_train:n_train + n_val])
        test_annotations.extend(class_annotations[n_train + n_val:])

    # Write splits to separate files
    for split_annotations, output_file in zip(
        [train_annotations, val_annotations, test_annotations],
        [train_file, val_file, test_file]
    ):
        with open(output_file, 'w') as f:
            for annotation in split_annotations:
                f.write(json.dumps(annotation) + '\n')

# The input annotation file and the three output split files all live
# directly under data_root.
data_root = "/home/bas06400/ntu"
annotation_file, train_file, val_file, test_file = (
    os.path.join(data_root, file_name)
    for file_name in (
        "annotations_rgb.jsonl",    # input: all annotations
        "annotations_train.jsonl",  # output: training split
        "annotations_val.jsonl",    # output: validation split
        "annotations_test.jsonl",   # output: test split
    )
)

# Run the stratified split with the default ratios and seed
split_dataset(annotation_file, train_file, val_file, test_file)

In [None]:
from VIP.src.datasets.dataset_video_retrieval import HDVILAVideoRetrievalDataset
# Configuration dictionary handed to HDVILAVideoRetrievalDataset below.
# - The dataset entries ("train_datasets", "val_datasets", "inference_datasets")
#   reference MSR-VTT files, while the vis_dir / anno_path passed to the
#   dataset constructor below point at NTU data.
#   NOTE(review): confirm which of these the dataset class actually reads.
# - "train_datasets" is a single dict, whereas "val_datasets" and
#   "inference_datasets" are lists of dicts.
# - The "clip_*" and "e2e_weights_path" entries repeat the values given to
#   VidCLIP in the first cells of the notebook ("e2e_weights_path" is still a
#   placeholder path).
# - This is a plain dict, not an EasyDict like the VidCLIP args.
#   NOTE(review): check whether the dataset class expects attribute access.
cfg = {
  "train_datasets": 
    {
      "name": "msrvtt-9k",
      "vis_format": "video",
      "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl",
      "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps"
    },
  "val_datasets": [

    {
      "name": "msrvtt-1ka",
      "vis_format": "video",
      "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl",
      "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps"
    }
  ],
  "inference_datasets": [
    {
      "name": "msrvtt-1ka",
      "vis_format": "video",
      "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl",
      "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps"
    }
  ],

  # Clip / frame sampling and input size settings
  "train_n_clips": 1,
  "train_num_frms": 12,
  "test_n_clips": 1,
  "test_num_frms": 12,
  "sample_rate": 0,
  "sample_jitter": 1,
  "video_res": [240, 320],
  "input_res": [224, 224],
  "max_txt_len": 50,

  # Model weights and CLIP-ViP vision settings
  "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint",
  "clip_weights": "openai/clip-vit-base-patch16",
  "clip_config": "openai/clip-vit-base-patch16",
  "clip_vision_additional_config": {
      "type": "ViP",
      "temporal_size": 12,
      "if_use_temporal_embed": 1,
      "logit_scale_init_value": 4.60,
      "add_cls_num": 3
  },

  # Batching / runtime settings (flags are given as 0/1 integers)
  "train_batch_size": 16,
  "test_batch_size": 16,
  "max_n_example_per_group": 1,
  "gradient_accumulation_steps": 1,
  "n_workers": 8,
  "pin_mem": 1,
  "fp16": 1,
  "amp_level": "O2",
  "seed": 42,

  # Optimizer, loss and schedule settings
  "optim": "adamw",
  "betas": [0.9, 0.98],
  "learning_rate": 1e-6,
  "weight_decay": 0.2,
  "lr_mul": 1,
  "lr_mul_prefix": "",
  "loss_config": {
    "loss_name": "NCELearnableTempLoss",
    "if_gather": 1
  },
  "warmup_ratio": 0.01,
  "decay": "cosine",
  "grad_norm": 1.0,

  # Training length, validation, checkpointing and logging settings
  "num_train_epochs": 100,
  "min_valid_steps": 1,
  "num_valid": 1,
  "only_valid_steps": 100,
  "save_steps_ratio": 0.9,
  "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_16",
  "if_tb_log": 0,
  "if_model_saver": 1,
  "if_log2file": 1,
  "dummy_data": 0
}


# Build the training dataset from the NTU RGB videos and the training split.
# NOTE(review): anno_path is relative, while the split cell writes
# annotations_train.jsonl under /home/bas06400/ntu; the relative path only
# resolves if the working directory is /home/bas06400 -- confirm.
# NOTE(review): the saved output of this cell shows import failures (missing
# 'horovod'; 'SingleFrameVideoDataset' not importable from multimodal_dataset),
# so the dataset construction below apparently never ran in the saved session.
vis_dir = '/home/bas06400/ntu/nturgb+d_rgb'
anno_path = 'ntu/annotations_train.jsonl'

# Positional arguments are cfg, vis_dir, anno_path; the parameter names of the
# project class are not visible here, so the call is left exactly as written.
dataset = HDVILAVideoRetrievalDataset(cfg, vis_dir, anno_path, vis_format='video', mode="train")

ModuleNotFoundError: No module named 'horovod'

ImportError: cannot import name 'SingleFrameVideoDataset' from 'multimodal_dataset' (/home/bas06400/Thesis/VIP/src/multimodal_dataset.py)