In [1]:
import random
import torch
from multimodal_dataset import SingleFrameVideoDataset
from torch.utils.data import random_split
from torch.utils.data import DataLoader

# Set the seed for reproducibility
seed = 42
random.seed(seed)  # Seed for Python's random module
torch.manual_seed(seed)  # Seed for PyTorch random number generators

data_root = '/net/polaris/storage/deeplearning/ntu'
data_list = '/home/bas06400/Thesis/rgb_ir_dataset.txt'
# Initialize the single frame dataset with the RGB and IR modalities
single_frame_data = SingleFrameVideoDataset(data_list, data_root, ['rgb', 'ir'], use_advanced_processing=True)

# Sanity-check one sample: per-modality tensor shapes and the raw label.
# The sample is fetched once; indexing the dataset repeatedly would load and
# process the same item several times.
first_sample, first_label = single_frame_data[0]
print(first_sample['rgb'].shape, first_sample['ir'].shape, first_label)

# Calculate lengths of splits (80% / 10% / remainder so the three sum to total_len)
total_len = len(single_frame_data)
train_len = int(0.8 * total_len)
val_len = int(0.1 * total_len)
test_len = total_len - train_len - val_len

# Split the single frame dataset.
# A dedicated, explicitly seeded generator keeps the split identical across runs
# regardless of how much of the global RNG stream was consumed before this point
# (e.g. by the sample fetch above or by re-running cells).
split_generator = torch.Generator().manual_seed(seed)
train_data, val_data, test_data = random_split(
    single_frame_data,
    [train_len, val_len, test_len],
    generator=split_generator,
)

  mean = torch.tensor(self.mean).view(-1, 1, 1)  # Adjust shape for broadcasting
  std = torch.tensor(self.std).view(-1, 1, 1)


torch.Size([3, 224, 224]) torch.Size([1, 224, 224]) 1


In [2]:
def custom_collate_fn(batch):
    """
    Collate `(data, label)` samples into batched per-modality tensors.

    Args:
    - batch (list): Non-empty list of `(data, label)` pairs, where `data` is a
      dict mapping a modality name to a tensor. Every sample must use the
      modalities of the first sample, and tensors of one modality must share
      a shape so they can be stacked.

    Returns:
    - collated_data (dict): Modality name -> tensor stacked along a new
      leading batch dimension.
    - collated_labels (tensor): Labels shifted down by one (`label - 1`),
      presumably turning 1-based dataset labels into 0-based class indices.
    """
    # The modalities of the first sample define the keys of the result
    frames_by_modality = {modality: [] for modality in batch[0][0]}
    collated_labels = []

    for data, label in batch:
        collated_labels.append(label - 1)
        for modality, frames in data.items():
            frames_by_modality[modality].append(frames)

    # Build a fresh dict instead of overwriting entries while iterating
    collated_data = {
        modality: torch.stack(frames_list)
        for modality, frames_list in frames_by_modality.items()
    }

    return collated_data, torch.tensor(collated_labels)


# Loader settings
batch_size = 64
shuffle = True  # applied to the training loader only
num_workers = 10
pin_memory = True

# Arguments that are the same for the train, validation and test loaders
common_loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
    collate_fn=custom_collate_fn,
)

# Training loader: samples are reshuffled every epoch
train_loader = DataLoader(train_data, shuffle=shuffle, **common_loader_kwargs)

# Validation and test loaders: fixed order
val_loader = DataLoader(val_data, shuffle=False, **common_loader_kwargs)
test_loader = DataLoader(test_data, shuffle=False, **common_loader_kwargs)

In [3]:
from transformers import CLIPModel, CLIPTokenizer
from transformers.models.clip.configuration_clip import CLIPConfig

# Pre-trained CLIP checkpoint; the tokenizer is loaded from the same name so
# it matches the model's text encoder
clip_model_name = "openai/clip-vit-base-patch16"
tokenizer = CLIPTokenizer.from_pretrained(clip_model_name)

# Load the weights, then move the model to the GPU used for evaluation
model = CLIPModel.from_pretrained(clip_model_name)
model = model.to('cuda:3')

# One text prompt per action class (60 entries). The position in this list is
# the class index: the evaluation takes the argmax over these prompts and
# compares it with the collated labels (dataset label - 1), so entry i must
# describe the class whose dataset label is i + 1.
# NOTE(review): presumably the NTU RGB+D 60 action names in their official
# order (data_root points at an 'ntu' directory) — confirm against the label
# file, since a wrong ordering would silently ruin accuracy.
text_descriptions = [
    "drink water.",
    "eat meal/snack.",
    "brushing teeth.",
    "brushing hair.",
    "drop.",
    "pickup.",
    "throw.",
    "sitting down.",
    "standing up (from sitting position).",
    "clapping.",
    "reading.",
    "writing.",
    "tear up paper.",
    "wear jacket.",
    "take off jacket.",
    "wear a shoe.",
    "take off a shoe.",
    "wear on glasses.",
    "take off glasses.",
    "put on a hat/cap.",
    "take off a hat/cap.",
    "cheer up.",
    "hand waving.",
    "kicking something.",
    "reach into pocket.",
    "hopping (one foot jumping).",
    "jump up.",
    "make a phone call/answer phone.",
    "playing with phone/tablet.",
    "typing on a keyboard.",
    "pointing to something with finger.",
    "taking a selfie.",
    "check time (from watch).",
    "rub two hands together.",
    "nod head/bow.",
    "shake head.",
    "wipe face.",
    "salute.",
    "put the palms together.",
    "cross hands in front (say stop).",
    "sneeze/cough.",
    "staggering.",
    "falling.",
    "touch head (headache).",
    "touch chest (stomachache/heart pain).",
    "touch back (backache).",
    "touch neck (neckache).",
    "nausea or vomiting condition.",
    "use a fan (with hand or paper)/feeling warm.",
    "punching/slapping other person.",
    "kicking other person.",
    "pushing other person.",
    "pat on back of other person.",
    "point finger at the other person.",
    "hugging other person.",
    "giving something to other person.",
    "touch other person's pocket.",
    "handshaking.",
    "walking towards each other.",
    "walking apart from each other."
]

# Tokenize all class prompts in one padded batch and move it to the model's GPU
text_inputs = tokenizer(
    text_descriptions,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
text_inputs = text_inputs.to('cuda:3')

# Encode the prompts with CLIP's text branch; no gradients are needed here
with torch.no_grad():
    text_embeddings = text_outputs = model.get_text_features(**text_inputs)

print(text_embeddings.shape)



torch.Size([60, 512])


In [4]:

import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

def batch_cosine_similarity(x1, x2, eps=1e-8):
    """
    Pairwise cosine similarity between every row of `x1` and every row of `x2`.

    Args:
    - x1 (tensor): Shape (batch_size, embed_dim).
    - x2 (tensor): Shape (num_text_descriptions, embed_dim).
    - eps (float): Lower bound on the product of the two norms. It prevents a
      division by zero (and the resulting NaN) when a row has zero norm; such
      a row gets similarity 0 instead.

    Returns:
    - tensor of shape (batch_size, num_text_descriptions) where entry (i, j)
      is the cosine similarity between `x1[i]` and `x2[j]`.
    """
    dot = x1 @ x2.T
    norm1 = torch.norm(x1, p=2, dim=1).unsqueeze(1)  # (batch_size, 1)
    norm2 = torch.norm(x2, p=2, dim=1).unsqueeze(0)  # (1, num_text_descriptions)
    # Broadcasting yields the (batch_size, num_text_descriptions) norm products
    return dot / (norm1 * norm2).clamp_min(eps)

correct_rgb_predictions = 0
total_samples = 0

# Evaluate on whatever device the model lives on, so inputs, labels and text
# embeddings are guaranteed to match it (no repeated hard-coded device string)
eval_device = next(model.parameters()).device
text_embeddings = text_embeddings.to(eval_device)

# Switch to inference behaviour once, before the loop
model.eval()

# test_loader yields (dict of modality -> batched tensor, labels); only the
# 'rgb' modality is evaluated here.
# no_grad: this is pure inference, so skip building autograd graphs — that
# saves memory and time on every batch.
with torch.no_grad():
    for batch_data, batch_labels in tqdm(test_loader, desc="Evaluating", ncols=100):
        # Move data to the model's device
        rgb_data = batch_data['rgb'].to(eval_device)
        batch_labels = batch_labels.to(eval_device)

        # Extract embeddings for the RGB data from the model
        rgb_emb = model.get_image_features(rgb_data)

        # Cosine similarity of every image embedding to every class prompt
        similarities_rgb = batch_cosine_similarity(rgb_emb, text_embeddings)

        # Predicted class = index of the most similar prompt
        predicted_class_rgb = torch.argmax(similarities_rgb, dim=1)

        # Update correct predictions and total samples counts
        correct_rgb_predictions += (predicted_class_rgb == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

# Compute accuracy; an empty loader yields NaN instead of a ZeroDivisionError
accuracy_rgb = correct_rgb_predictions / total_samples if total_samples else float('nan')
print(f"RGB Accuracy: {accuracy_rgb * 100:.2f}%")

Evaluating: 100%|███████████████████████████████████████████████████| 89/89 [06:38<00:00,  4.48s/it]

RGB Accuracy: 2.36%



