In [None]:
#GPU Memory Management and cache clear Code

In [1]:
import torch
import gc

class GPUMemoryManager:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    
    def clear_cache(self):
        """Clear GPU cache to free memory"""
        torch.cuda.empty_cache()
        gc.collect()
    
    def get_memory_usage(self):
        """Monitor GPU memory usage"""
        allocated = torch.cuda.memory_allocated() / 1e9
        cached = torch.cuda.memory_reserved() / 1e9
        return f"Allocated: {allocated:.2f}GB, Cached: {cached:.2f}GB"
    
    def optimize_batch_size(self, base_batch_size=32):
        """Automatically determine optimal batch size"""
        try:
            # Test with base batch size
            dummy_input = torch.randn(base_batch_size, 3, 224, 224).cuda()
            del dummy_input
            self.clear_cache()
            return base_batch_size
        except RuntimeError as e:
            if "out of memory" in str(e):
                return self.optimize_batch_size(base_batch_size // 2)
            else:
                raise e

# Usage
gpu_manager = GPUMemoryManager()
optimal_batch_size = gpu_manager.optimize_batch_size()
print(f"Optimal batch size: {optimal_batch_size}")

GPU: NVIDIA RTX 2000 Ada Generation
VRAM: 16.7 GB
Optimal batch size: 32


In [None]:
#Memory-Efficient Training Configuration

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import GradScaler, autocast
import gc

# Config for your RTX 2000 Ada
training_config = {
    'batch_size': 32,  # Updated from 16
    'accumulation_steps': 2,  # Effective batch size = 64, as before
    'mixed_precision': True,
    'gradient_checkpointing': True,
    'pin_memory': True,
    'num_workers': 4,
    'max_image_size': 224,
}

# Scaler for mixed precision
scaler = GradScaler(device='cuda')

def print_memory_stats(step_tag=""):
    allocated = torch.cuda.memory_allocated() / 1024**2
    reserved = torch.cuda.memory_reserved() / 1024**2
    print(f"[{step_tag}] GPU Memory: Allocated={allocated:.2f}MB | Reserved={reserved:.2f}MB")

# Simulated loss function (replace with real one)
def compute_loss(outputs, targets):
    return nn.functional.mse_loss(outputs, targets)

# Example training step
def train_step(model, batch, optimizer, step):
    model.train()

    if training_config['mixed_precision']:
        print(f"[Step {step}] Mixed precision enabled ✔")

    # Print memory before forward
    print_memory_stats(f"Step {step} - Before Forward")

    with autocast(dtype=torch.float16):
        outputs = model(batch['images'], batch['texts'])
        loss = compute_loss(outputs, batch['targets'])
        loss = loss / training_config['accumulation_steps']
        print(f"[Step {step}] Scaled Loss: {loss.item():.6f}")

    scaler.scale(loss).backward()

    # Print memory after backward
    print_memory_stats(f"Step {step} - After Backward")

    if (step + 1) % training_config['accumulation_steps'] == 0:
        print(f"[Step {step}] Performing optimizer step...")
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        # Force garbage collection (optional but useful for debugging memory)
        gc.collect()
        torch.cuda.empty_cache()

        print_memory_stats(f"Step {step} - After Optimizer Step")


In [9]:
#Efficient Data Loading

In [2]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms

# Memory-efficient transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Start small
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Optimized DataLoader for your system
def create_dataloader(dataset, batch_size=16):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,  # Good for your CPU
        pin_memory=True,  # Faster GPU transfer
        persistent_workers=True,  # Reuse workers
        prefetch_factor=2  # Prefetch batches
    )

In [3]:
## Auto-pick or manually assign GPU 

In [2]:
from utils.gpu_manager import GPUMemoryManager

# Auto-pick or manually assign GPU (e.g., 0 = RTX 2000 Ada, 1 = RTX 6000)
gpu_manager = GPUMemoryManager(preferred_device=None)
device = gpu_manager.get_device()

print(f"Running on device: {device}")


✅ Auto-selected GPU: 0
Running on device: cuda:0


In [3]:
#Load Configuration file

In [4]:
import yaml

with open('./configs/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

print(config)

{'project_name': 'Lightweight Cross-Modal Attention', 'preferred_device': None, 'seed': 42, 'datasets_path': './datasets/', 'coco_annotations': 'coco_subset/annotations/captions_train2017.json', 'coco_val_annotations': 'coco_subset/annotations/captions_val2017.json', 'vqa_annotations': 'vqa2/v2_mscoco_train2014_annotations.json', 'vqa_questions': 'vqa2/v2_OpenEnded_mscoco_train2014_questions.json', 'nocaps_json': 'nocaps/nocaps_test_public.json', 'flickr_csv': 'flickr30k_images_andCaptions/flickr30k_captions.csv', 'rank_k': 32, 'input_dim': 512, 'epochs': 10, 'learning_rate': 5e-05, 'batch_size': 32}


In [5]:
# Test the loader in your Jupyter Notebook

In [6]:
import yaml
from data_loader import CrossModalDatasetLoader

# Load config
with open('./configs/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Instantiate loader
loader = CrossModalDatasetLoader(config)

# Load COCO dataset
coco_dataset = loader.load_coco()

# Test one sample
sample = coco_dataset[0]

print("Image shape:", sample['image'].shape)
print("Tokenized caption:", sample['input_ids'])


Loaded 414113 valid COCO samples, skipped 177640 missing images.
Image shape: torch.Size([3, 224, 224])
Tokenized caption: tensor([ 101, 1037, 2200, 4550, 1998, 2092, 7429, 4064, 5723,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0])


In [6]:
#Quick Sanity Test before training

In [7]:
import yaml
import torch
from data_loader import CrossModalDatasetLoader
from models.multimodal_model import CrossModalModel
from utils.gpu_manager import GPUMemoryManager

# Load config
with open('./configs/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Setup device
gpu_manager = GPUMemoryManager(preferred_device=None)
device = gpu_manager.get_device()

# Load dataset
loader = CrossModalDatasetLoader(config)
dataset = loader.load_coco(split="train")

# Take a small batch (simulate dataloader)
sample = dataset[0]

# Build batch manually for test
batch = {
    'image': sample['image'].unsqueeze(0),        # add batch dimension (1, 3, 224, 224)
    'input_ids': sample['input_ids'].unsqueeze(0) # (1, 50)
}

# Load model
model = CrossModalModel(device=device, rank_k=config['rank_k']).to(device)

# Forward pass
scores = model(batch)

print("Output score:", scores)


✅ Auto-selected GPU: 0
Loaded 414113 valid samples, skipped 177640 missing images.
Output score: tensor([-0.1207], device='cuda:0', grad_fn=<SqueezeBackward1>)


In [8]:
# Training Code 

In [10]:
import yaml
from data_loader import CrossModalDatasetLoader
from models.multimodal_model import CrossModalModel
from trainer.train import Trainer
from utils.gpu_manager import GPUMemoryManager

# Load config
with open('./configs/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# GPU manager
gpu_manager = GPUMemoryManager(preferred_device=None)
device = gpu_manager.get_device()

# Dataset
loader = CrossModalDatasetLoader(config)
dataset = loader.load_coco(split="train")

# Model
model = CrossModalModel(device=device, rank_k=config['rank_k']).to(device)

# Trainer
trainer = Trainer(model, dataset, config, device)
trainer.train()


✅ Auto-selected GPU: 0
Loaded 414113 valid samples, skipped 177640 missing images.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [1] finished. Average Loss: 0.0065


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [2] finished. Average Loss: 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [3] finished. Average Loss: 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [4] finished. Average Loss: 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [5] finished. Average Loss: 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [6] finished. Average Loss: 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [7] finished. Average Loss: 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [8] finished. Average Loss: 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [9] finished. Average Loss: 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [10] finished. Average Loss: 0.0000




In [None]:
# Evaluvation

In [None]:
# Retrieval Evaluation (COCO)

In [6]:
# Cell 1: Imports
import torch
import yaml
from data_loader import CrossModalDatasetLoader
from models.multimodal_model import CrossModalModel
from evaluation.retrieval_evaluator import RetrievalEvaluator
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Cell 2: Load Config
with open('./configs/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Cell 3: Set Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Cell 4: Load Dataset
loader = CrossModalDatasetLoader(config)
coco_dataset = loader.load_coco(split="val")

# Cell 5: Load Model (Make sure you trained your model first)
model = CrossModalModel(device=device, rank_k=config['rank_k']).to(device)
model.eval()

# Cell 6: Run Retrieval Evaluation
retrieval_eval = RetrievalEvaluator(model, coco_dataset, device, save_dir="./results/retrieval/coco")
retrieval_eval.evaluate()


Device: cuda
Loaded 25014 valid samples, skipped 0 missing images.


Encoding dataset: 100%|██████████████████| 25014/25014 [03:50<00:00, 108.72it/s]


{'Recall@1': 3.997761253697929e-05, 'Recall@5': 0.00019988806268489645, 'Recall@10': 0.0003997761253697929}


In [8]:
import torch
from models.multimodal_model import CrossModalModel

# Load config again
import yaml
with open('./configs/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild model
model = CrossModalModel(device=device, rank_k=config['rank_k']).to(device)



In [None]:
# Load Flickr30K dataset
flickr_dataset = loader.load_flickr30k()

# Import evaluator (only if not already imported)
from evaluation.retrieval_evaluator import RetrievalEvaluator

# Run retrieval evaluation for Flickr30K
retrieval_eval = RetrievalEvaluator(model, flickr_dataset, device, save_dir="./results/retrieval/flickr")
retrieval_eval.evaluate()


✅ Flickr30K Columns: Index(['image_file', 'caption'], dtype='object')


Encoding dataset: 100%|████████████████| 155070/155070 [21:05<00:00, 122.57it/s]


In [None]:
# Captioning Evaluation (NoCaps)
# Generated captions JSON: "./results/generated_captions.json"
# Ground truth JSON: "./datasets/nocaps/ground_truth.json"

In [2]:
# Load both JSONs
with open("./datasets/nocaps/ground_truth_coco_val2017.json") as f:
    gt_data = json.load(f)
with open("./results/generated_captions_coco_val2017.json") as f:
    gen_data = json.load(f)

# Sample 5 keys from both sides
print("Ground Truth Sample Keys:", list(gt_data.keys())[:5])
print("Generated Captions Sample Keys:", list(gen_data.keys())[:5])



Ground Truth Sample Keys: ['179765', '190236', '331352', '517069', '182417']
Generated Captions Sample Keys: ['COCO_val2017_000000179765.jpg', 'COCO_val2017_000000190236.jpg', 'COCO_val2017_000000331352.jpg', 'COCO_val2017_000000517069.jpg', 'COCO_val2017_000000182417.jpg']


In [1]:
# (1) Generate ground truth (already done)
%run ./utils/generate_coco_val2017_groundtruth.py

# (2) Generate captions (new updated code above)
%run ./inference/caption_generator_coco_val2017.py

from evaluation.captioning_evaluator import CaptioningEvaluator
import os

# Disable tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Provide the paths
caption_eval = CaptioningEvaluator(
    ground_truth_path="./datasets/nocaps/ground_truth_coco_val2017.json",
    generated_captions_path="./results/generated_captions_coco_val2017.json",
    save_dir="./results/captioning/"
)

# Run evaluation
caption_eval.evaluate()



✅ Ground truth file created at: ./datasets/nocaps/ground_truth_coco_val2017.json
✅ Auto-selected GPU: 0


Generating captions: 100%|██████████████████| 5000/5000 [02:38<00:00, 31.49it/s]


✅ Generated captions saved at ./results/generated_captions_coco_val2017.json


[nltk_data] Downloading package wordnet to /home/loom/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/loom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/loom/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Total generated captions: 5000
Total ground truth captions: 5000
Common samples found for evaluation: 5000


Evaluating Samples: 100%|██████████████████| 5000/5000 [00:06<00:00, 791.79it/s]


{'BLEU': 1.0, 'METEOR': 0.9995758906089781, 'CIDEr': 10.0}


In [None]:
#VQA Evaluation (VQAv2 or OK-VQA)
#Assuming you already have:
#Ground truth: "./datasets/vqa2/ground_truth.json"
#Model predictions: "./results/vqa_predictions.json"

In [None]:
from evaluation.vqa_evaluator import VQAEvaluator
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Create evaluator
vqa_eval = VQAEvaluator(
    ground_truth_path="./datasets/vqa2/ground_truth.json",
    predictions_path="./results/vqa_predictions.json",
    save_dir="./results/vqa/"
)

# Run evaluation
vqa_eval.evaluate()

In [None]:
#Efficiency Evaluation

In [None]:
from evaluation.efficiency_evaluator import EfficiencyEvaluator
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Create evaluator
eff_eval = EfficiencyEvaluator(model, save_dir="./results/efficiency/")

# Run evaluation
eff_eval.evaluate()

In [None]:
#Run Full Ablation Study
#This will automatically launch 5 full experiments back-to-back

In [None]:
from automation.ablation_runner import AblationRunner

runner = AblationRunner()
runner.run_ablation(rank_list=[16, 32, 64, 128, 256])