In [None]:
# Mount Google Drive into the Colab runtime so that files under
# /content/drive (e.g. the requirements file installed in the next cell)
# are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -r /content/drive/MyDrive/ML/requirements.txt

Collecting tqdm==4.66.1 (from -r /content/drive/MyDrive/ML/requirements.txt (line 1))
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.14.6 (from -r /content/drive/MyDrive/ML/requirements.txt (line 2))
  Downloading datasets-2.14.6-py3-none-any.whl.metadata (19 kB)
Collecting transformers==4.35.2 (from -r /content/drive/MyDrive/ML/requirements.txt (line 3))
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==2.1.0 (from -r /content/drive/MyDrive/ML/requirements.txt (line 4))
  Downloading torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collec

In [None]:
import os
import requests
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import load_dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
import numpy as np
from torch.nn.utils import clip_grad_norm_

In [None]:
# Pretrained BLIP VQA weights plus the processor that prepares its inputs.
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
torch.cuda.empty_cache()

# Fixed seed and deterministic cuDNN kernels (autotuner switched off).
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Overwrite the drop probability of every Dropout layer in the model.
for dropout_layer in (m for m in model.modules() if isinstance(m, torch.nn.Dropout)):
    dropout_layer.p = 0.3



In [None]:
from torchvision import transforms
from PIL import Image
import os
import json
import torch

class LandmarkVQADataset(torch.utils.data.Dataset):
    """VQA dataset pairing every landmark image with its folder's shared questions.

    ``data_dir`` must contain one sub-directory per landmark. Each
    sub-directory holds a ``data.json`` file whose ``"questions"`` entry is a
    list of ``{"question": ..., "answer": ...}`` items, plus the image files.
    Every image is combined with every question of its folder, so a folder
    with ``I`` images and ``Q`` questions contributes ``I * Q`` samples.

    Args:
        data_dir: Root directory that holds the landmark sub-directories.
        processor: Object called as ``processor(image, question, ...)`` to
            build the model inputs; ``processor.tokenizer.encode`` is used to
            tokenize the answer.
        max_answer_length: Answers are padded/truncated to this many tokens.
        augment: When True (the default, matching the previous behaviour) a
            random horizontal flip and colour jitter are applied to each image
            in ``__getitem__``. Pass False for deterministic samples, e.g.
            for evaluation data.

    Raises:
        FileNotFoundError: If a landmark sub-directory has no ``data.json``.
    """

    # Compared against the lower-cased file name, so '.JPG' etc. also match.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, data_dir, processor, max_answer_length=20, augment=True):
        self.data = []
        self.processor = processor
        self.max_answer_length = max_answer_length

        # The torchvision pipeline is only built when augmentation is wanted.
        if augment:
            self.transform = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.3),
                transforms.ColorJitter(brightness=0.2, contrast=0.2)
            ])
        else:
            self.transform = None

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order stable so index-based splits are reproducible.
        for landmark_dir in sorted(os.listdir(data_dir)):
            landmark_path = os.path.join(data_dir, landmark_dir)
            if not os.path.isdir(landmark_path):
                continue

            with open(os.path.join(landmark_path, 'data.json'), 'r', encoding='utf-8') as f:
                questions_data = json.load(f)

            for image_file in sorted(os.listdir(landmark_path)):
                if not image_file.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                image_path = os.path.join(landmark_path, image_file)
                for qa in questions_data['questions']:
                    self.data.append({
                        "image_path": image_path,
                        "question": qa['question'],
                        "answer": qa['answer']
                    })

    def __len__(self):
        """Return the number of (image, question, answer) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load, optionally augment and encode the sample at position ``idx``.

        Returns:
            The processor's encoding with the leading batch axis removed from
            every tensor and a ``"labels"`` entry holding the tokenized
            answer, or ``None`` when the image cannot be loaded (the error is
            printed).
            NOTE(review): the default DataLoader collate function cannot
            batch ``None``; an unreadable image will therefore still abort
            iteration unless a filtering ``collate_fn`` is supplied.
        """
        data_point = self.data[idx]
        try:
            # The context manager releases the file handle once the RGB copy exists.
            with Image.open(data_point["image_path"]) as raw_image:
                image = raw_image.convert("RGB")
            if self.transform is not None:
                image = self.transform(image)
        except Exception as e:
            print(f"Error loading image {data_point['image_path']}: {e}")
            return None

        question = data_point["question"]
        answer = data_point["answer"]

        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        labels = self.processor.tokenizer.encode(answer, max_length=self.max_answer_length, padding="max_length", truncation=True, return_tensors="pt")

        # Drop only the leading batch axis added by return_tensors="pt".
        # A bare squeeze() would also remove genuine size-1 axes, e.g. the
        # token axis when max_answer_length == 1.
        for k, v in list(encoding.items()):
            encoding[k] = v.squeeze(0)
        encoding["labels"] = labels.squeeze(0)

        return encoding


In [None]:
# Unpack the training archive into the current working directory.
# NOTE(review): the recorded output below shows the archive was not found at
# this path, and the dataset cell reads from
# '/teamspace/studios/this_studio/train_data_fr' - confirm which archive is
# meant to provide that folder.
!unzip /teamspace/studios/this_studio/train_data_small.zip

unzip:  cannot find or open /teamspace/studios/this_studio/train_data_small.zip, /teamspace/studios/this_studio/train_data_small.zip.zip or /teamspace/studios/this_studio/train_data_small.zip.ZIP.


In [None]:
import json
from torchvision.transforms import RandomHorizontalFlip, ColorJitter
data_dir = '/teamspace/studios/this_studio/train_data_fr'
full_dataset = LandmarkVQADataset(data_dir=data_dir, processor=processor)

# Random (not stratified) 90/10 split over individual (image, question) samples.
# A dedicated, seeded generator makes the split independent of whatever
# consumed the global RNG in earlier cells.
# NOTE(review): because the split is per sample, the same image can appear in
# both subsets with different questions, and both subsets share full_dataset
# and therefore its random augmentations.
train_ratio = 0.9
split_generator = torch.Generator().manual_seed(42)
indices = torch.randperm(len(full_dataset), generator=split_generator)
train_size = int(train_ratio * len(full_dataset))
# Plain Python ints, so the dataset is indexed with int rather than 0-dim tensors.
train_indices = indices[:train_size].tolist()
valid_indices = indices[train_size:].tolist()

train_dataset = torch.utils.data.Subset(full_dataset, train_indices)
valid_dataset = torch.utils.data.Subset(full_dataset, valid_indices)

# Reduced batch size for better stability
batch_size = 2
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)

print("Training set size: {}, Validation set size: {}".format(len(train_dataset), len(valid_dataset)))


Training set size: 19954, Validation set size: 2218


In [None]:
from copy import deepcopy
from transformers import get_linear_schedule_with_warmup

# --- Run length and early-stopping bookkeeping ---
num_epochs = 50
patience = 15  # Increased patience
early_stopping_hook = 0
min_eval_loss = float("inf")
best_f1 = 0.0
tracking_information = []

# --- Optimisation: AdamW with mixed-precision gradient scaling ---
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scaler = torch.cuda.amp.GradScaler()

# --- Learning-rate schedule: linear warm-up over the first 10% of all
# optimisation steps, followed by the linear schedule's decay ---
num_training_steps = num_epochs * len(train_dataloader)
num_warmup_steps = num_training_steps // 10
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# --- Exponential moving average copy of the model, used for evaluation ---
ema_model = deepcopy(model)
ema_decay = 0.999

def update_ema_model(model, ema_model, decay):
    """Blend the current weights of ``model`` into ``ema_model`` in place.

    Each EMA parameter becomes ``decay * ema + (1 - decay) * current``.
    Buffers (for example normalisation running statistics) are not averaged
    but copied verbatim, so the EMA copy never evaluates with stale buffers.

    Args:
        model: Module that is being trained; it is only read.
        ema_model: Module with the same structure as ``model``; updated in place.
        decay: Weight kept from the previous EMA value, in ``[0, 1]``.
    """
    with torch.no_grad():
        for ema_param, param in zip(ema_model.parameters(), model.parameters()):
            ema_param.mul_(decay).add_(param.detach(), alpha=1 - decay)
        # Buffers may be integer tensors (e.g. step counters), so copy instead of blend.
        for ema_buffer, buffer in zip(ema_model.buffers(), model.buffers()):
            ema_buffer.copy_(buffer)


In [None]:
from sklearn.metrics import accuracy_score, f1_score
# Fine-tuning loop: one training pass over `model`, then validation of the
# EMA copy, best-model saving, early stopping and periodic checkpoints.
checkpoint_dir="/teamspace/studios/this_studio/checkpoints"
for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()

    # Training loop
    for step, batch in enumerate(tqdm(train_dataloader, desc=f'Training Epoch {epoch+1}')):
        input_ids = batch.pop('input_ids').to(device)
        pixel_values = batch.pop('pixel_values').to(device)
        attention_mask = batch.pop('attention_mask').to(device)
        labels = batch.pop('labels').to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass under float16 autocast; the loss is scaled before
        # backward so small gradients do not underflow.
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        scaler.scale(loss).backward()

        # Gradient clipping (gradients must be unscaled first so the norm
        # threshold applies to their true magnitude)
        scaler.unscale_(optimizer)
        clip_grad_norm_(model.parameters(), max_norm=1.0)

        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # Update EMA model
        update_ema_model(model, ema_model, ema_decay)

        epoch_loss += loss.item()

    avg_train_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} Training Loss: {avg_train_loss:.4f}")

    # Validation loop with EMA model
    ema_model.eval()
    eval_loss, all_preds, all_labels = 0, [], []

    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc=f'Validating Epoch {epoch+1}'):
            input_ids = batch.pop('input_ids').to(device)
            pixel_values = batch.pop('pixel_values').to(device)
            attention_mask = batch.pop('attention_mask').to(device)
            labels = batch.pop('labels').to(device)

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                outputs = ema_model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, labels=labels)

            eval_loss += outputs.loss.item()

            # Free-running generation; predictions and references are both
            # decoded to plain strings for the metrics below.
            generated_ids = ema_model.generate(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, max_length=20)
            predictions = processor.batch_decode(generated_ids, skip_special_tokens=True)
            true_labels = processor.batch_decode(labels, skip_special_tokens=True)

            all_preds.extend(predictions)
            all_labels.extend(true_labels)

    avg_eval_loss = eval_loss / len(valid_dataloader)
    # Metrics are computed on the decoded answer strings: accuracy is exact
    # string match, and the F1 call receives the same string lists.
    exact_match_acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Epoch {epoch+1} - Validation Loss: {avg_eval_loss:.4f} - Accuracy: {exact_match_acc:.4f} - F1 Score: {f1:.4f}")

    # Save best model based on F1 score
    if f1 > best_f1:
        best_f1 = f1
        best_checkpoint_dir = os.path.join(checkpoint_dir, "best_model")
        os.makedirs(best_checkpoint_dir, exist_ok=True)
        ema_model.save_pretrained(best_checkpoint_dir)
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        # Stop once `patience` consecutive epochs brought no F1 improvement
        # (the previous `>` comparison allowed one extra epoch).
        if early_stopping_hook >= patience:
            print("Early stopping triggered.")
            break

    # Save periodic checkpoints
    if (epoch + 1) % 5 == 0:
        epoch_checkpoint_dir = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch+1}")
        os.makedirs(epoch_checkpoint_dir, exist_ok=True)
        ema_model.save_pretrained(epoch_checkpoint_dir)
        # The optimizer/scheduler state belongs to the raw (non-EMA) weights,
        # so those are stored as well; without them the run cannot be resumed.
        model.save_pretrained(os.path.join(epoch_checkpoint_dir, "raw_model"))
        torch.save(optimizer.state_dict(), os.path.join(epoch_checkpoint_dir, 'optimizer.pt'))
        torch.save(scheduler.state_dict(), os.path.join(epoch_checkpoint_dir, 'scheduler.pt'))

    torch.cuda.empty_cache()

Training Epoch 1: 100%|██████████| 9977/9977 [1:21:18<00:00,  2.05it/s]


Epoch 1 Training Loss: 4.2120


Validating Epoch 1: 100%|██████████| 1109/1109 [08:31<00:00,  2.17it/s]


Epoch 1 - Validation Loss: 1.9669 - Accuracy: 0.0712 - F1 Score: 0.0521


Training Epoch 2: 100%|██████████| 9977/9977 [1:21:20<00:00,  2.04it/s]


Epoch 2 Training Loss: 1.7252


Validating Epoch 2: 100%|██████████| 1109/1109 [07:56<00:00,  2.33it/s]


Epoch 2 - Validation Loss: 1.5100 - Accuracy: 0.3224 - F1 Score: 0.3033


Training Epoch 3: 100%|██████████| 9977/9977 [1:21:13<00:00,  2.05it/s]


Epoch 3 Training Loss: 1.4603


Validating Epoch 3: 100%|██████████| 1109/1109 [08:01<00:00,  2.30it/s]


Epoch 3 - Validation Loss: 1.4159 - Accuracy: 0.7435 - F1 Score: 0.7378


Training Epoch 4: 100%|██████████| 9977/9977 [1:21:14<00:00,  2.05it/s]


Epoch 4 Training Loss: 1.3994


Validating Epoch 4: 100%|██████████| 1109/1109 [08:03<00:00,  2.29it/s]


Epoch 4 - Validation Loss: 1.3960 - Accuracy: 0.8805 - F1 Score: 0.8734


Training Epoch 5: 100%|██████████| 9977/9977 [1:21:12<00:00,  2.05it/s]


Epoch 5 Training Loss: 1.3847


Validating Epoch 5: 100%|██████████| 1109/1109 [08:02<00:00,  2.26it/s]


Epoch 5 - Validation Loss: 1.3914 - Accuracy: 0.9045 - F1 Score: 0.9080


Training Epoch 6: 100%|██████████| 9977/9977 [1:21:10<00:00,  2.28it/s]


Epoch 6 Training Loss: 1.3750


Validating Epoch 6: 100%|██████████| 1109/1109 [08:01<00:00,  2.33it/s]


Epoch 6 - Validation Loss: 1.3885 - Accuracy: 0.9123 - F1 Score: 0.9080


Training Epoch 7: 100%|██████████| 9977/9977 [1:21:13<00:00,  2.02it/s]


Epoch 7 Training Loss: 1.3700


Validating Epoch 7: 100%|██████████| 1109/1109 [08:12<00:00,  2.30it/s]


Epoch 7 - Validation Loss: 1.3850 - Accuracy: 0.9190 - F1 Score: 0.9155


Training Epoch 8: 100%|██████████| 9977/9977 [1:21:18<00:00,  2.08it/s]


Epoch 8 Training Loss: 1.3650


Validating Epoch 8: 100%|██████████| 1109/1109 [08:05<00:00,  2.28it/s]


Epoch 8 - Validation Loss: 1.3820 - Accuracy: 0.9255 - F1 Score: 0.9210


Training Epoch 9: 100%|██████████| 9977/9977 [1:21:12<00:00,  2.15it/s]


Epoch 9 Training Loss: 1.3600


Validating Epoch 9: 100%|██████████| 1109/1109 [08:03<00:00,  2.19it/s]


Epoch 9 - Validation Loss: 1.3795 - Accuracy: 0.9300 - F1 Score: 0.9265


Training Epoch 10: 100%|██████████| 9977/9977 [1:21:24<00:00,  2.22it/s]


Epoch 10 Training Loss: 1.3550


Validating Epoch 10: 100%|██████████| 1109/1109 [08:14<00:00,  2.28it/s]


Epoch 10 - Validation Loss: 1.3770 - Accuracy: 0.9315 - F1 Score: 0.9280


Training Epoch 11: 100%|██████████| 9977/9977 [1:21:42<00:00,  2.34it/s]


Epoch 11 Training Loss: 1.3530


Validating Epoch 11: 100%|██████████| 1109/1109 [08:03<00:00,  2.20it/s]


Epoch 11 - Validation Loss: 1.3765 - Accuracy: 0.9290 - F1 Score: 0.9260


Training Epoch 12: 100%|██████████| 9977/9977 [1:21:30<00:00,  2.14it/s]


Epoch 12 Training Loss: 1.3510


Validating Epoch 12: 100%|██████████| 1109/1109 [08:23<00:00,  2.29it/s]


Epoch 12 - Validation Loss: 1.3760 - Accuracy: 0.9340 - F1 Score: 0.9305


Training Epoch 13: 100%|██████████| 9977/9977 [1:21:34<00:00,  2.09it/s]


Epoch 13 Training Loss: 1.3500


Validating Epoch 13: 100%|██████████| 1109/1109 [08:01<00:00,  2.31it/s]


Epoch 13 - Validation Loss: 1.3755 - Accuracy: 0.9320 - F1 Score: 0.9290


Training Epoch 14: 100%|██████████| 9977/9977 [1:21:40<00:00,  2.16it/s]


Epoch 14 Training Loss: 1.3480


Validating Epoch 14: 100%|██████████| 1109/1109 [08:23<00:00,  2.29it/s]


Epoch 14 - Validation Loss: 1.3748 - Accuracy: 0.9410 - F1 Score: 0.9390


Training Epoch 15: 100%|██████████| 9977/9977 [1:21:20<00:00,  2.05it/s]


Epoch 15 Training Loss: 1.3465


Validating Epoch 15: 100%|██████████| 1109/1109 [08:22<00:00,  2.23it/s]


Epoch 15 - Validation Loss: 1.3740 - Accuracy: 0.9355 - F1 Score: 0.9325


Training Epoch 16: 100%|██████████| 9977/9977 [1:21:16<00:00,  2.08it/s]


Epoch 16 Training Loss: 1.3455


Validating Epoch 16: 100%|██████████| 1109/1109 [08:33<00:00,  2.39it/s]


Epoch 16 - Validation Loss: 1.3732 - Accuracy: 0.9390 - F1 Score: 0.9360


Training Epoch 17: 100%|██████████| 9977/9977 [1:21:25<00:00,  2.15it/s]


Epoch 17 Training Loss: 1.3440


Validating Epoch 17: 100%|██████████| 1109/1109 [08:32<00:00,  2.12it/s]


Epoch 17 - Validation Loss: 1.3725 - Accuracy: 0.9370 - F1 Score: 0.9340


Training Epoch 18: 100%|██████████| 9977/9977 [1:21:34<00:00,  2.16it/s]


Epoch 18 Training Loss: 1.3430


Validating Epoch 18: 100%|██████████| 1109/1109 [08:13<00:00,  2.34it/s]


Epoch 18 - Validation Loss: 1.3718 - Accuracy: 0.9410 - F1 Score: 0.9385


Training Epoch 19: 100%|██████████| 9977/9977 [1:21:45<00:00,  2.02it/s]


Epoch 19 Training Loss: 1.3420


Validating Epoch 19: 100%|██████████| 1109/1109 [08:02<00:00,  2.32it/s]


Epoch 19 - Validation Loss: 1.3710 - Accuracy: 0.9405 - F1 Score: 0.9380


Training Epoch 20: 100%|██████████| 9977/9977 [1:21:23<00:00,  2.13it/s]


Epoch 20 Training Loss: 1.3410


Validating Epoch 20: 100%|██████████| 1109/1109 [08:03<00:00,  2.29it/s]


Epoch 20 - Validation Loss: 1.3705 - Accuracy: 0.9396 - F1 Score: 0.9380


Early stopping triggered.

In [None]:
from transformers import ViltProcessor, ViltForQuestionAnswering
from transformers import BlipProcessor, BlipForQuestionAnswering
import requests
from PIL import Image
import json, os, csv
import logging
from tqdm import tqdm
import torch

In [None]:
# The processor comes from the public base checkpoint; the model weights are
# read from the Drive folder (presumably the fine-tuned export - confirm).
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("/content/drive/MyDrive/ML/Model")
model = model.to("cuda")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Run the model over every test sample, score exact-match accuracy where a
# ground-truth answer exists, and export all predictions to a CSV file.
results = []  # (image_id, generated answer, correct answer) per sample
correct_count = 0  # To count correct predictions
total_count = 0  # Total number of samples that have a ground-truth answer

test_data_dir = "/content/drive/MyDrive/ML/test_data"
# Keep only sample folders that actually contain a data.json, so stray entries
# (e.g. notebook checkpoint folders or loose files) do not abort the run.
# Sorting makes the processing and CSV row order deterministic.
samples = sorted(
    name for name in os.listdir(test_data_dir)
    if os.path.isfile(os.path.join(test_data_dir, name, "data.json"))
)
for filename in tqdm(samples, desc="Processing"):
    sample_path = os.path.join(test_data_dir, filename)

    # Read JSON data
    json_path = os.path.join(sample_path, "data.json")
    with open(json_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)
    question = data["question"]
    # A missing or null answer is treated as "no ground truth available".
    correct_answer = (data.get("answer") or "").lower()
    image_id = data["id"]

    # Read the corresponding image (named after the sample id)
    image_path = os.path.join(sample_path, f"{image_id}.jpg")
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Prepare inputs
    encoding = processor(image, question, return_tensors="pt").to("cuda")

    # Generate answer; questions of ten or more words get a wider beam
    with torch.no_grad():
        num_beams = 3 if len(question.split()) < 10 else 5
        out = model.generate(**encoding, max_length=40, num_beams=num_beams)
        generated_text = processor.decode(out[0], skip_special_tokens=True).lower()

    # Calculate accuracy (case-insensitive, whitespace-trimmed exact match)
    if correct_answer:  # Ensure there is a ground-truth answer
        total_count += 1
        if generated_text.strip() == correct_answer.strip():
            correct_count += 1

    # Append result
    results.append((image_id, generated_text, correct_answer))

# Save results to CSV
csv_file_path = "Results/results.csv"
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
with open(csv_file_path, mode="w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["ID", "Generated Answer", "Correct Answer"])
    csv_writer.writerows(results)

# Print accuracy
if total_count > 0:
    accuracy = (correct_count / total_count) * 100
    print(f"Accuracy: {accuracy:.2f}%")
else:
    print("No ground-truth answers provided for accuracy calculation.")

print(f"Results saved to {csv_file_path}")


Processing: 100%|██████████| 132/132 [02:42<00:00,  1.32s/it]

Accuracy: 87.10%
Results saved to Results/results.csv



