In [2]:
import os
import json
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForImageTextToText, AutoProcessor

In [3]:
question_types = ["SCN", "QLT", "INT", "LAN"]
# Define a custom dataset


class ImageQuestionDataset(Dataset):
    def __init__(self, images_path, json_path, processor):
        self.images_path = images_path
        with open(json_path, "r") as f:
            self.data = json.load(f)
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        frame_details = self.data[idx]
        frame_id = frame_details["frame_id"]
        conversations = frame_details["conversations"]
        sample = []

        # Process each question in the frame
        for converse in conversations:
            question = converse["question"]
            options = converse["options"]
            answer = converse["answer"]
            question_type = converse["question_type"]

            if question_type not in question_types:
                continue

            # Load all images for the frame
            image_folder = os.path.join(self.images_path, frame_id)
            images_list = []
            for image_file in os.listdir(image_folder):
                img = Image.open(os.path.join(
                    image_folder, image_file)).convert("RGB")
                images_list.append(img)

            # Create input data for the model
            prompt = f"<image> " * len(images_list) + \
                "<bos> " + f"{question} \n {options}"
            prompt += " Choose the Correct Option."

            sample.append({
                "images": images_list,
                "prompt": prompt,
                "answer": answer,
                "question_type": question_type
            })
        return sample

In [4]:
# Instantiate model and processor
# "google/paligemma-3b-mix-224" should be changed to other model id
model_id = "google/paligemma-3b-mix-224"
device = "cuda:5" if torch.cuda.is_available() else "cpu"
model = AutoModelForImageTextToText.from_pretrained(
    model_id, device_map=device).eval()
processor = AutoProcessor.from_pretrained(model_id)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Paths
train_images_path = "/raid/ai23mtech14006/Test_WC/maplm_v1_5/data/val/images/"
train_json_path = "/raid/ai23mtech14006/Test_WC/maplm_v1_5/data/val/val.json"

# Create dataset and dataloader
dataset = ImageQuestionDataset(train_images_path, train_json_path, processor)
dataloader = DataLoader(dataset, batch_size=16,
                        shuffle=False, collate_fn=lambda x: x)

In [6]:
import os
from tqdm import tqdm
from datetime import datetime
import pandas as pd


def predict_batch(batch, model, processor):
    predictions = []

    for b in batch:
        for sample in b:
            images = sample["images"]
            prompt = sample["prompt"]
            correct_answer = sample["answer"]

            # Prepare inputs
            inputs = processor(images=images, text=prompt, return_tensors="pt")
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            # Generate predictions
            output = model.generate(**inputs, max_new_tokens=20)
            decoded = processor.decode(
                output[0], skip_special_tokens=True).strip()

            # Extract the predicted option
            question = prompt.split("<bos>")[1][1:]

            if decoded.startswith(question):
                decoded = decoded[len(question):].strip()

            # Append results for saving to XLSX
            predictions.append({
                "Question": question,
                "Prediction": decoded,
                "Answer": correct_answer
            })
    return predictions


# Create an XLSX file
output_filename = f"palli_224_Output_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
output_path = os.path.join(os.getcwd(), output_filename)
all_predictions = []

count = 0
# Iterate through DataLoader with tqdm for progress monitoring
for batch in tqdm(dataloader, desc="Processing Batches"):
    batch_predictions = predict_batch(batch, model, processor)
    all_predictions.extend(batch_predictions)

# Save all predictions to an Excel file
df = pd.DataFrame(all_predictions)
df.to_excel(output_path, index=False)

print(f"Predictions saved to {output_path}")

Processing Batches: 100%|██████████| 94/94 [1:12:30<00:00, 46.28s/it]


Predictions saved to /raid/ai23mtech14006/Test_WC/palli_224_Output_20241214_130311.xlsx


Model Specifications

Model 1 : google/paligemma-3b-mix-224 \
Parameters : 3 Billion \
Time Taken to Infer : 72 Minutes \
GPU : Tesla V100 32GB \
Weights : Pre-trained (Zero-Shot)

Model 2 : google/paligemma-3b-mix-448 \
Parameters : 3 Billion \
Time Taken to Infer : 240 Minutes \
GPU : Tesla V100 32GB \
Weights : Pre-trained (Zero-Shot)


---


# Parser Code


## Model 1 : paligemma-3b-mix-224


In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

file_path = '/raid/ai23mtech14006/Test_WC/palli_224_Output_20241214_130311.xlsx'
df = pd.read_excel(file_path)

# Check if necessary columns exist
required_columns = ['Question', 'Prediction', 'Answer']
if not all(column in df.columns for column in required_columns):
    raise ValueError(
        f"DataFrame must contain the following columns: {required_columns}")

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize lists for true and predicted labels
y_true = []
y_pred = []

# Process each row in the DataFrame
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    question = row['Question']
    prediction = row['Prediction'].strip().lower()

    try:
        correct_option_index = int(row['Answer'])
    except ValueError:
        print(f"Invalid answer index at row {index}")
        continue

    # Extract the options from the question
    options_start = question.find('[')
    options_end = question.find(']')

    if options_start != -1 and options_end != -1:
        options = question[options_start + 1:options_end]
        options_list = [opt.strip().strip("'").lower()
                        for opt in options.split(',')]

        # Ensure the correct_option_index is within the range of options_list
        if 0 <= correct_option_index < len(options_list):
            # Get the correct answer text using the index
            correct_answer = options_list[correct_option_index]

            # Compute semantic similarity
            embeddings = model.encode(
                [prediction, correct_answer], convert_to_tensor=True)
            similarity_score = util.cos_sim(
                embeddings[0], embeddings[1]).item()

            # Add to lists for metrics calculation
            y_true.append(correct_answer)
            y_pred.append(prediction if similarity_score >
                          0.98 else 'incorrect')
        else:
            print(f"Invalid answer index at row {index}")
    else:
        print(f"Options not found in question at row {index}")

# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(
    y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

# Display metrics
print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1:.2%}")

Processing rows: 100%|██████████| 6000/6000 [00:53<00:00, 112.01it/s]


Accuracy: 12.40%
Precision: 28.27%
Recall: 12.40%
F1 Score: 16.15%
