In [1]:
# Install Pytorch & other libraries
%pip install "torch>=2.4.0" tensorboard torchvision

# Install Gemma release branch from Hugging Face
%pip install "transformers>=4.51.3"

# Install Hugging Face libraries
%pip install  --upgrade \
  "datasets==3.3.2" \
  "accelerate==1.4.0" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.45.3" \
  "trl==0.15.2" \
  "peft==0.14.0" \
  "pillow==11.1.0" \
  protobuf \
  sentencepiece


[0mNote: you may need to restart the kernel to use updated packages.
Collecting transformers>=4.51.3
  Using cached transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Using cached transformers-4.55.0-py3-none-any.whl (11.3 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.2
    Uninstalling transformers-4.48.2:
      Successfully uninstalled transformers-4.48.2
Successfully installed transformers-4.55.0
[0mNote: you may need to restart the kernel to use updated packages.
Collecting accelerate==1.4.0
  Using cached accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.4.0-py3-none-any.whl (342 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.3.0
    Uninstalling accelerate-1.3.0:
      Successfully uninstalled accelerate-1.3.0
Successfully installed accelerate-1.4.0
[0mNote: you may need to restar

In [2]:
# !pip install flash_attn

In [None]:
# from google.colab import userdata
from huggingface_hub import login
login()


In [4]:
from datasets import load_dataset, DatasetDict
from PIL import Image
import os

# System message for the assistant
system_message = """Your task is to:
    - Identify the **document type**.
    - Determine whether the document is **Real** or **Fake** based on below reasoning:
    - Suspicious or inconsistent entries.
    - Font inconsistencies.
    - Violations of standard banking or accounting practices.
    - Textual or numeric manipulation (e.g., formatting issues, overwritten values).
    - Metadata mismatches (e.g., conflicting dates, fake signatures/stamps).
    - Unnatural linguistic patterns or overly generic phrasing.
    - Semantic inconsistencies or hallucinated data.

Return your output in the following json format:
DocumentType: <e.g., Bank Statement, Salary Slip, ID Card>
Authenticity: <Original, Fraud, Real, Fake, Genuine>
Reason: <Clear, concise explanation with observed issues related to authenticity>
"""

# User prompt template
user_prompt = """Authenticity: {category}, DocumentType: {doctype}, Reason: {reason}"""

from datasets import load_dataset
from PIL import Image

def resize_half_min256(img: Image.Image) -> Image.Image:
    w, h = img.size
    new_w = max(w // 2, 256)
    new_h = max(h // 2, 256)
    if w // 2 < 256 or h // 2 < 256:
        aspect = w / h
        if w < h:
            new_w = 256
            new_h = int(256 / aspect)
        else:
            new_h = 256
            new_w = int(256 * aspect)
        new_w = min(new_w, w)
        new_h = min(new_h, h)
    return img.resize((new_w, new_h), Image.LANCZOS)

def resize_data(sample):
    img = sample["image"]
    img = resize_half_min256(img)
    sample["image"] = img
    return sample


def format_data(sample):
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": resize_data(sample)["image"],
                },
                {
                    "type": "text",
                    "text": "Idenitify the documentType and authenticity with reason",
                },
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text",
                         "text": user_prompt.format(
                            category=sample["category"],
                            doctype=sample["documentType"],
                            reason=sample["reason"]),
                        },
                        ],
        }
    ]

def process_vision_info(messages: list[dict]) -> list[Image.Image]:
    image_inputs = []
    # Iterate through each conversation
    for msg in messages:
        # Get content (ensure it's a list)
        content = msg.get("content", [])
        if not isinstance(content, list):
            content = [content]

        # Check each content element for images
        for element in content:
            if isinstance(element, dict) and (
                "image" in element or element.get("type") == "image"
            ):
                # Get the image and convert to RGB
                if "image" in element:
                    image = element["image"]
                else:
                    image = element
                image_inputs.append(image.convert("RGB"))
    return image_inputs


In [5]:



# Load dataset from the hub
# dataset = load_dataset("AliceRolan/realfakedataset", split="train")
train_dataset, eval_dataset = load_dataset("AliceRolan/CurrencyDataset", split=["train[:100%]",  "test[:100%]"])

# Convert dataset to OAI messages
# need to use list comprehension to keep Pil.Image type, .mape convert image to bytes
train_dataset = [format_data(sample) for sample in train_dataset]
eval_dataset = [format_data(sample) for sample in eval_dataset]
# test_dataset = [format_data(sample) for sample in test_dataset]
# print(dataset[0]["messages"])

In [6]:
print(train_dataset[0])

[{'role': 'system', 'content': [{'type': 'text', 'text': 'Your task is to:\n    - Identify the **document type**.\n    - Determine whether the document is **Real** or **Fake** based on below reasoning:\n    - Suspicious or inconsistent entries.\n    - Font inconsistencies.\n    - Violations of standard banking or accounting practices.\n    - Textual or numeric manipulation (e.g., formatting issues, overwritten values).\n    - Metadata mismatches (e.g., conflicting dates, fake signatures/stamps).\n    - Unnatural linguistic patterns or overly generic phrasing.\n    - Semantic inconsistencies or hallucinated data.\n\nReturn your output in the following json format:\nDocumentType: <e.g., Bank Statement, Salary Slip, ID Card>\nAuthenticity: <Original, Fraud, Real, Fake, Genuine>\nReason: <Clear, concise explanation with observed issues related to authenticity>\n'}]}, {'role': 'user', 'content': [{'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=1116x507 at 0x78FEBF3ED610>}, {

In [7]:
# print(dataset[1900]["messages"])

In [8]:
import torch
# import flash_attn_2_cuda
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig

# Hugging Face model id
model_id = "google/gemma-3-4b-pt" # or `google/gemma-3-12b-pt`, `google/gemma-3-27-pt`

# Check if GPU benefits from bfloat16
if torch.cuda.get_device_capability()[0] < 8:
    raise ValueError("GPU does not support bfloat16, please use a GPU that supports bfloat16.")

# Define model init arguments
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch.bfloat16, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model
)

# BitsAndBytesConfig int-4 config
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
    bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
)


In [9]:
# Load model and tokenizer
model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

# import torch
# from google.colab import drive
# # import shutil

# # Mount Google Drive
# drive.mount('/content/drive')

# # Load Model with PEFT adapter
# model = AutoModelForImageTextToText.from_pretrained(
#   '/content/gemma-currency-FT',
#   device_map="auto",
#   torch_dtype=torch.bfloat16,
#   attn_implementation="eager"
# )
# processor = AutoProcessor.from_pretrained('/content/gemma-currency-FT')



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [10]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=4,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    modules_to_save=[
        "lm_head",
        "embed_tokens",
    ],
)


In [11]:
from trl import SFTConfig

args = SFTConfig(
    output_dir="gemma-currency-FT",     # directory to save and repository id
    num_train_epochs=1,                         # number of training epochs
    per_device_train_batch_size=1,              # batch size per device during training
    gradient_accumulation_steps=4,              # number of steps before performing a backward/update pass
    gradient_checkpointing=True,                # use gradient checkpointing to save memory
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    logging_steps=25,                            # log every 5 steps
    save_strategy="epoch",                      # save checkpoint every epoch
    learning_rate=2e-4,                         # learning rate, based on QLoRA paper
    bf16=True,                                  # use bfloat16 precision
    max_grad_norm=0.3,                          # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                          # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",               # use constant learning rate scheduler
    # push_to_hub=True,                           # push model to hub
    report_to="tensorboard",                    # report metrics to tensorboard
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # use reentrant checkpointing
    dataset_text_field="",                      # need a dummy field for collator
    dataset_kwargs={"skip_prepare_dataset": True},  # important for collator
)
args.remove_unused_columns = False # important for collator

# Create a data collator to encode text and image pairs
def collate_fn(examples):
    texts = []
    images = []
    for example in examples:
        image_inputs = process_vision_info(example)
        text = processor.apply_chat_template(
            example, add_generation_prompt=False, tokenize=False
        )
        texts.append(text.strip())
        images.append(image_inputs)

    # Tokenize the texts and process the images
    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    # The labels are the input_ids, and we mask the padding tokens and image tokens in the loss computation
    labels = batch["input_ids"].clone()

    # Mask image tokens
    image_token_id = [
        processor.tokenizer.convert_tokens_to_ids(
            processor.tokenizer.special_tokens_map["boi_token"]
        )
    ]
    # Mask tokens for not being used in the loss computation
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == image_token_id] = -100
    labels[labels == 262144] = -100

    batch["labels"] = labels
    return batch


In [12]:
!pip install evaluate nltk rouge-score
import nltk
nltk.download('punkt')

[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# import numpy as np
import traceback
import json
import evaluate


# Make sure you have these loaded
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
rouge_results = None
def compute_metrics(eval_pred):
    try:
        raw_logits, raw_labels = eval_pred
        predicted_ids = np.argmax(raw_logits[0], axis=-1)

        decoded_preds = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        labels = np.where(raw_labels != -100, raw_labels, processor.tokenizer.pad_token_id)
        decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)

        def extract_assistant_response(text,tx):
            """
            Extracts content after the last 'Assistant:' marker.
            If not found, falls back to trying the 'Answer:' marker.
            """
            # print("org:",text)
            # --- FIX IS HERE: Added fallback logic ---
            # 1. Try to split by "Assistant:" first
            parts = text.rsplit("Assistant:", 1)
            if len(parts) > 1:
                # print(f"{tx}_Assistant:",parts[1].strip() if len(parts) > 1 else "")
                return parts[1].strip()
            parts = text.rsplit("assistant:", 1)
            if len(parts) > 1:
                # print(f"{tx}_assistant:",parts[1].strip() if len(parts) > 1 else "")
                return parts[1].strip()
                assistant

            # 2. If that fails, try to split by "Answer:"
            parts = text.rsplit("Answer:", 1)
            if len(parts) > 1:
                # print(f"{tx}_Answer:",parts[1].strip() if len(parts) > 1 else "")
                return parts[1].strip()

            # 3. If both fail, return an empty string
            # print(f"{tx}_orig:{text}")
            return ""
            # ----------------------------------------
        # print("decoded_preds:",decoded_preds)
        # print("decoded_labels:",decoded_labels)
        pred_responses = [extract_assistant_response(p,"pred_responses") for p in decoded_preds]
        label_responses = [extract_assistant_response(l,"label_responses") for l in decoded_labels]

        # Calculate ROUGE (it's more robust to empty strings)
        rouge_results = rouge_metric.compute(predictions=pred_responses, references=label_responses)
        # print("rouge_results:",rouge_results)
        # Tokenize for BLEU score
        pred_tokenized = [pred.split() for pred in pred_responses]
        label_tokenized = [[label.split()] for label in label_responses]

        # --- FIX IS HERE ---
        bleu_results = {"bleu": 0.0} # Default score
        # Check if there are any non-empty prediction strings to score
        if any(pred_tokenized):
            bleu_results = bleu_metric.compute(predictions=pred_tokenized, references=label_tokenized)
        # -------------------

        # -------------------
        # print("bleu_results:",bleu_results)
        all_metrics = {
            "rouge1": rouge_results["rouge1"],
            "rouge2": rouge_results["rouge2"],
            "rougeL": rouge_results["rougeL"],
            "rougeLsum": rouge_results["rougeLsum"],
            "bleu": bleu_results["bleu"],
        }
        return all_metrics

    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        traceback.print_exc()
        return {"rouge1": rouge_results["rouge1"],
            "rouge2": rouge_results["rouge2"],
            "rougeL": rouge_results["rougeL"],
            "rougeLsum": rouge_results["rougeLsum"],
                "bleu": 0}

In [14]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    # tokenizer=processor
    peft_config=peft_config,
    processing_class=processor,
    data_collator=collate_fn,
    # compute_metrics=compute_metrics,
)


In [15]:
# Start training, the model will be automatically saved to the Hub and the output directory
trainer.train()

# Save the final model again to the Hugging Face Hub
# trainer.save_model()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
25,2.7952
50,0.1102
75,0.0382
100,0.0346
125,0.0333


TrainOutput(global_step=125, training_loss=0.6023188395500183, metrics={'train_runtime': 867.1649, 'train_samples_per_second': 0.577, 'train_steps_per_second': 0.144, 'total_flos': 6297172968072000.0, 'train_loss': 0.6023188395500183})

In [16]:
import pandas as pd
metrics = trainer.state.log_history
pd.DataFrame(metrics)

Unnamed: 0,loss,grad_norm,learning_rate,mean_token_accuracy,epoch,step,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,2.7952,1.596012,0.0002,0.896163,0.2,25,,,,,
1,0.1102,0.74616,0.0002,0.993,0.4,50,,,,,
2,0.0382,0.336071,0.0002,0.994131,0.6,75,,,,,
3,0.0346,0.98991,0.0002,0.994864,0.8,100,,,,,
4,0.0333,0.856578,0.0002,0.995034,1.0,125,,,,,
5,,,,,1.0,125,867.1649,0.577,0.144,6297173000000000.0,0.602319


In [18]:
trainer.save_model(args.output_dir)
processor.save_pretrained(args.output_dir)

['gemma-currency-FT/processor_config.json']

In [19]:
trainer.push_to_hub()

adapter_model.safetensors:   0%|          | 0.00/2.76G [00:00<?, ?B/s]

events.out.tfevents.1754771048.e90b6039d39d.2368.0:   0%|          | 0.00/9.33k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/6.03k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AliceRolan/gemma-currency-FT/commit/814b00dfdd5adc463b3424b1e208eafee79d60a6', commit_message='End of training', commit_description='', oid='814b00dfdd5adc463b3424b1e208eafee79d60a6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AliceRolan/gemma-currency-FT', endpoint='https://huggingface.co', repo_type='model', repo_id='AliceRolan/gemma-currency-FT'), pr_revision=None, pr_num=None)

In [20]:
# eval_results = trainer.evaluate()
# print(eval_results)

# free the memory again
# del model
# del trainer
torch.cuda.empty_cache()


In [21]:
import gc
import time

def clear_memory():
    # Delete variables if they exist in the current global scope
    if 'inputs' in globals(): del globals()['inputs']
    if 'model' in globals(): del globals()['model']
    if 'processor' in globals(): del globals()['processor']
    if 'trainer' in globals(): del globals()['trainer']
    if 'peft_model' in globals(): del globals()['peft_model']
    if 'bnb_config' in globals(): del globals()['bnb_config']
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

clear_memory()

GPU allocated memory: 0.02 GB
GPU reserved memory: 4.01 GB


In [23]:
import torch, gc

gc.collect()

# Force empty cache
torch.cuda.empty_cache()

# (Optional) Reset CUDA context for the process (drastic)
torch.cuda.ipc_collect()

print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


Allocated: 0.02 GB
Reserved:  4.01 GB


In [24]:
from peft import PeftModel

# Load Model base model
model = AutoModelForImageTextToText.from_pretrained(model_id, low_cpu_mem_usage=True)

# Merge LoRA and base model and save
peft_model = PeftModel.from_pretrained(model, args.output_dir)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("merged_model", safe_serialization=True, max_shard_size="2GB")

processor = AutoProcessor.from_pretrained(args.output_dir)
processor.save_pretrained("merged_model")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

['merged_model/processor_config.json']

In [28]:
import os
print(os.getcwd())

/workspace


In [30]:
import torch
from peft import PeftModel
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig


args = SFTConfig(
    output_dir="gemma-currency-FT",     # directory to save and repository id
    num_train_epochs=1,                         # number of training epochs
    per_device_train_batch_size=1,              # batch size per device during training
    gradient_accumulation_steps=4,              # number of steps before performing a backward/update pass
    gradient_checkpointing=True,                # use gradient checkpointing to save memory
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    logging_steps=5,                            # log every 5 steps
    save_strategy="epoch",                      # save checkpoint every epoch
    learning_rate=2e-4,                         # learning rate, based on QLoRA paper
    bf16=True,                                  # use bfloat16 precision
    max_grad_norm=0.3,                          # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                         # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",               # use constant learning rate scheduler
    # push_to_hub=True,                           # push model to hub
    report_to="tensorboard",                    # report metrics to tensorboard
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # use reentrant checkpointing
    dataset_text_field="",                      # need a dummy field for collator
    dataset_kwargs={"skip_prepare_dataset": True},  # important for collator
    per_device_eval_batch_size=1, # Reduce the evaluation batch size
)
args.remove_unused_columns = False # important for collator

# Load Model with PEFT adapter and 4-bit quantization for evaluation
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)

model = AutoModelForImageTextToText.from_pretrained(
  "/workspace/gemma-currency-FT",
  device_map="auto",
  torch_dtype=torch.bfloat16,
  attn_implementation="eager",
  quantization_config=bnb_config, # Add quantization config
)

processor = AutoProcessor.from_pretrained("/workspace/gemma-currency-FT")
# eval_dataset = load_dataset("AliceRolan/CurrencyDataset", split="test")
# eval_dataset = [format_data(sample) for sample in eval_dataset] # Apply format_data to eval_dataset


eval_trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=None,  # Optional or use validation dataset
    eval_dataset=eval_dataset,  # Use your validation dataset here
    tokenizer=processor,
    # peft_config=peft_config
    data_collator=collate_fn,
    # compute_metrics=compute_metrics,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  eval_trainer = SFTTrainer(


In [31]:
results1 = eval_trainer.evaluate()
print(results1)

{'eval_loss': 0.009441981092095375, 'eval_model_preparation_time': 0.0213, 'eval_runtime': 90.6534, 'eval_samples_per_second': 2.162, 'eval_steps_per_second': 2.162}


In [32]:
import math
print("Perplexity:", math.exp(results1["eval_loss"]))

Perplexity: 1.009486697221043


In [33]:
eval_trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/6.03k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/2.72G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1754774251.e90b6039d39d.2368.1:   0%|          | 0.00/420 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AliceRolan/gemma-currency-FT/commit/249b14134c3304250a40a63c83057c7d534349ff', commit_message='End of training', commit_description='', oid='249b14134c3304250a40a63c83057c7d534349ff', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AliceRolan/gemma-currency-FT', endpoint='https://huggingface.co', repo_type='model', repo_id='AliceRolan/gemma-currency-FT'), pr_revision=None, pr_num=None)

In [None]:
system_message

In [36]:
def execute_prompt(img):
  messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": f"You are a helpful banking assistant and are a forensic financial analyst specializing in detecting fraud, forgery, and AI-generated content in banking documents.{system_message}"}]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": "Idenitify the documentType and authenticity as Real or fake and provide reason for authenticity if found as fake and provide fraud score as fraudScore. Return your output in json format"}
        ]
    }
]

  inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt"
  ).to(model.device, dtype=torch.bfloat16)

  input_len = inputs["input_ids"].shape[-1]

  with torch.inference_mode():
      generation = model.generate(**inputs, max_new_tokens=1024, do_sample=True,top_p=1.0, temperature=0.5)
      generation = generation[0][input_len:]

  decoded = processor.decode(generation, skip_special_tokens=True)
  return decoded


In [37]:
decode = execute_prompt(Image.open("/workspace/IndianCurrency-Fake-2.jpg"))
print(decode)

Authenticity: Fake, DocumentType: IndianCurrency, Reason: Manipulated or edited. Missing 500 annotation or missing emblem or AI generated.


In [44]:
eval_dataset = load_dataset("AliceRolan/CurrencyDataset", split=["test"])

In [45]:
eval_dataset[0][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=852x369>,
 'label': 0,
 'documentType': 'IndianCurrency',
 'category': 'Real',
 'filename': 'Real_IndianCurrency_001.jpg',
 'reason': 'Geniune. No Manipulation'}

In [50]:
import json
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Initialize the ROUGE scorer
  # 'rouge1', 'rouge2', 'rougeL' measure overlap of unigrams, bigrams, 
  # and the longest common subsequence, respectively.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Initialize the NLTK smoothing function for BLEU
chencherry = SmoothingFunction()
results_list = []
# Initialize variables to store the sum of scores for averaging
total_scores = {
    'filename': None,
    'rouge1_f': 0,
    'rouge2_f': 0,
    'rougeL_f': 0,
    'bleu': 0
}
i=0
for data in eval_dataset[0]:
  # print(data)
  output = execute_prompt(data['image'])

  # llm_output_json = output.replace('`','').strip()
  import json
  from rouge_score import rouge_scorer
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

  # --- 1. Define the Ground Truth and LLM Output ---

  # The ground truth string from your example
  # ground_truth_text = "Authenticity: Real, DocumentType: IndianCurrency, Reason: Geniune. No Manipulation"

  # # The LLM's output in JSON format
  # ground_truth_text = {
  #   "DocumentType": data["documentType"],
  #   "Authenticity": data["category"],
  #   "Reason": data["reason"]
  # }

  # For a fair comparison, we'll convert the LLM's JSON output into a single string.
  # We'll concatenate the relevant values.
  ground_truth_text = (
      f"DocumentType: {data['documentType']},"
      f"Authenticity: {data['category']}, "
      f"Reason: {data['reason']}"
  )
  candidate_text = output.replace('`','').strip()

  # print("--- Texts for Comparison ---")
  # print(f"Reference (Ground Truth): {ground_truth_text}")
  # print(f"Candidate (LLM Output): {candidate_text}\n")


  # --- 2. Calculate ROUGE Scores ---

  
  # Calculate scores
  rouge_scores = scorer.score(ground_truth_text, candidate_text)

  # # Add F1-scores to totals
  # total_scores['filename'] += data['filename']
  # total_scores['rouge1_f'] += rouge_scores['rouge1'].fmeasure
  # total_scores['rouge2_f'] += rouge_scores['rouge2'].fmeasure
  # total_scores['rougeL_f'] += rouge_scores['rougeL'].fmeasure
  
  # print("Individual ROUGE Scores:")
  # print(f"  ROUGE-1 F1: {rouge_scores['rouge1'].fmeasure:.4f}")
  # print(f"  ROUGE-2 F1: {rouge_scores['rouge2'].fmeasure:.4f}")
  # print(f"  ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
 # --- BLEU Calculation ---
  reference_tokens = [ground_truth_text.lower().split()]
  candidate_tokens = candidate_text.lower().split()
  
  bleu_score = sentence_bleu(
      reference_tokens,
      candidate_tokens,
      weights=(0.25, 0.25, 0.25, 0.25), # Standard BLEU-4
      smoothing_function=chencherry.method1
  )
  
  # total_scores['bleu'] += bleu_score
  # Store all individual results in a dictionary
  individual_results = {
      'item_id': i + 1,
      'filename': data['filename'],
      'rouge1_f': rouge_scores['rouge1'].fmeasure,
      'rouge2_f': rouge_scores['rouge2'].fmeasure,
      'rougeL_f': rouge_scores['rougeL'].fmeasure,
      'bleu': bleu_score,
      # 'ground_truth': ground_truth_text, # Optional: for context
      'llm_output': candidate_text # Optional: for context
  }
  print("Processing completed for file",data['filename'])
  i+=1
  # Add the dictionary to our list
  results_list.append(individual_results)
    
print("Processing complete. Storing results in DataFrame.")
  
  # print(f"Individual BLEU Score: {bleu_score:.4f}")


Processing completed for file Real_IndianCurrency_001.jpg
Processing completed for file Real_IndianCurrency_002.jpg
Processing completed for file Real_IndianCurrency_003.jpg
Processing completed for file Real_IndianCurrency_004.jpg
Processing completed for file Real_IndianCurrency_005.jpg
Processing completed for file Real_IndianCurrency_006.jpg
Processing completed for file Real_IndianCurrency_007.jpg
Processing completed for file Real_IndianCurrency_008.jpg
Processing completed for file Real_IndianCurrency_009.jpg
Processing completed for file Real_IndianCurrency_010.jpg
Processing completed for file Real_IndianCurrency_011.jpg
Processing completed for file Real_IndianCurrency_012.jpg
Processing completed for file Real_IndianCurrency_013.jpg
Processing completed for file Real_IndianCurrency_014.jpg
Processing completed for file Real_IndianCurrency_015.jpg
Processing completed for file Real_IndianCurrency_016.jpg
Processing completed for file Real_IndianCurrency_017.jpg
Processing com

In [51]:

# --- 4. Calculate and Display Average Scores ---

num_items = len(eval_dataset)
import pandas as pd

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(results_list)
df

Unnamed: 0,item_id,filename,rouge1_f,rouge2_f,rougeL_f,bleu,llm_output
0,1,Real_IndianCurrency_001.jpg,0.4,0.086957,0.320000,0.013679,"Authenticity: Fake, DocumentType: IndianCurren..."
1,2,Real_IndianCurrency_002.jpg,1.0,0.714286,0.750000,0.382603,"Authenticity: Real, DocumentType: IndianCurren..."
2,3,Real_IndianCurrency_003.jpg,1.0,0.714286,0.750000,0.382603,"Authenticity: Real, DocumentType: IndianCurren..."
3,4,Real_IndianCurrency_004.jpg,1.0,0.714286,0.750000,0.382603,"Authenticity: Real, DocumentType: IndianCurren..."
4,5,Real_IndianCurrency_005.jpg,0.4,0.086957,0.320000,0.013679,"Authenticity: Fake, DocumentType: IndianCurren..."
...,...,...,...,...,...,...,...
191,192,Fake_IndianCurrency_096.jpg,1.0,0.875000,0.882353,0.767307,"Authenticity: Fake, DocumentType: IndianCurren..."
192,193,Fake_IndianCurrency_097.jpg,1.0,0.875000,0.882353,0.767307,"Authenticity: Fake, DocumentType: IndianCurren..."
193,194,Fake_IndianCurrency_098.jpg,1.0,0.875000,0.882353,0.767307,"Authenticity: Fake, DocumentType: IndianCurren..."
194,195,Fake_IndianCurrency_099.jpg,1.0,0.875000,0.882353,0.767307,"Authenticity: Fake, DocumentType: IndianCurren..."


In [52]:
df.to_csv("CurrencyDataset-GemmaFT-Results.csv", index=False)