In [1]:

# Install Hugging Face libraries
%pip install  --upgrade \
  "datasets==3.3.2" \
  "accelerate==1.4.0" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.45.3" \
  "trl==0.15.2" \
  "peft==0.14.0" \
  "pillow==11.1.0" \
  protobuf \
  sentencepiece
!pip install evaluate nltk rouge-score
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from huggingface_hub import login
login()

In [15]:
# pip install accelerate

from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from PIL import Image
import requests
import torch

model_id = "google/gemma-3-4b-it"

model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto"
).eval()

processor = AutoProcessor.from_pretrained(model_id)


# **Overall Impression:** The image is a close-up shot of a vibrant garden scene,
# focusing on a cluster of pink cosmos flowers and a busy bumblebee.
# It has a slightly soft, natural feel, likely captured in daylight.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# ...existing imports...
from datasets import load_dataset, DatasetDict
from PIL import Image
import os

# Few-shot examples for the assistant
few_shot_examples = [
    {
        "DocumentType": "Bank Statement",
        "Authenticity": "Fake",
        "FraudScore": 0.92,
        "Reason": "Suspicious transaction entries and mismatched dates."
    },
    {
        "DocumentType": "Salary Slip",
        "Authenticity": "Real",
        "FraudScore": 0.05,
        "Reason": "Consistent formatting and verified employer details."
    },
    {
        "DocumentType": "ID Card",
        "Authenticity": "Fake",
        "FraudScore": 0.88,
        "Reason": "Font inconsistencies and unnatural linguistic patterns."
    },
    {
        "DocumentType": "ID Card",
        "Authenticity": "Fake",
        "FraudScore": 0.95,
        "Reason": "Face Replacement and morphing"
    }
]

# System message for the assistant
system_message = """Your task is to:
    - Identify the **document type**.
    - Determine whether the document is **Real** or **Fake** based on below reasoning:
    - Suspicious or inconsistent entries.
    - Morphing and replacement indicators
    - Font inconsistencies.
    - Violations of standard banking or accounting practices.
    - Textual or numeric manipulation (e.g., formatting issues, overwritten values).
    - Metadata mismatches (e.g., conflicting dates, fake signatures/stamps).
    - Unnatural linguistic patterns or overly generic phrasing.
    - Semantic inconsistencies or hallucinated data.

Return your output in the following json format:
DocumentType: <e.g., Bank Statement, Salary Slip, ID Card>
Authenticity: <Original, Fraud, Real, Fake, Genuine>
FraudScore: <0.0 to 1.0, where 1.0 is highly likely fraud>
Reason: <Clear, concise explanation with observed issues related to authenticity>
"""

# Few-shot prompt builder
def build_fewshot_prompt():
    prompt = system_message + "\n\n"
    prompt += "Examples:\n"
    for ex in few_shot_examples:
        prompt += f"{ex}\n"
    prompt += "\nNow analyze the following document:"
    return prompt

def format_data(img):
    prompt = build_fewshot_prompt()
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": prompt}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": img,
                },
                {
                    "type": "text",
                    "text": "Identify the documentType, authenticity, fraud score, and reason.",
                },
            ],
        }
    ]


In [5]:
def execute_prompt(img,top_p,temperature):
  messages = format_data(img)
  inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt"
  ).to(model.device, dtype=torch.bfloat16)

  input_len = inputs["input_ids"].shape[-1]

  with torch.inference_mode():
      generation = model.generate(**inputs, max_new_tokens=1024, do_sample=True,
        top_p=top_p,
        temperature=temperature,
        disable_compile=True)
      generation = generation[0][input_len:]

  decoded = processor.decode(generation, skip_special_tokens=True)
  return decoded


In [6]:
from datasets import load_dataset, DatasetDict
eval_dataset = load_dataset("AliceRolan/IDCardDataset", split=["test"])

README.md:   0%|          | 0.00/638 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/434M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/144M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
eval_dataset[0][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1134x716>,
 'label': 0,
 'documentType': 'SlovakIDCard',
 'category': 'Real',
 'filename': 'Real_SlovakIDCard_001.png',
 'reason': 'Geniune. No Manipulation'}

In [8]:
import json
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Initialize the ROUGE scorer
  # 'rouge1', 'rouge2', 'rougeL' measure overlap of unigrams, bigrams,
  # and the longest common subsequence, respectively.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Initialize the NLTK smoothing function for BLEU
chencherry = SmoothingFunction()
results_list = []
# Initialize variables to store the sum of scores for averaging
total_scores = {
    'filename': None,
    'rouge1_f': 0,
    'rouge2_f': 0,
    'rougeL_f': 0,
    'bleu': 0
}
i=0
for data in eval_dataset[0]:
  # print(data)
  output = execute_prompt(data['image'],1.0,0.3)

  llm_output_json = json.loads(output.split("json")[1].replace('`','').strip())
  import json
  from rouge_score import rouge_scorer
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

  # --- 1. Define the Ground Truth and LLM Output ---

  # The ground truth string from your example
  # ground_truth_text = "Authenticity: Real, DocumentType: IndianCurrency, Reason: Geniune. No Manipulation"

  # # The LLM's output in JSON format
  # ground_truth_text = {
  #   "DocumentType": data["documentType"],
  #   "Authenticity": data["category"],
  #   "Reason": data["reason"]
  # }

  # For a fair comparison, we'll convert the LLM's JSON output into a single string.
  # We'll concatenate the relevant values.
  ground_truth_text = (
      f"DocumentType: {data['documentType']},"
      f"Authenticity: {data['category']}, "
      f"Reason: {data['reason']}"
  )
  candidate_text = (
      f"DocumentType: {llm_output_json['DocumentType']}, "
      f"Authenticity: {llm_output_json['Authenticity']}, "
      f"Reason: {llm_output_json['Reason']}"
  )

  # print("--- Texts for Comparison ---")
  # print(f"Reference (Ground Truth): {ground_truth_text}")
  # print(f"Candidate (LLM Output): {candidate_text}\n")


  # --- 2. Calculate ROUGE Scores ---


  # Calculate scores
  rouge_scores = scorer.score(ground_truth_text, candidate_text)

  # # Add F1-scores to totals
  # total_scores['filename'] += data['filename']
  # total_scores['rouge1_f'] += rouge_scores['rouge1'].fmeasure
  # total_scores['rouge2_f'] += rouge_scores['rouge2'].fmeasure
  # total_scores['rougeL_f'] += rouge_scores['rougeL'].fmeasure

  # print("Individual ROUGE Scores:")
  # print(f"  ROUGE-1 F1: {rouge_scores['rouge1'].fmeasure:.4f}")
  # print(f"  ROUGE-2 F1: {rouge_scores['rouge2'].fmeasure:.4f}")
  # print(f"  ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
 # --- BLEU Calculation ---
  reference_tokens = [ground_truth_text.lower().split()]
  candidate_tokens = candidate_text.lower().split()

  bleu_score = sentence_bleu(
      reference_tokens,
      candidate_tokens,
      weights=(0.25, 0.25, 0.25, 0.25), # Standard BLEU-4
      smoothing_function=chencherry.method1
  )

  # total_scores['bleu'] += bleu_score
  # Store all individual results in a dictionary
  individual_results = {
      'item_id': i + 1,
      'filename': data['filename'],
      'rouge1_f': rouge_scores['rouge1'].fmeasure,
      'rouge2_f': rouge_scores['rouge2'].fmeasure,
      'rougeL_f': rouge_scores['rougeL'].fmeasure,
      'bleu': bleu_score,
      # 'ground_truth': ground_truth_text, # Optional: for context
      'llm_output': llm_output_json # Optional: for context
  }
  print("Processing completed for file",data['filename'])
  i+=1
  # Add the dictionary to our list
  results_list.append(individual_results)

print("Processing complete. Storing results in DataFrame.")

  # print(f"Individual BLEU Score: {bleu_score:.4f}")


Processing completed for file Real_SlovakIDCard_001.png
Processing completed for file Real_SlovakIDCard_002.png
Processing completed for file Real_SlovakIDCard_003.png
Processing completed for file Real_SlovakIDCard_004.png
Processing completed for file Real_SlovakIDCard_005.png
Processing completed for file Real_SlovakIDCard_006.png
Processing completed for file Real_SlovakIDCard_007.png
Processing completed for file Real_SlovakIDCard_008.png
Processing completed for file Real_SlovakIDCard_009.png
Processing completed for file Real_SlovakIDCard_010.png
Processing completed for file Real_SlovakIDCard_011.png
Processing completed for file Real_SlovakIDCard_012.png
Processing completed for file Real_SlovakIDCard_013.png
Processing completed for file Real_SlovakIDCard_014.png
Processing completed for file Real_SlovakIDCard_015.png
Processing completed for file Real_SlovakIDCard_016.png
Processing completed for file Real_SlovakIDCard_017.png
Processing completed for file Real_SlovakIDCard_

In [9]:

# --- 4. Calculate and Display Average Scores ---

num_items = len(eval_dataset)
import pandas as pd

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(results_list)
df

Unnamed: 0,item_id,filename,rouge1_f,rouge2_f,rougeL_f,bleu,llm_output
0,1,Real_SlovakIDCard_001.png,0.126984,0.000000,0.126984,0.003954,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."
1,2,Real_SlovakIDCard_002.png,0.111111,0.000000,0.111111,0.003496,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."
2,3,Real_SlovakIDCard_003.png,0.088235,0.000000,0.088235,0.003616,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."
3,4,Real_SlovakIDCard_004.png,0.101695,0.000000,0.101695,0.004361,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."
4,5,Real_SlovakIDCard_005.png,0.100000,0.000000,0.100000,0.004189,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."
...,...,...,...,...,...,...,...
995,996,Fake_SlovakIDCard_496.png,0.181818,0.062500,0.181818,0.008686,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."
996,997,Fake_SlovakIDCard_497.png,0.250000,0.096774,0.218750,0.012116,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."
997,998,Fake_SlovakIDCard_498.png,0.179104,0.061538,0.179104,0.008361,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."
998,999,Fake_SlovakIDCard_499.png,0.254545,0.075472,0.218182,0.011047,"{'DocumentType': 'ID Card', 'Authenticity': 'F..."


In [10]:
df.to_csv("IDCardDataset-Gemma-Results.csv", index=False)