In [None]:
# Install PyTorch and companion libraries into the kernel's environment
%pip install "torch>=2.4.0" tensorboard torchvision

# Install Hugging Face transformers (>=4.51.3) from the package index
# NOTE(review): presumably the minimum release with the Gemma 3 classes used below — confirm
%pip install "transformers>=4.51.3"

# Install the Hugging Face ecosystem libraries at pinned versions
# (protobuf and sentencepiece are left unpinned)
%pip install  --upgrade \
  "datasets==3.3.2" \
  "accelerate==1.4.0" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.45.3" \
  "trl==0.15.2" \
  "peft==0.14.0" \
  "pillow==11.1.0" \
  protobuf \
  sentencepiece

In [None]:
# Mount Google Drive into the Colab runtime
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set the working directory to the dl_project_fall_2025 folder on Drive so the
# relative paths and `src.` imports used in later cells resolve
import os
os.chdir("/content/drive/MyDrive/DL_Project_2025/dl_project_fall_2025")

# Autoreload doesn't work in Google Colab, so use importlib.reload to re-import edited modules
from importlib import reload

In [None]:
# Import the GitHub and Hugging Face tokens from Colab secrets
from google.colab import userdata
from huggingface_hub import login

# Retrieve the tokens from Colab secrets
GH_TOKEN = userdata.get('git_token') # The GitHub PAT must be stored under the secret name 'git_token'
hf_token = userdata.get('hugging_face')  # Hugging Face token, read from the secret named 'hugging_face'
login(hf_token)

# Configure Git to use the PAT directly in the remote URL for the 'origin'
# NOTE(review): this embeds the token in the remote URL; presumably it is then
# stored in the repository's git config on Drive — confirm that is acceptable
!git remote set-url origin https://{GH_TOKEN}@github.com/7yashwanth7/dl_project_fall_2025.git
# The commit identity below is hardcoded; change the email and user name to your own
!git config --global user.email "7yashwanth7@gmail.com" # Modify to your username and pwd
!git config --global user.name "7yashwanth7"

In [None]:
from src.llmft.data_preprocessing import preprocess_utils

# Load the project defaults and the CUI mapping through the project's
# preprocess_utils helpers; both paths are relative to the working directory.
defaults = preprocess_utils.read_yaml('src/llmft/config/defaults.yaml')
cui_mapping_json = preprocess_utils.read_json('mapping_files/cui_mapping.json')
# The keys of cui_mapping are later added to the tokenizer as tokens.
# NOTE(review): its exact structure is decided by get_cui_mapping — confirm there.
cui_mapping = preprocess_utils.get_cui_mapping(cui_mapping_json)


In [None]:
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset

# Identifiers for the base weights (PT), the processor (IT) and the trained adapter
base_model_id = "google/gemma-3-4b-pt"
processor_id  = "google/gemma-3-4b-it"          # <-- IMPORTANT: same as training
adapter_dir   = "gemma-3-cui-finetuned-sample1"

# 1) Load processor (IT) and tokenizer
processor = AutoProcessor.from_pretrained(processor_id)
tokenizer = processor.tokenizer

# Left padding: decode_generations strips the prompt by slicing off the first
# input_ids.shape[1] positions, so every prompt must end right where generation starts.
tokenizer.padding_side = "left"
# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2) Re-add CUI tokens exactly like during training
#    (plain codes like "C0041618"; the alternative form mentioned in the original
#    note was lost — TODO confirm against the training notebook)
cui_tokens = list(cui_mapping.keys())
num_added = tokenizer.add_tokens(cui_tokens)
print("Added", num_added, "CUI tokens")

# `tokenizer` is the object read from processor.tokenizer above; this
# reassignment just makes the link to the extended tokenizer explicit.
processor.tokenizer = tokenizer

# 3) Load base model (PT) with 4-bit quant
#    NF4 quantization with double quantization; bfloat16 for compute and storage
model_kwargs = dict(
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16,
    ),
)

model = AutoModelForImageTextToText.from_pretrained(
    base_model_id,
    **model_kwargs,
)

# 4) Resize embeddings to match tokenizer (base vocab + CUI tokens)
#    Must happen before the adapter is attached in step 5.
model.resize_token_embeddings(len(tokenizer))

# 5) Attach LoRA adapter you trained
model = PeftModel.from_pretrained(model, adapter_dir)
# Switch to inference mode (as the last expression, this also displays the model)
model.eval()

In [None]:
from datasets import load_dataset

# Datasets to score. Each spec names a Hub dataset, the split to load and an
# optional "max_samples" cap (the full test split has 9927 samples per the
# original author's note).
dataset_specs = [
    {"name": "eltorio/ROCOv2-radiology", "split": "test", "max_samples": 100},
]

# Maps dataset name -> loaded (and possibly truncated) dataset
datasets = {}
for spec in dataset_specs:
    ds = load_dataset(spec["name"], split=spec["split"])
    sample_cap = spec.get("max_samples")
    if sample_cap:
        # Clamp the cap to the split size so a cap larger than the split
        # does not request out-of-range indices from select().
        ds = ds.select(range(min(sample_cap, len(ds))))
    ds = ds.with_format("python")  # keep PIL images
    datasets[spec["name"]] = ds
    print(f"{spec['name']}[{spec['split']}] -> {len(ds)} samples")


## Batched scoring and saving to Google Drive

These cells batch inference across multiple datasets and save JSONL outputs to Google Drive (or locally if Drive is not available).


In [None]:
import json
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence

import torch
from tqdm.auto import tqdm

# Prompts sent with every image. The wording is reproduced verbatim;
# NOTE(review): presumably it must match the prompts used at training time — confirm before editing.
system_message = (
    "You are a digital radiologist who can understand the medical scan of images "
    "code the concepts and provide captions"
)
user_prompt = (
    "Create a description based on the provided image and return the description "
    "of the image with details of the scan as captions, the concepts and their "
    "descriptions, only the concepts that are extracted"
)

def build_messages(sample, system_message, user_prompt):
    """Build the two-turn chat (system + user) for one sample.

    Args:
        sample: The dataset row being scored. It is accepted for interface
            symmetry but not read here; the image itself is supplied to the
            processor separately and only an image placeholder is emitted.
        system_message: Text of the system turn.
        user_prompt: Text of the user turn, placed before the image placeholder.

    Returns:
        A list with the system turn followed by the user turn, each a dict
        with "role" and "content" keys.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": user_prompt}, {"type": "image"}],
    }
    return [system_turn, user_turn]


def prepare_batch(samples, processor, system_message, user_prompt):
    """Turn a list of samples into padded processor inputs.

    Args:
        samples: Sequence of dataset rows; each must provide an "image" entry.
        processor: Object exposing ``apply_chat_template`` and callable with
            ``text``/``images`` to produce model inputs.
        system_message: System turn text shared by every sample.
        user_prompt: User turn text shared by every sample.

    Returns:
        Whatever ``processor(...)`` returns for the batch (requested as
        PyTorch tensors with padding enabled).
    """
    conversations = [
        build_messages(item, system_message, user_prompt) for item in samples
    ]

    # Render each conversation to prompt text, ending with the generation prompt
    rendered = []
    for conversation in conversations:
        rendered.append(
            processor.apply_chat_template(
                conversation, add_generation_prompt=True, tokenize=False
            )
        )

    # One single-image list per sample, aligned with the rendered prompts
    image_groups = [[item["image"]] for item in samples]

    # Sanity check: exactly one image group per prompt
    assert len(rendered) == len(image_groups), (len(rendered), len(image_groups))

    return processor(
        text=rendered,
        images=image_groups,
        return_tensors="pt",
        padding=True,
    )

def decode_generations(
    generated_ids: torch.Tensor,
    prompt_input_ids: torch.Tensor,
    tokenizer):
    """Decode only the newly generated tokens of each row.

    The first ``prompt_input_ids.shape[1]`` positions of every row in
    ``generated_ids`` are dropped before decoding, so the returned strings
    exclude the prompt.

    Args:
        generated_ids: 2-D tensor of token ids (prompt followed by generation).
        prompt_input_ids: 2-D tensor of the prompt token ids fed to the model.
        tokenizer: Object exposing ``batch_decode``.

    Returns:
        The result of ``tokenizer.batch_decode`` on the trimmed ids, with
        special tokens skipped.
    """
    continuation = generated_ids[:, prompt_input_ids.shape[1]:]
    return tokenizer.batch_decode(continuation, skip_special_tokens=True)

def score_dataset(
    dataset: Any,
    model,
    processor,
    system_message: str,
    user_prompt: str,
    batch_size: int = 4,
    max_samples: Optional[int] = None,
    gen_kwargs: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Run batched generation over a dataset and collect one record per sample.

    Args:
        dataset: Indexable collection supporting ``len()``; rows must support
            ``.get`` and provide an "image" entry for the processor.
        model: Model exposing ``eval()``, ``device`` and ``generate``.
        processor: Processor passed to ``prepare_batch``; its ``tokenizer``
            is used for decoding.
        system_message: System turn text.
        user_prompt: User turn text.
        batch_size: Number of samples per ``generate`` call.
        max_samples: Optional cap on how many leading samples are scored.
        gen_kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        A list of dicts with keys "id", "caption", "cui" (copied from the
        sample's "image_id", "caption" and "cui") and "generation" (the
        stripped decoded text).
    """
    generation_options = gen_kwargs if gen_kwargs else {}
    n_items = len(dataset) if max_samples is None else min(len(dataset), max_samples)

    records: List[Dict[str, Any]] = []
    model.eval()

    batch_starts = range(0, n_items, batch_size)
    for lo in tqdm(batch_starts, desc="Scoring", leave=False):
        hi = min(lo + batch_size, n_items)
        batch = [dataset[idx] for idx in range(lo, hi)]

        encoded = prepare_batch(batch, processor, system_message, user_prompt)
        # Move every input tensor onto the model's device
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            generated_ids = model.generate(**encoded, **generation_options)

        texts = decode_generations(generated_ids, encoded["input_ids"], processor.tokenizer)

        records.extend(
            {
                "id": sample.get("image_id"),
                "caption": sample.get("caption"),
                "cui": sample.get("cui"),
                "generation": text.strip(),
            }
            for sample, text in zip(batch, texts)
        )

    return records


def save_jsonl(records: Iterable[Dict[str, Any]], path: Path) -> None:
    """Write records to ``path`` as JSON Lines (one JSON object per line).

    The parent directory is created if it does not exist, and any existing
    file at ``path`` is overwritten. The file is written as UTF-8.

    Args:
        records: Iterable of JSON-serializable dicts.
        path: Destination file path.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)


def score_batch_dataset(
    datasets: Dict[str, Any],
    model,
    processor,
    system_message: str,
    user_prompt: str,
    batch_size: int = 4,
    max_samples: Optional[int] = None,
    gen_kwargs: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Score every dataset in ``datasets`` and return all generation records.

    Each dataset is passed to ``score_dataset`` and the resulting records are
    concatenated in the iteration order of ``datasets``. With a single
    dataset the result is exactly that dataset's records; with no datasets
    an empty list is returned.

    Args:
        datasets: Mapping of dataset name to dataset.
        model: Model forwarded to ``score_dataset``.
        processor: Processor forwarded to ``score_dataset``.
        system_message: System turn text.
        user_prompt: User turn text.
        batch_size: Number of samples per generation call.
        max_samples: Optional per-dataset cap on scored samples.
        gen_kwargs: Extra keyword arguments for ``model.generate``.

    Returns:
        A flat list of the record dicts produced by ``score_dataset``.
    """
    all_results: List[Dict[str, Any]] = []
    for ds in datasets.values():
        # Accumulate instead of overwriting so earlier datasets are not lost
        all_results.extend(
            score_dataset(
                dataset=ds,
                model=model,
                processor=processor,
                system_message=system_message,
                user_prompt=user_prompt,
                batch_size=batch_size,
                max_samples=max_samples,
                gen_kwargs=gen_kwargs,
            )
        )
    return all_results

In [None]:
# Batching settings for the scoring run
batch_size = 8
max_samples = None  # set to an int to cap samples per dataset

# Generation settings: sampling disabled, at most 128 new tokens per sample
gen_kwargs = {"max_new_tokens": 128, "do_sample": False}

# Score the loaded datasets and keep the generation records
results = score_batch_dataset(
    datasets,
    model,
    processor,
    system_message,
    user_prompt,
    batch_size=batch_size,
    max_samples=max_samples,
    gen_kwargs=gen_kwargs,
)

# Save Results

In [None]:
# Label for this scoring run; the save cell uses it as the output file name stem
Experiment_Name = '4Bit_Qunat_Gemma_YL_test_trained_model_check'

In [None]:
# Save results as JSON Lines under the Score_Results folder on Drive
output_folder = "/content/drive/MyDrive/DL_Project_2025/Score_Results"
output_path = os.path.join(output_folder, f"{Experiment_Name}.jsonl")

# Reuse the notebook's save_jsonl helper instead of duplicating the write loop;
# it creates the parent folder if needed and writes one JSON object per line.
save_jsonl(results, Path(output_path))

# Evaluate Results

In [None]:
import pandas as pd
# Load the saved JSONL results back, one record per row
df = pd.read_json(path_or_buf=output_path, lines=True)

In [None]:
import re
import pandas as pd

# Section patterns for the generated text. Both stop at the next section
# header (or the end of the text) so one section never absorbs another.
CAPTION_RE = re.compile(
    r"Caption:\s*(.*?)\s*(?:\nConcept descriptions:|\nConcepts:|$)",
    flags=re.S
)
# Non-greedy, and terminated by a following "Concept descriptions:" header,
# so description text is not split into bogus concept entries.
CONCEPTS_RE = re.compile(
    r"Concepts:\s*(.*?)\s*(?:\nConcept descriptions:|$)",
    flags=re.S
)

def extract_caption_and_concepts(text: str):
    """Parse a generation into its caption and its list of concepts.

    Args:
        text: The generated text. Non-string values (``None``, NaN, ...) are
            treated as an empty string instead of raising.

    Returns:
        A ``pd.Series`` with two entries:
        "caption_extracted" — the text after "Caption:" up to the next
        section header or the end of the text ("" if absent);
        "concepts_list" — the comma-separated items after "Concepts:", each
        stripped, with empty items dropped ([] if absent).
    """
    # Missing values read back from a DataFrame can be NaN (a float), which has no .strip()
    if not isinstance(text, str):
        text = ""
    text = text.strip()

    # Caption
    m_cap = CAPTION_RE.search(text)
    caption = m_cap.group(1).strip() if m_cap else ""

    # Concepts (as a list); split on commas when several are present
    m_con = CONCEPTS_RE.search(text)
    concepts_str = m_con.group(1).strip() if m_con else ""
    concepts = [c.strip() for c in concepts_str.split(",") if c.strip()]

    return pd.Series({"caption_extracted": caption, "concepts_list": concepts})

In [None]:
# Parse every generation; the helper returns the fields "caption_extracted" and
# "concepts_list", which are assigned by position to the two target columns
# (so "concepts_list" is stored as "concepts_extracted").
generation_fields = df["generation"].apply(extract_caption_and_concepts)
df[["caption_extracted", "concepts_extracted"]] = generation_fields