In [None]:
!pip install -q -U bitsandbytes accelerate transformers peft huggingface_hub tqdm pandas==2.2.2


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


# Input and output locations.
DATA_PATH    = "/content/DL/train.csv"              # CheXpert dataset
INDIANA_PATH = "/content/DL/indiana_reports.csv"    # Indiana reports (few-shot)
# NOTE(review): the file name says "10k", but one run covers END_INDEX - START_INDEX rows.
OUTPUT_FILE  = "/content/DL/final_reports_tinyllama_fewshot_10k.csv"

# Positional row window of the CheXpert table handled by this run.
START_INDEX   = 45000
END_INDEX     = 46000          # exclusive upper bound: 46000 - 45000 = 1 000 samples
BATCH_SIZE    = 2              # safe for Colab T4
MAX_NEW_TOKENS = 100           # token budget per report; roughly 50 words by the author's estimate
TEMPERATURE   = 0.6            # sampling temperature passed to generate()
TOP_P         = 0.9            # nucleus-sampling threshold passed to generate()

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print("‚úÖ Configuration ready.")


‚úÖ Configuration ready.


In [None]:
print(f"üîÑ Loading model: {MODEL_NAME}")

# Set to False to skip the 4-bit attempt and load the regular weights directly.
use_bnb = True

# The tokenizer does not depend on how the weights are loaded, so fetch it once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = None
if use_bnb:
    try:
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto"
        ).eval()
    except Exception as e:
        # Best-effort: any failure of the quantized path falls through to the plain load below.
        print("‚ö†Ô∏è Quantized load failed, retrying non-quantized load:", e)

if model is None:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto").eval()

# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Batched prompts have different lengths. A decoder-only model continues from the
# last position of every row, so padding must go on the left; with right padding
# the shorter prompts would be continued from pad tokens.
tokenizer.padding_side = "left"
model.config.pad_token_id = tokenizer.eos_token_id
# NOTE(review): debugging switch kept from the original; it is set after the model
# has been loaded, so presumably it no longer affects this process - confirm it is still wanted.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

print("‚úÖ Model + tokenizer loaded successfully!")


üîÑ Loading model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

‚úÖ Model + tokenizer loaded successfully!


In [None]:
# Read the CheXpert label table. quoting=3 is csv.QUOTE_NONE (quote characters are
# treated as ordinary text) and on_bad_lines="skip" drops malformed lines silently.
# NOTE(review): skipped lines shift every later position, so START_INDEX/END_INDEX
# presumably do not map 1:1 to lines of the raw file - confirm if exact rows matter.
chexpert_df = pd.read_csv(DATA_PATH, engine="python", on_bad_lines="skip", quoting=3)
# Indiana (IU X-ray) reports; later cells read their "uid" and "findings" columns.
iuxray_df   = pd.read_csv(INDIANA_PATH)

# Keep only positions [START_INDEX, END_INDEX) and renumber the rows from 0.
chexpert_df = chexpert_df.iloc[START_INDEX:END_INDEX].reset_index(drop=True)
print(f"‚úÖ CheXpert rows: {len(chexpert_df)} | IU-Xray rows: {len(iuxray_df)}")

# Label columns are "No Finding" and every column to its right.
label_cols = chexpert_df.columns[chexpert_df.columns.get_loc("No Finding"):].tolist()
print("üßπ Label columns:\n", label_cols)


‚úÖ CheXpert rows: 1000 | IU-Xray rows: 3851
üßπ Label columns:
 ['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture', 'Support Devices']


In [None]:
def safe_format_labels(row):
    """Render the label values of one row as a comma-separated description.

    Args:
        row: mapping-like object with a ``get`` method (a DataFrame row or a
            dict) that is queried once for every name in ``label_cols``.

    Returns:
        A string of the form ``"<column>: <state>, <column>: <state>, ..."``
        where the state is "Positive" for 1.0, "Negative" for 0.0,
        "Uncertain" for -1.0 and "Not Mentioned" for missing values and for
        any other value.
    """
    known_states = ((1.0, "Positive"), (0.0, "Negative"), (-1.0, "Uncertain"))

    def describe(value):
        # Missing values are never compared against the numeric codes.
        if not pd.isna(value):
            for code, wording in known_states:
                if value == code:
                    return wording
        return "Not Mentioned"

    return ", ".join(f"{name}: {describe(row.get(name))}" for name in label_cols)

# Add a "formatted_labels" column holding the label description of every row
# (safe_format_labels is applied row-wise because of axis=1).
chexpert_df["formatted_labels"] = chexpert_df.apply(safe_format_labels, axis=1)
print("‚úÖ Labels formatted.")


‚úÖ Labels formatted.


In [None]:
# Report ids looked up in the "uid" column of the IU X-ray table; the matching
# "findings" text becomes the target side of a few-shot example.
example_uids = [1, 2, 4, 5, 6, 3995]
example_findings = []
for uid in example_uids:
    try:
        finding = iuxray_df.loc[iuxray_df["uid"] == uid, "findings"].values[0]
        example_findings.append(finding.replace('"','').strip())
    except Exception as err:
        # Still best-effort (unknown uid, empty/non-text findings, missing column),
        # but no longer silent and no longer swallowing KeyboardInterrupt/SystemExit.
        print(f"Warning: no usable findings for uid {uid} ({type(err).__name__}: {err}); using the default text.")
        example_findings.append("No acute cardiopulmonary abnormality.")

# Hand-written label strings, paired with the findings above by position.
example_labels = [
    "No Finding: Positive, Enlarged Cardiomediastinum: Negative, Cardiomegaly: Negative, Lung Opacity: Negative, Pleural Effusion: Negative",
    "No Finding: Negative, Cardiomegaly: Positive, Pleural Effusion: Positive",
    "No Finding: Negative, Lung Opacity: Positive, Edema: Uncertain",
    "No Finding: Negative, Pneumothorax: Positive, Support Devices: Positive",
    "No Finding: Negative, Cardiomegaly: Uncertain, Lung Lesion: Uncertain",
    "No Finding: Negative, Cardiomegaly: Positive, Lung Opacity: Positive, Pleural Effusion: Positive"
]

# zip() would silently drop the surplus entries, so a length mismatch is reported instead.
if len(example_labels) != len(example_findings):
    raise ValueError(
        f"{len(example_labels)} label strings but {len(example_findings)} findings; "
        "the two lists must pair up one-to-one"
    )

few_shot_examples = [{"labels": l, "findings": f} for l, f in zip(example_labels, example_findings)]
print(f"‚úÖ Loaded {len(few_shot_examples)} few-shot examples.")


‚úÖ Loaded 6 few-shot examples.


In [None]:
def build_fewshot_prompt(input_labels):
    """Assemble the complete few-shot prompt for one set of labels.

    The prompt consists of a fixed instruction paragraph, every entry of
    ``few_shot_examples`` rendered as an "Input Labels / Findings" pair, and
    finally the query labels followed by an open "Findings:" slot. Examples
    and query are separated by a "---" divider line.

    Args:
        input_labels: label description to generate a report for.

    Returns:
        The prompt as a single string ending in "Findings:".
    """
    divider = "\n\n---\n\n"

    instructions = (
        "You are an expert radiologist AI. Generate only the final concise and professional 'Findings' section "
        "for a chest X-ray based solely on the structured labels below. "
        "Do not repeat labels or instructions. Output must contain only the report text. "
        "If all labels are Negative or Not Mentioned, respond exactly with 'No acute cardiopulmonary abnormality.' "
        "Limit to ‚âà 50 words.\n\nHere are examples:\n\n"
    )

    shots = []
    for shot in few_shot_examples:
        shots.append(f"Input Labels:\n{shot['labels']}\nFindings:\n{shot['findings']}")

    # The query repeats the example layout but leaves the findings open.
    query = f"Input Labels:\n{input_labels}\nFindings:"

    return instructions + divider.join(shots) + divider + query


In [None]:
import re

def clean_output(text):
    """Extract the report text from a decoded generation.

    Keeps what follows the last "Findings:" marker (the whole text when the
    marker is absent), cuts it at the first "---" divider or "Input Labels:"
    header, and trims surrounding whitespace and double quotes.

    Args:
        text: decoded model output.

    Returns:
        The cleaned report string; may be empty.
    """
    # rpartition yields the full string as the tail when the marker is missing.
    tail = text.rpartition("Findings:")[2]

    boundary = re.search(r"---|Input Labels:|\nInput Labels:", tail)
    if boundary is not None:
        tail = tail[:boundary.start()]

    trimmed = tail.strip()
    unquoted = trimmed.strip('"')
    return unquoted.strip()

def generate_batch(prompts):
    """Generate one cleaned report per prompt.

    Args:
        prompts: list of prompt strings; all of them are tokenized and
            generated together as a single padded batch.

    Returns:
        A list of cleaned report strings in the order of ``prompts``
        (an empty list for an empty batch).
    """
    if not prompts:
        return []

    # Drop anything that cannot be encoded as UTF-8 before tokenizing.
    clean_prompts = [p.encode("utf-8", errors="ignore").decode("utf-8") for p in prompts]
    inputs = tokenizer(
        clean_prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048
    ).to(model.device)  # follow the model instead of assuming a GPU is present

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            do_sample=True
        )

    # generate() returns the (padded) prompt followed by the new tokens. Decode only
    # the new tokens: searching the full text for the last "Findings:" would pick up
    # a follow-up example whenever the model keeps the few-shot pattern going.
    prompt_width = inputs["input_ids"].shape[1]
    continuations = tokenizer.batch_decode(outputs[:, prompt_width:], skip_special_tokens=True)

    reports = []
    for continuation in continuations:
        # The answer ends where the model starts another example block.
        answer = re.split(r"---|Input Labels:", continuation, maxsplit=1)[0]
        reports.append(clean_output(answer))
    return reports

print("‚úÖ Generation function ready.")


‚úÖ Generation function ready.


In [None]:
# Smoke test: generate reports for the first five rows and print them.
test_df = chexpert_df.head(5)
test_prompts = [build_fewshot_prompt(labels) for labels in test_df["formatted_labels"]]
print("üß™ Testing few-shot generation on 5 samples‚Ä¶\n")

test_outputs = generate_batch(test_prompts)
rule = "-" * 80  # separator line printed after every sample
for sample_no, report in enumerate(test_outputs, start=1):
    print(f"ü©∫ SAMPLE {sample_no}:\n{report}\n{rule}")


üß™ Testing few-shot generation on 5 samples‚Ä¶

ü©∫ SAMPLE 1:
Negative, No acute cardiopulmonary abnormality. There is no XXXX of a pleural effusion. There is no XXXX of a pneumothorax. There is no XXXX of a fracture. There is no XXXX of support devices. There is no XXXX of a lung lesion. There is no XXXX of a pleural effusion. There is no XXXX of a pneumothorax. There is no XX
--------------------------------------------------------------------------------
ü©∫ SAMPLE 2:
The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size. There is no pneumothorax or pleural effusion. There is no acute bony findings. There is no XXXX of a pleural effusion. There is no focal scarring or atelectasis in the right midlung. There is no XXXX of a pleural effusion. There are no acute cardiopulmonary
--------------------------------------------------------------------------------
ü©∫ SAMPLE 3:
There is no acute cardiomegaly, no cardiomegaly, no lung opacities, no at

In [None]:
# Number of newly generated samples between two autosaves.
AUTOSAVE_EVERY = 200

generated_reports = []
# Image paths for the output file; placeholders when the table has no "Path" column.
paths = chexpert_df.get("Path", [None]*len(chexpert_df))
total = len(chexpert_df)
last_saved = 0  # value of `end` at the most recent autosave

print(f"üöÄ Starting full generation for {total} samples‚Ä¶")

for start in tqdm(range(0, total, BATCH_SIZE), desc="‚ö° Generating Reports"):
    end = min(start + BATCH_SIZE, total)
    batch = chexpert_df.iloc[start:end]
    prompts = [build_fewshot_prompt(r["formatted_labels"]) for _, r in batch.iterrows()]

    try:
        outs = generate_batch(prompts)
        # Reports are matched to paths by position, so the counts must agree.
        if len(outs) != len(batch):
            raise RuntimeError(f"expected {len(batch)} reports, got {len(outs)}")
        generated_reports.extend(outs)
    except Exception as e:
        # Keep the run alive; the placeholder preserves the row alignment.
        print(f"‚ö†Ô∏è Error batch {start}-{end}: {e}")
        generated_reports.extend(["Error generating report"] * len(batch))

    # Autosave once at least AUTOSAVE_EVERY new samples exist, and at the very end.
    # Counting from the last save also works when BATCH_SIZE does not divide the interval.
    if (end - last_saved >= AUTOSAVE_EVERY) or (end == total):
        pd.DataFrame({
            "Path": paths[:len(generated_reports)],
            "Report_Impression": generated_reports
        }).to_csv(OUTPUT_FILE, index=False)
        last_saved = end
        print(f"üíæ Progress saved at {end}/{total}")

print(f"\nüéâ Completed {len(generated_reports)} samples!")
print(f"üìÅ Final output: {OUTPUT_FILE}")


üöÄ Starting full generation for 1000 samples‚Ä¶


‚ö° Generating Reports:  20%|‚ñà‚ñà        | 100/500 [15:31<1:02:09,  9.32s/it]

üíæ Progress saved at 200/1000


‚ö° Generating Reports:  40%|‚ñà‚ñà‚ñà‚ñà      | 200/500 [31:14<45:51,  9.17s/it]

üíæ Progress saved at 400/1000


‚ö° Generating Reports:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 300/500 [46:37<30:38,  9.19s/it]

üíæ Progress saved at 600/1000


‚ö° Generating Reports:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 400/500 [1:01:58<15:22,  9.22s/it]

üíæ Progress saved at 800/1000


‚ö° Generating Reports: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [1:17:13<00:00,  9.27s/it]

üíæ Progress saved at 1000/1000

üéâ Completed 1000 samples!
üìÅ Final output: /content/DL/final_reports_tinyllama_fewshot_10k.csv





In [None]:
# Reload the saved file to confirm what was actually written to disk.
out_df = pd.read_csv(OUTPUT_FILE)
print(f"‚úÖ Generated {len(out_df)} reports!\n")
# sample() raises when asked for more rows than the frame holds, so cap the request.
print(out_df.sample(min(5, len(out_df))))


‚úÖ Generated 1000 reports!

                                       Path  \
265   patient11024/study1/view2_lateral.jpg   
730  patient11139/study10/view1_frontal.jpg   
178   patient11015/study1/view1_frontal.jpg   
981   patient11198/study4/view1_frontal.jpg   
852   patient11169/study1/view1_frontal.jpg   

                                     Report_Impression  
265  There are diffuse bilateral interstitial and a...  
730  The cardiomediastinal silhouette and pulmonary...  
178  There is no evidence of acute cardiopulmonary ...  
981  The cardiomediastinal silhouette and pulmonary...  
852  There is diffuse bilateral interstitial and al...  
