In [None]:
# ✅ Compatible pins for Google Colab (Aug 2025). Keeps Transformers v4 to avoid API changes.
# Install fsspec first to avoid dependency conflicts
!pip -q install "fsspec==2025.3.0"
!pip -q install -U   "transformers>=4.44,<5"   accelerate   "datasets>=2.19,<3"   "evaluate>=0.4,<0.5"   "rouge-score>=0.1.2,<0.2"   "bert-score>=0.3.13,<0.4"   "pandas==2.2.2"   "scikit-learn<1.7"   "pyarrow>=14,<20"

import transformers, torch, pandas as pd, importlib
print("Transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())
print("pandas:", pd.__version__)
print("scikit-learn:", importlib.import_module("sklearn").__version__)
print("pyarrow:", importlib.import_module("pyarrow").__version__)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m120.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m17.9 MB/s[0m eta [36m0:

In [None]:
import os, sys, math, re, json, warnings
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate # Import evaluate
import numpy as np

print("Python:", sys.version)
if not torch.cuda.is_available():
    raise SystemExit("❌ No GPU detected. In Colab, go to Runtime → Change runtime type → Hardware accelerator → GPU, then rerun.")

gpu_name = torch.cuda.get_device_name(0)
vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
print(f"✅ Detected GPU: {gpu_name} | VRAM ≈ {vram_gb:.1f} GB")

# Recommendation note (A100 > L4 > T4); execution continues regardless.
rec = "A100 (best), then L4, then T4."
print("Recommendation:", rec)

# Silence the known T5 GenerationMixin warning (informational only)
warnings.filterwarnings("ignore", message="T5ForConditionalGeneration has generative capabilities")

Python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
✅ Detected GPU: NVIDIA A100-SXM4-40GB | VRAM ≈ 39.6 GB
Recommendation: A100 (best), then L4, then T4.


In [None]:
USE_DRIVE = True  # set False to keep everything in /content
DATA_DIR  = "/content/drive/MyDrive/radiology_summarisation"
OUTPUT_DIR = "/content/drive/MyDrive/radiology_summarisation/models/flan_t5_e2e"

if USE_DRIVE:
    from google.colab import drive
    drive.mount("/content/drive")
else:
    DATA_DIR = "/content"
    OUTPUT_DIR = "/content/models/flan_t5_e2e"

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("DATA_DIR:", DATA_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)

Mounted at /content/drive
DATA_DIR: /content/drive/MyDrive/radiology_summarisation
OUTPUT_DIR: /content/drive/MyDrive/radiology_summarisation/models/flan_t5_e2e


In [None]:
import os
import pandas as pd
from google.colab import files

# Set the path for the uploaded file
REPORTS_CSV_PATH = "mimic_cxr_clean.csv"  # This will be the filename after upload

# Check if the file exists, and prompt for upload if not
if not os.path.exists(REPORTS_CSV_PATH):
    print("Upload your mimic_cxr_clean.csv ...")
    uploaded = files.upload()  # This will prompt for a file upload
    for k in uploaded.keys():
        if k.lower().endswith(".csv"):
            REPORTS_CSV_PATH = k
            break

print("Using:", REPORTS_CSV_PATH)

# Try different encodings to read the file
encodings_to_try = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
last_err = None
df = None

for enc in encodings_to_try:
    try:
        df = pd.read_csv(REPORTS_CSV_PATH, encoding=enc)
        print(f"Loaded CSV with encoding: {enc}")
        break
    except Exception as e:
        last_err = e

# If the DataFrame is not loaded, raise an error
if df is None:
    raise RuntimeError(f"Failed to load CSV at {REPORTS_CSV_PATH}. Last error: {last_err}")

# Display the columns and the first few rows
print("Columns:", df.columns.tolist())
print(df.head())

Upload your mimic_cxr_clean.csv ...


Saving mimic_cxr_clean.csv to mimic_cxr_clean.csv
Using: mimic_cxr_clean.csv
Loaded CSV with encoding: utf-8
Columns: ['input_text', 'target_summary']
                                          input_text  \
0  There is no focal consolidation, pleural effus...   
1  The cardiac, mediastinal and hilar contours ar...   
2  Single frontal view of the chest provided.\n \...   
3  The lungs are clear of focal consolidation, pl...   
4  PA and lateral views of the chest provided.   ...   

                                      target_summary  
0                  No acute cardiopulmonary process.  
1              No acute cardiopulmonary abnormality.  
2                    No acute intrathoracic process.  
3                  No acute cardiopulmonary process.  
4  Focal consolidation at the left lung base, pos...  


In [None]:
encodings_to_try = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
df, last_err = None, None
for enc in encodings_to_try:
    try:
        df = pd.read_csv(REPORTS_CSV_PATH, encoding=enc)
        print("Loaded CSV with encoding:", enc)
        break
    except Exception as e:
        last_err = e
if df is None:
    raise RuntimeError(f"Failed to read CSV at {REPORTS_CSV_PATH}. Last error: {last_err}")

print("Columns:", list(df.columns))
display(df.head(3))

Loaded CSV with encoding: utf-8
Columns: ['input_text', 'target_summary']


Unnamed: 0,input_text,target_summary
0,"There is no focal consolidation, pleural effus...",No acute cardiopulmonary process.
1,"The cardiac, mediastinal and hilar contours ar...",No acute cardiopulmonary abnormality.
2,Single frontal view of the chest provided.\n \...,No acute intrathoracic process.


In [None]:
required = {"input_text", "target_summary"}
if not required.issubset(df.columns):
    raise ValueError(f"Expected columns {required} not found. Found: {list(df.columns)}")

out = df[["input_text", "target_summary"]].rename(columns={"input_text":"findings", "target_summary":"impression"})
out = out.dropna()
out["findings"] = out["findings"].astype(str).str.strip()
out["impression"] = out["impression"].astype(str).str.strip()
out = out[(out["findings"]!="") & (out["impression"]!="")].reset_index(drop=True)

print("Resolved dataset:", out.shape)
display(out.head(3))

# Save + split
resolved_path = os.path.join(DATA_DIR, "all_resolved.csv")
out.to_csv(resolved_path, index=False)

train_df, temp_df = train_test_split(out, test_size=0.2, random_state=42)
val_df,   test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

for name, d in [("train", train_df), ("val", val_df), ("test", test_df)]:
    d.to_csv(os.path.join(DATA_DIR, f"{name}.csv"), index=False)

print("Saved splits to:", DATA_DIR)

Resolved dataset: (82038, 2)


Unnamed: 0,findings,impression
0,"There is no focal consolidation, pleural effus...",No acute cardiopulmonary process.
1,"The cardiac, mediastinal and hilar contours ar...",No acute cardiopulmonary abnormality.
2,Single frontal view of the chest provided.\n \...,No acute intrathoracic process.


Saved splits to: /content/drive/MyDrive/radiology_summarisation


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
MODEL_NAME = "google/flan-t5-base"
INPUT_MAX_LEN, TARGET_MAX_LEN = 512, 128

dataset = load_dataset(
    "csv",
    data_files={
        "train": os.path.join(DATA_DIR, "train.csv"),
        "validation": os.path.join(DATA_DIR, "val.csv"),
        "test": os.path.join(DATA_DIR, "test.csv"),
    },
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    model_inputs = tokenizer(batch["findings"], max_length=INPUT_MAX_LEN, truncation=True)
    # v4-compatible target tokenization
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["impression"], max_length=TARGET_MAX_LEN, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenised = dataset.map(preprocess, batched=True, remove_columns=["findings","impression"])
tokenised

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/65630 [00:00<?, ? examples/s]



Map:   0%|          | 0/8204 [00:00<?, ? examples/s]

Map:   0%|          | 0/8204 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 65630
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8204
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8204
    })
})

In [None]:
bf16_ok = hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()

if vram_gb >= 22:   train_bs = eval_bs = 8
elif vram_gb >= 15: train_bs = eval_bs = 4
elif vram_gb >= 10: train_bs = eval_bs = 2
else:               train_bs = eval_bs = 1

target_effective_bs = 16
grad_accum = max(1, math.ceil(target_effective_bs / train_bs))
use_bf16 = bool(bf16_ok)
use_fp16 = (not use_bf16)

# Enable gradient checkpointing for ≤15 GB VRAM
use_grad_ckpt = vram_gb <= 15

print({
    "gpu": gpu_name, "vram_gb": round(vram_gb,1),
    "train_bs": train_bs, "eval_bs": eval_bs,
    "grad_accum": grad_accum, "bf16": use_bf16, "fp16": use_fp16,
    "grad_checkpointing": use_grad_ckpt
})

{'gpu': 'NVIDIA A100-SXM4-40GB', 'vram_gb': 39.6, 'train_bs': 8, 'eval_bs': 8, 'grad_accum': 2, 'bf16': True, 'fp16': False, 'grad_checkpointing': False}


In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
if use_grad_ckpt:
    model.gradient_checkpointing_enable()
    print("Enabled gradient checkpointing.")

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    # evaluation_strategy="steps", # Added back evaluation_strategy # Removed evaluation_strategy
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=eval_bs,
    gradient_accumulation_steps=grad_accum,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    bf16=use_bf16,
    fp16=use_fp16,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenised["train"],
    eval_dataset=tokenised["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

train_result = trainer.train()
trainer.save_model(OUTPUT_DIR)
print("✅ Training complete. Model saved to:", OUTPUT_DIR)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss
100,2.1443
200,1.7984
300,1.5856
400,1.5804
500,1.5023
600,1.4945
700,1.4298
800,1.3729
900,1.3607
1000,1.3212


Step,Training Loss
100,2.1443
200,1.7984
300,1.5856
400,1.5804
500,1.5023
600,1.4945
700,1.4298
800,1.3729
900,1.3607
1000,1.3212


✅ Training complete. Model saved to: /content/drive/MyDrive/radiology_summarisation/models/flan_t5_e2e


In [None]:
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def generate_texts(max_samples=256):
    idx = list(range(len(dataset["test"])))[:max_samples]
    preds, refs = [], []
    for i in idx:
        refs.append(dataset["test"][i]["impression"])
        inputs = tokenizer(dataset["test"][i]["findings"], return_tensors="pt", truncation=True, max_length=INPUT_MAX_LEN).to(model.device)
        with torch.no_grad():
            output = model.generate(**inputs, max_length=TARGET_MAX_LEN, num_beams=4)
        preds.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return preds, refs

preds, refs = generate_texts()
rouge_scores = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
bert_scores = bertscore.compute(predictions=preds, references=refs, lang="en")

print("ROUGE:", {k: round(v,4) for k,v in rouge_scores.items()})
print("BERTScore F1 (mean):", float(np.mean(bert_scores["f1"])))

# Save metrics
with open(os.path.join(OUTPUT_DIR, "metrics.json"), "w") as f:
    json.dump({"rouge": rouge_scores, "bertscore_f1_mean": float(np.mean(bert_scores["f1"]))}, f, indent=2)
print("Saved metrics to:", os.path.join(OUTPUT_DIR, "metrics.json"))

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE: {'rouge1': np.float64(0.5589), 'rouge2': np.float64(0.449), 'rougeL': np.float64(0.5319), 'rougeLsum': np.float64(0.5442)}
BERTScore F1 (mean): 0.9177442560903728
Saved metrics to: /content/drive/MyDrive/radiology_summarisation/models/flan_t5_e2e/metrics.json


In [None]:
N = min(10, len(dataset["test"]))
import pandas as pd
samples = pd.DataFrame({
    "findings": [dataset["test"][i]["findings"] for i in range(N)],
    "reference_impression": [dataset["test"][i]["impression"] for i in range(N)],
    "model_summary": preds[:N],
})
samples

Unnamed: 0,findings,reference_impression,model_summary
0,"The heart size, mediastinal, and hilar contour...",New hyperdensity overlying the anterior right ...,No evidence of acute cardiopulmonary process. ...
1,There is streaky density bilaterally consisten...,Line placement as described. Subsegmental ate...,1. Subsegmental atelectasis. 2. Density in the...
2,Postsurgical changes include chain sutures alo...,No acute cardiopulmonary pathology. Postsurgic...,1. Postsurgical changes in the right upper med...
3,The lungs are clear without focal consolidatio...,No acute cardiopulmonary process. No displace...,No acute cardiopulmonary process.
4,"Cardiac, mediastinal and hilar contours are no...",No acute cardiopulmonary process.,No acute cardiopulmonary abnormality. Emphysema.
5,PA and lateral chest radiograph demonstrates c...,No acute intrathoracic abnormality.,No acute intrathoracic abnormality.
6,Pulmonary edema since ___ radiograph has impro...,Improved pulmonary edema as compared to radiog...,1. Interval improvement in pulmonary edema sin...
7,Portable upright chest radiograph ___ at 15:23...,The heart remains markedly enlarged. There ha...,"Right subclavian PICC line, right subclavian P..."
8,Portable semi-erect chest film ___ at 22:08 is...,Interval placement of bilateral pigtail pleura...,Interval removal of right internal jugular cen...
9,The CHF findings may be slightly improved. Oth...,As above..,The CHF findings may be slightly improved. Oth...


In [None]:
pred_path = os.path.join(OUTPUT_DIR, "test_predictions.csv")
pd.DataFrame({"prediction": preds, "reference": refs}).to_csv(pred_path, index=False)
print("Saved predictions to:", pred_path)

Saved predictions to: /content/drive/MyDrive/radiology_summarisation/models/flan_t5_e2e/test_predictions.csv
