<h2><span style="color:blue; font-weight:bold;">T5 Small Model</span></h2>


In [None]:
from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
import pandas as pd
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from evaluate import load
import torch

# Load the raw QTL records. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
with open("QTL_text.json", encoding="utf-8") as f:
    raw = json.load(f)
df = pd.DataFrame(raw)

# Drop metadata columns if present and discard rows lacking either the model
# input (Abstract) or the target (Title).
df = df.drop(columns=["PMID", "Journal", "Category"], errors="ignore")
df = df.dropna(subset=["Abstract", "Title"]).reset_index(drop=True)

# Fixed seed so the 90/10 train/validation split is reproducible.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# preserve_index=False: after the shuffled split the frames no longer have a
# plain range index, and from_pandas would otherwise carry it along as an
# extra "__index_level_0__" column.
datasets = DatasetDict(
    {
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
    }
)

model_name = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Token budgets for the encoder input (abstract) and decoder target (title).
max_input_len = 256
max_target_len = 64


def tokenize_batch(batch):
    """Tokenize one batch of abstracts (inputs) and titles (labels).

    Args:
        batch: mapping with "Abstract" and "Title" lists of strings, as
            supplied by ``datasets.map(..., batched=True)``.

    Returns:
        dict with the tokenizer outputs for the prefixed abstracts plus a
        "labels" entry holding the title token ids.

    Sequences are truncated but deliberately NOT padded here. Padding titles
    to ``max_length`` would fill the labels with real pad-token ids, which
    the loss would then be trained on. Padding is left to the seq2seq batch
    collator, which pads per batch and masks label padding.
    """
    features = dict(
        tokenizer(
            [f"generate title: {t}" for t in batch["Abstract"]],
            max_length=max_input_len,
            truncation=True,
        )
    )
    features["labels"] = tokenizer(
        batch["Title"],
        max_length=max_target_len,
        truncation=True,
    )["input_ids"]
    return features


tokenized = datasets.map(
    tokenize_batch,
    batched=True,
    remove_columns=["Abstract", "Title"],
)

# Batch collator for encoder-decoder training; it receives the model so it can
# prepare decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# NOTE(review): eval_steps is set but no evaluation strategy is passed, so
# whether periodic evaluation actually runs during training depends on the
# installed transformers defaults -- confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir="t5_small_title_gen",
    do_train=True,
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=4,
    predict_with_generate=True,
    generation_max_length=max_target_len,
    generation_num_beams=8,
    # Mixed precision needs a CUDA device; requesting it unconditionally
    # breaks the CPU fallback this notebook otherwise supports.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
    save_total_limit=2,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# The model may still be in training mode after trainer.train(); generate()
# does not change the mode, so switch dropout off explicitly before decoding.
model.eval()

batch_size = 8


def generate_titles(abstracts, batch_size=8, num_beams=8):
    """Generate one title per abstract with beam search.

    Args:
        abstracts: list of abstract strings (without the task prefix).
        batch_size: number of abstracts encoded and decoded together.
        num_beams: beam width passed to ``model.generate``.

    Returns:
        List of decoded title strings, in the same order as ``abstracts``.
    """
    prompts = [f"generate title: {a}" for a in abstracts]
    titles = []
    for start in range(0, len(prompts), batch_size):
        enc = tokenizer(
            prompts[start : start + batch_size],
            max_length=max_input_len,
            truncation=True,
            # Pad only to the longest prompt of the batch; the attention
            # mask hides the padding either way.
            padding=True,
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            out = model.generate(
                input_ids=enc["input_ids"],
                attention_mask=enc["attention_mask"],
                max_length=max_target_len,
                num_beams=num_beams,
            )

        titles.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    return titles


bleu = load("bleu")
rouge = load("rouge")


def score_titles(split_name, preds, refs):
    """Print BLEU, ROUGE-2 and ROUGE-L for one split and return the scores.

    Args:
        split_name: label used as the prefix of each printed line.
        preds: list of generated titles.
        refs: list of reference titles, aligned with ``preds``.

    Returns:
        ``(bleu_score, rouge_scores)`` where ``bleu_score`` is scaled to
        0-100 and ``rouge_scores`` is the dict returned by the ROUGE metric.
    """
    bleu_score = bleu.compute(
        predictions=preds,
        references=[[r] for r in refs],
    )["bleu"] * 100
    print(f"\n{split_name} BLEU: {bleu_score:.2f}")

    rouge_scores = rouge.compute(
        predictions=preds,
        references=refs,
        rouge_types=["rouge2", "rougeL"],
    )
    print(f"{split_name} ROUGE-2: {rouge_scores['rouge2']:.4f}")
    print(f"{split_name} ROUGE-L: {rouge_scores['rougeL']:.4f}")
    return bleu_score, rouge_scores


# --- validation split ---
val_abstracts = val_df["Abstract"].fillna("").tolist()
all_val_preds = generate_titles(val_abstracts, batch_size=batch_size)
val_refs = val_df["Title"].tolist()
val_bleu, val_rouge = score_titles("Dev", all_val_preds, val_refs)

# --- held-out test split ---
test_df = pd.read_csv("QTL_test_labeled.tsv", sep="\t")
test_abstracts = test_df["Abstract"].fillna("").tolist()
test_preds = generate_titles(test_abstracts, batch_size=batch_size)
test_df["Predicted_Title"] = test_preds

test_refs = test_df["Title"].fillna("").tolist()
test_bleu, test_rouge = score_titles("TEST", test_preds, test_refs)

test_df.to_csv("QTL_test_with_predicted_titles.tsv", sep="\t", index=False)
print("Test predictions saved to QTL_test_with_predicted_titles.tsv")

<h2><span style="color:blue; font-weight:bold;">Mistral 7B</span></h2>

In [None]:
import json
import random
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import evaluate

# basic config
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
TRAIN_FILE = "QTL_text.json"
TEST_FILE  = "QTL_test_labeled.tsv"

DEV_OUTPUT_FILE  = "mistral_dev_zero_shot_10shot.tsv"
TEST_OUTPUT_FILE = "mistral_test_zero_shot_10shot.tsv"

MAX_GEN_LEN = 32   # cap on newly generated tokens per title
RANDOM_SEED = 42   # shared by the split and the few-shot sampling below
random.seed(RANDOM_SEED)

USE_CPU_ONLY = False  # set True to force the CPU code path even if CUDA exists

# load train data (JSON is UTF-8 by specification; do not depend on the
# platform default encoding)
with open(TRAIN_FILE, encoding="utf-8") as f:
    raw = json.load(f)

df = pd.DataFrame(raw)
df = df.drop(columns=["PMID", "Journal", "Category"], errors="ignore")
df = df.dropna(subset=["Abstract", "Title"]).reset_index(drop=True)

# Same 90/10 split parameters as the T5 cell, so both models see the same dev set.
train_df, dev_df = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
dev_data = dev_df.to_dict(orient="records")
print(f"DEV size: {len(dev_data)}")

# few-shot examples: drawn from the training portion only, never from dev/test
n_shots = min(10, len(train_df))
few_shot_examples = train_df.sample(n=n_shots, random_state=RANDOM_SEED)[["Abstract", "Title"]].to_dict(orient="records")
print(f"Using {n_shots} examples in the prompt.")

# load test data
test_df_full = pd.read_csv(TEST_FILE, sep="\t", header=0)
# Missing cells are read as NaN; replace them with empty strings (as the T5
# cell does) so they neither end up in prompts as the text "nan" nor reach
# the metrics as non-string references.
test_data = test_df_full.fillna({"Abstract": "", "Title": ""}).to_dict(orient="records")
print(f"TEST size: {len(test_data)}")

# load model + tokenizer
print("Loading Mistral-7B (4-bit)...")

# Quantisation settings; only consumed by the GPU branch below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# GPU path requires both: the CPU override is off and CUDA is actually present.
run_on_gpu = not USE_CPU_ONLY and torch.cuda.is_available()

if run_on_gpu:
    device = torch.device("cuda")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
    )
    print("Model on GPU:", model.device)
else:
    # No quantisation config is passed on this path.
    device = torch.device("cpu")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
    print("Model on CPU.")


def truncate_text(text, max_chars):
    """Flatten ``text`` to one line and cut it to at most ``max_chars`` characters.

    Args:
        text: value to render; non-strings are converted with ``str()``.
            ``None`` and float NaN are treated as missing.
        max_chars: maximum number of characters kept before whitespace is
            stripped from both ends.

    Returns:
        The single-line, truncated, stripped string; ``""`` for missing input.
    """
    # Missing values would otherwise be stringified and injected into the
    # prompt as the literal words "None" / "nan". NaN is the only float that
    # differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Slicing is a no-op for short strings, so no length check is needed.
    return str(text).replace("\n", " ")[:max_chars].strip()


def build_prompt(abstract):
    """Build the few-shot title-generation prompt for one abstract.

    The prompt consists of an instruction header, one "Example i" section per
    entry of the module-level ``few_shot_examples`` (abstract cut to 250
    characters, title to 120), and finally the target abstract (cut to 300
    characters) followed by an open "Title:" cue for the model to complete.

    Args:
        abstract: abstract text to generate a title for.

    Returns:
        The prompt string, sections joined with newlines.
    """
    parts = []
    parts.append(
        "You are an expert biomedical editor. "
        "Given an abstract, generate a concise scientific title similar to the examples.\n"
        # The word range previously contained a mis-encoded en dash ("â€“");
        # a plain ASCII hyphen cannot be mangled by encoding round-trips.
        "Use around 10-18 words.\n"
    )

    for i, ex in enumerate(few_shot_examples, start=1):
        ex_abs = truncate_text(ex["Abstract"], 250)
        ex_title = truncate_text(ex["Title"], 120)
        parts.append(
            f"\nExample {i}\n"
            f"Abstract: {ex_abs}\n"
            f"Title: {ex_title}\n"
        )

    cur_abs = truncate_text(abstract, 300)
    parts.append(
        "\nNow generate a title.\n"
        f"Abstract: {cur_abs}\n"
        "Title:"
    )

    return "\n".join(parts)


def clean_title(text):
    """Reduce raw model output to a bare one-line title.

    Steps, in order: trim whitespace, drop a leading "title:" label
    (case-insensitive), keep only the first line, remove double quotes,
    strip trailing periods, and trim once more.

    Args:
        text: decoded model output.

    Returns:
        The cleaned title string (possibly empty).
    """
    label = "title:"
    candidate = text.strip()
    if candidate.lower().startswith(label):
        candidate = candidate[len(label):].strip()

    # Everything after the first newline is discarded.
    first_line, _, _ = candidate.partition("\n")
    unquoted = first_line.strip().replace('"', "")
    return unquoted.rstrip(".").strip()


def generate_title(abstract):
    """Generate a cleaned title for one abstract with the loaded causal LM.

    Builds the few-shot prompt, decodes greedily (single beam, no sampling)
    for at most ``MAX_GEN_LEN`` new tokens, and post-processes the
    continuation with ``clean_title``.

    Args:
        abstract: abstract text to generate a title for.

    Returns:
        The cleaned title, or ``""`` if the model produced no new tokens.
    """
    prompt = build_prompt(abstract)
    # NOTE(review): truncation=True without max_length relies on the
    # tokenizer's configured model_max_length; if a prompt were ever cut from
    # the end, the trailing "Title:" cue would be lost -- confirm prompts fit.
    enc = tokenizer(prompt, truncation=True, return_tensors="pt").to(model.device)
    input_len = enc["input_ids"].shape[1]

    with torch.no_grad():
        out = model.generate(
            **enc,
            max_new_tokens=MAX_GEN_LEN,
            num_beams=1,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    # The returned sequence starts with the prompt; keep only the continuation.
    new_tokens = out[0][input_len:]
    if new_tokens.numel() == 0:
        # Decoding the full sequence here would hand the prompt itself to
        # clean_title, which would return the prompt's first line as a
        # "title". An empty prediction is the honest result.
        return ""

    return clean_title(tokenizer.decode(new_tokens, skip_special_tokens=True))


bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")


def show_metrics(preds, refs, name):
    """Print BLEU, ROUGE-2 and ROUGE-L of ``preds`` against ``refs``.

    Args:
        preds: list of generated titles.
        refs: list of reference titles, aligned with ``preds``.
        name: label shown in the banner line above the scores.
    """
    # BLEU takes a list of references per prediction; ROUGE takes them flat.
    wrapped_refs = [[ref] for ref in refs]
    bleu_scores = bleu_metric.compute(predictions=preds, references=wrapped_refs)
    rouge_scores = rouge_metric.compute(predictions=preds, references=refs)

    report = [
        f"\n=== {name} ===",
        f"BLEU    : {bleu_scores['bleu']:.4f}",
        f"ROUGE-2 : {rouge_scores['rouge2']:.4f}",
        f"ROUGE-L : {rouge_scores['rougeL']:.4f}",
    ]
    print("\n".join(report))


def run_split(records, tag, output_file, metrics_name):
    """Generate titles for one split, save them as TSV and print the metrics.

    Args:
        records: list of dicts with "Abstract" and "Title" keys.
        tag: short label used in the progress lines (e.g. "DEV").
        output_file: path of the tab-separated file to write.
        metrics_name: banner text passed to ``show_metrics``.

    Returns:
        DataFrame with columns Abstract, Title and GeneratedTitle.
    """
    total = len(records)
    rows = []
    for i, rec in enumerate(records, start=1):
        rows.append({
            "Abstract": rec["Abstract"],
            "Title": rec["Title"],
            "GeneratedTitle": generate_title(rec["Abstract"]),
        })
        # Progress every 20 items and once more at the very end.
        if i % 20 == 0 or i == total:
            print(f"{tag} {i}/{total}")

    # Explicit columns keep the frame well-formed even for an empty split.
    frame = pd.DataFrame(rows, columns=["Abstract", "Title", "GeneratedTitle"])
    frame.to_csv(output_file, sep="\t", index=False)
    if rows:
        # Only score when there is at least one prediction.
        show_metrics(frame["GeneratedTitle"].tolist(), frame["Title"].tolist(), metrics_name)
    return frame


print("\nGenerating DEV titles...")
dev_df_out = run_split(dev_data, "DEV", DEV_OUTPUT_FILE, "DEV (10-shot, full)")


print("\nGenerating TEST titles...")
test_df_out = run_split(test_data, "TEST", TEST_OUTPUT_FILE, "TEST (10-shot, full)")