In [1]:
!pip install datasets



In [3]:
from datasets import load_dataset

train_dataset = load_dataset("json", data_files="final_cancer_train_v3_elite.jsonl")["train"]
val_dataset   = load_dataset("json", data_files="final_cancer_val_v3_elite.jsonl")["train"]

print(f"Train examples: {len(train_dataset)}, Validation examples: {len(val_dataset)}")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Train examples: 1384, Validation examples: 154


In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.config.num_beams = 4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files={
        "train": "final_cancer_train_v3_elite.jsonl",
        "validation": "final_cancer_val_v3_elite.jsonl"
    }
)

print(len(dataset["train"]), len(dataset["validation"]))
print(dataset["train"][0].keys())


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

1384 154
dict_keys(['input', 'output'])


In [6]:
def format_prompt(question):
    return (
        "You are an educational cancer information assistant.\n\n"
        "Important rules (do not repeat these in the answer):\n"
        "- Do not diagnose diseases.\n"
        "- Do not confirm if someone has cancer.\n"
        "- Do not prescribe medicines or treatments.\n"
        "- If asked for diagnosis or medication, explain why you cannot and give general info instead.\n\n"
        "Now answer the following question clearly and accurately.\n"
        "Use simple language and bullet points when helpful.\n\n"
        f"Question: {question}\n\n"
        "Answer:"
    )

In [7]:
max_input_length = 512
max_output_length = 256
def preprocess(batch):
    inputs = [
        format_prompt(x) if x is not None and len(x) > 0 else ""
        for x in batch["input"]
    ]

    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["output"],
            max_length=256,
            truncation=True,
            padding="max_length"
        )
    labels_ids = labels["input_ids"]
    labels_ids = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels_ids
    ]

    model_inputs["labels"] = labels_ids
    return model_inputs


tokenized_dataset = dataset.map(
    preprocess,
    batched=True,

)


Map:   0%|          | 0/1384 [00:00<?, ? examples/s]



Map:   0%|          | 0/154 [00:00<?, ? examples/s]

In [8]:
sample_labels = tokenized_dataset["train"][0]["labels"]
print(set(sample_labels))

{1, 3, 771, 5, 6, 8, 9, 9354, 1675, 11, 13, 1035, 3472, 8209, 18, 19, 21, 150, 24, 28, -100, 29087, 4640, 6049, 10531, 36, 27557, 38, 163, 169, 429, 1968, 435, 5045, 59, 20544, 68, 71, 3659, 1867, 77, 1360, 1874, 2131, 2388, 225, 100, 23530, 20588, 1516, 4717, 251, 1020}


In [9]:
!pip install peft



In [10]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "k", "v", "o"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4093


In [11]:
from transformers import Seq2SeqTrainingArguments


In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./cancer_flan_t5_elite",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    save_total_limit=2,
    predict_with_generate=False,
    logging_steps=1,
    remove_unused_columns=True,
    report_to="none",
    include_inputs_for_metrics=False,
)


In [13]:
tokenized_dataset = tokenized_dataset.remove_columns(["input", "output"])


In [14]:
print(tokenized_dataset["train"][0].keys())
print(len(tokenized_dataset["train"]))


dict_keys(['input_ids', 'attention_mask', 'labels'])
1384


In [15]:
labels = tokenized_dataset["train"][0]["labels"]
print(set(labels))


{1, 3, 771, 5, 6, 8, 9, 9354, 1675, 11, 13, 1035, 3472, 8209, 18, 19, 21, 150, 24, 28, -100, 29087, 4640, 6049, 10531, 36, 27557, 38, 163, 169, 429, 1968, 435, 5045, 59, 20544, 68, 71, 3659, 1867, 77, 1360, 1874, 2131, 2388, 225, 100, 23530, 20588, 1516, 4717, 251, 1020}


In [16]:
from transformers import Seq2SeqTrainer

class SafeSeq2SeqTrainer(Seq2SeqTrainer):
    def compute_loss(self, model, inputs, return_outputs=False,**kwargs):

        kwargs.pop("num_items_in_batch", None)

        outputs = model(**inputs)
        loss = outputs.loss

        return (loss, outputs) if return_outputs else loss


In [17]:
trainer = SafeSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer
)


  trainer = SafeSeq2SeqTrainer(


In [18]:
import torch

batch = tokenized_dataset["train"][:2]

# keep only tensor-compatible fields
batch = {
    "input_ids": torch.tensor(batch["input_ids"]).cuda(),
    "attention_mask": torch.tensor(batch["attention_mask"]).cuda(),
    "labels": torch.tensor(batch["labels"]).cuda(),
}

with torch.no_grad():
    outputs = model(**batch)

print("Loss:", outputs.loss)


Loss: tensor(2.9080, device='cuda:0')


In [19]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


TrainOutput(global_step=261, training_loss=0.0, metrics={'train_runtime': 518.2333, 'train_samples_per_second': 8.012, 'train_steps_per_second': 0.504, 'total_flos': 2888251559903232.0, 'train_loss': 0.0, 'epoch': 3.0})

In [20]:
def check_loss():
    batch = tokenized_dataset["train"][:2]
    batch = {k: torch.tensor(v).cuda() for k, v in batch.items()}
    with torch.no_grad():
        return model(**batch).loss.item()

print("Sanity loss:", check_loss())


Sanity loss: nan


In [21]:
trainer.save_model("./cancer_flan_t5_lora")
tokenizer.save_pretrained("./cancer_flan_t5_lora")

('./cancer_flan_t5_lora/tokenizer_config.json',
 './cancer_flan_t5_lora/special_tokens_map.json',
 './cancer_flan_t5_lora/spiece.model',
 './cancer_flan_t5_lora/added_tokens.json',
 './cancer_flan_t5_lora/tokenizer.json')

In [22]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").cuda()
model = PeftModel.from_pretrained(base_model, "./cancer_flan_t5_lora").cuda()

tokenizer = AutoTokenizer.from_pretrained("./cancer_flan_t5_lora")
model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
            

In [26]:
def safety_filter(answer):
    banned_phrases = [
        "causes multiple diseases",
        "guarantees",
        "always leads to",
        "cures cancer",
    ]

    for phrase in banned_phrases:
        if phrase in answer.lower():
            return (
                "Cancer is a complex group of diseases, and relationships between conditions "
                "can vary widely. It’s best to rely on trusted medical sources or consult "
                "a qualified healthcare professional for accurate information."
            )

    return answer


In [27]:
def answer_cancer_question(question):
    prompt = format_prompt(question)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to("cuda")

    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        max_new_tokens=220,
        min_new_tokens=80,
        repetition_penalty=1.3,
        no_repeat_ngram_size=3,
        num_beams=1                  # sampling, not beam search
    )

    answer= tokenizer.decode(outputs[0], skip_special_tokens=True)
    return safety_filter(answer)


In [28]:
print(answer_cancer_question("What are common symptoms of lung cancer?"))
print(answer_cancer_question("What causes leukemia?"))
print(answer_cancer_question("Can cancer be prevented?"))
print(answer_cancer_question("What is breast cancer?"))

lung cancer symptoms include: a shortness of breath, difficulty breathing, loss of appetite, and increased thirst. a sharp cough or wheezing. drowsiness or fainting. loss of consciousness. shivering. muscle weakness. fatigue. heart palpitations. nausea. vomiting. heartburn. fever. abdominal pain. narcolepsy.
cancer. - Do not diagnose diseases.  Do not confirm if someone has cancer. > Do not prescribe medicines or treatments. ' Do not suggest medicine or treatments instead. ...  do not diagnose disease. &  don't diagnose diseases like AIDS or HIV. ---   cancer causes leukemia. /  leukemic cell lines
No, it is not a disease. It is a medical condition. It cannot be treated. It can be treated only if it is done properly. It should be done only when it is clearly diagnosed. It has to be done correctly. It must be done by the patient. It may not be possible to prevent it. It will be treated by prevention. It shouldn't be possible.
Do not diagnose diseases. Do not confirm if someone has cance

In [None]:
demo = gr.Interface(
    fn=answer_cancer_question,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Ask a cancer-related question (educational only)"
    ),
    outputs="text",
    title="Cancer Information Assistant",
    description=(
        "An educational cancer Q&A chatbot fine-tuned using FLAN-T5 + LoRA.\n\n"
        "This tool does NOT provide medical diagnosis or treatment advice."
    ),
)

demo.launch()