<a href="https://colab.research.google.com/github/Agniva2006/llm_fine_tuning-google-flan-t5-base-220M-params-/blob/main/fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!pip install transformers datasets peft accelerate bitsandbytes sentencepiece gradio



In [34]:
from datasets import load_dataset

train_dataset = load_dataset("json", data_files="cancer_train.jsonl")["train"]
val_dataset   = load_dataset("json", data_files="cancer_val.jsonl")["train"]

print(f"Train examples: {len(train_dataset)}, Validation examples: {len(val_dataset)}")

Train examples: 1234, Validation examples: 138


In [35]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [36]:
max_input_length = 128
max_output_length = 512

def preprocess(example):
    input_enc = tokenizer(example["instruction"], max_length=max_input_length, truncation=True, padding="max_length")
    output_enc = tokenizer(example["output"], max_length=max_output_length, truncation=True, padding="max_length")
    return {"input_ids": input_enc["input_ids"], "attention_mask": input_enc["attention_mask"], "labels": output_enc["input_ids"]}

train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/1234 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

In [37]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.025,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)

In [38]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./cancer_lora_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    num_train_epochs=3,
    logging_steps=50,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0


TrainOutput(global_step=465, training_loss=0.0, metrics={'train_runtime': 314.0037, 'train_samples_per_second': 11.79, 'train_steps_per_second': 1.481, 'total_flos': 638773779234816.0, 'train_loss': 0.0, 'epoch': 3.0})

In [39]:
model.save_pretrained("./cancer_lora_model")
tokenizer.save_pretrained("./cancer_lora_model")


('./cancer_lora_model/tokenizer_config.json',
 './cancer_lora_model/special_tokens_map.json',
 './cancer_lora_model/spiece.model',
 './cancer_lora_model/added_tokens.json',
 './cancer_lora_model/tokenizer.json')

In [40]:
import gradio as gr

def answer_question(question):
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

demo = gr.Interface(fn=answer_question, inputs="text", outputs="text", title="Cancer Q&A Chatbot")
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://602fa54d0436a4b632.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [41]:
!head cancer_train.jsonl


{"instruction": "Delays in time to primary treatment after a diagnosis of breast cancer: does it impact survival?", "input": "", "output": "Time to primary treatment after a diagnosis of breast cancer had no impact on overall survival. Further studies on care before diagnosis are important in drawing up meaningful quality indicators."}
{"instruction": "Regardless of treatment outcome, many CRC patients experience ongoing symptoms of anxiety, depression and distress .", "input": "", "output": "Regardless of treatment outcome, many CRC patients experience ongoing symptoms of anxiety, depression, and distress."}
{"instruction": "What does the following text explain about Pancreatic cancer?", "input": "", "output": "The discovery in the late 1930s that vitamin K prevented bleeding with jaundice, and the development of blood transfusion as an everyday process, both improved post-operative survival, but about 25% of people never left hospital alive as late as the 1970s. In the 1970s a group 