In [1]:
!pip install --upgrade pip
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers datasets accelerate peft trl safetensors


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.23.0-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m6.7 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.23.0


In [2]:
import torch
import gc
import transformers, datasets, accelerate, peft, trl
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType

print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA: {torch.cuda.is_available()}, GPU: {torch.cuda.get_device_name(0)}, Mem: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


PyTorch: 2.8.0+cu126
Transformers: 4.56.1
CUDA: True, GPU: Tesla T4, Mem: 15.8 GB


In [3]:
model_name = "microsoft/DialoGPT-large"
dataset_name = "keivalya/MedQuad-MedicalQnADataset"
new_model = "DialoGPT-large-medical-chat"

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_attn", "c_proj", "c_fc"]
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=True,
    logging_steps=25,
    save_steps=200,
    warmup_steps=100,
    remove_unused_columns=False,
)


In [4]:
dataset = load_dataset(dataset_name, split="train").shuffle(seed=42)
train_dataset = dataset.select(range(2000))
eval_dataset = dataset.select(range(2000, 2200))
print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Train: 2000, Eval: 200


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False
model.train()
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



trainable params: 23,592,960 || all params: 797,623,040 || trainable%: 2.9579


In [6]:
def format_and_tokenize(examples):
    if isinstance(examples['Question'], str):
        texts = [f"{examples['Question']}<|endoftext|>{examples['Answer']}<|endoftext|>"]
    else:
        texts = [f"{q}<|endoftext|>{a}<|endoftext|>" for q, a in zip(examples['Question'], examples['Answer'])]

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors=None
    )
    tokenized["labels"] = [ids.copy() if hasattr(ids, 'copy') else ids[:] for ids in tokenized["input_ids"]]
    return tokenized

train_dataset = train_dataset.map(format_and_tokenize, batched=True, batch_size=100, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(format_and_tokenize, batched=True, batch_size=100, remove_columns=eval_dataset.column_names)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
gc.collect()
torch.cuda.empty_cache()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [8]:
print(f"Training on {len(train_dataset)} samples")
trainer.train()
torch.cuda.empty_cache()


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Training on 2000 samples


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkumataaloo[0m ([33mkumataaloo-indian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
25,7.7919
50,5.9467
75,4.4015
100,3.4162
125,2.9445
150,2.7445
175,2.6778
200,2.6338
225,2.5534
250,2.4156


In [9]:
trainer.save_model(new_model)
tokenizer.save_pretrained(new_model)
print(f"Model saved to: {new_model}")


Model saved to: DialoGPT-large-medical-chat


In [10]:
from peft import PeftModel

test_questions = [
    "What are the symptoms of diabetes?",
    "How is hypertension treated?",
    "What causes heart disease?",
    "What are the side effects of aspirin?",
]

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, new_model)

for i, question in enumerate(test_questions, 1):
    print(f"\n{i}. Question: {question}")
    test_input = f"{question}<|endoftext|>"
    inputs = tokenizer(test_input, return_tensors="pt")
    inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.8,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    parts = response.split("<|endoftext|>")
    answer = parts[1].strip() if len(parts) > 1 else response.replace(test_input, "").strip()
    print(f"   Answer: {answer[:150]}{'...' if len(answer) > 150 else ''}")



1. Question: What are the symptoms of diabetes?
   Answer: What are the symptoms of diabetes?The following : Symptoms of Diabetes:  - Diabetes. The body is trying to tell you something about glucose absorption...

2. Question: How is hypertension treated?
   Answer: How is hypertension treated?There are several different treatment methods. These include:  - using a steroid-drug cocktail (pED) that reduces the risk...

3. Question: What causes heart disease?
   Answer: What causes heart disease?Heart failure may occur if a person has an illness in their blood. You can have any of two types of kidney problems:  - acut...

4. Question: What are the side effects of aspirin?
   Answer: What are the side effects of aspirin?I take it for pain and fatigue. The side affects include insomnia, weight gain, swelling in my thighs after a lon...


In [11]:
%load_ext tensorboard
%tensorboard --logdir results/runs


<IPython.core.display.Javascript object>

In [12]:
from transformers import pipeline, logging
logging.set_verbosity(logging.CRITICAL)

test_prompts = [
    "What is diabetes and how is it treated?",
    "What are the symptoms of hypertension?",
    "How does aspirin work as a medication?",
    "What causes heart disease?",
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n{i}. {prompt}")
    formatted_prompt = f"{prompt}<|endoftext|>"
    result = pipe(formatted_prompt)[0]['generated_text']
    answer = result.split("<|endoftext|>")[1] if "<|endoftext|>" in result else result
    print(f"   Answer: {answer.strip()[:200]}...")



1. What is diabetes and how is it treated?
   Answer: Diabetes is a disease that affects the immune system and occurs when the body tries to fight its way through the body's digestive system. It is treatable by taking insulin. The blood test and the bloo...

2. What are the symptoms of hypertension?
   Answer: What are the signs and symptoms of hypertension? The Human Phenotype Ontology provides the following list of signs and symptoms for hypertension. If the information is available, the table below inclu...

3. How does aspirin work as a medication?
   Answer: aspirin is a type of aspirin, it is an antioxidant. aspirin inhibits the production of cytokines by making the blood vessels more dilated. The medication acts as a barrier between the blood vessels an...

4. What causes heart disease?
   Answer: What causes heart disease? The exact cause of heart disease varies widely in the family. However, it is believed to lead to a variety of conditions including diabetes, obesity, heart 

In [13]:
del model, pipe, trainer
import gc
gc.collect()
torch.cuda.empty_cache()


In [14]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

peft_model = PeftModel.from_pretrained(base_model, new_model)
merged_model = peft_model.merge_and_unload()

print(" Merged LoRA weights into base model.")


 Merged LoRA weights into base model.


In [15]:
from transformers import AutoTokenizer
import locale

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

locale.getpreferredencoding = lambda: "UTF-8"

print(" Tokenizer ready for upload.")


 Tokenizer ready for upload.


In [16]:
from transformers import AutoTokenizer
import locale

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

locale.getpreferredencoding = lambda: "UTF-8"

print(" Tokenizer ready for upload.")


 Tokenizer ready for upload.


In [20]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `Ad

In [23]:
repo_name = "Adarsh123rv12/DialoGPT-large-medical-chat"  # CHANGE THIS!

merged_model.push_to_hub(repo_name, use_auth_token=True)
tokenizer.push_to_hub(repo_name, use_auth_token=True)

print(f" Uploaded to: https://huggingface.co/{repo_name}")


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...al-chat/model.safetensors:   1%|1         | 16.7MB / 1.55GB            

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


 Uploaded to: https://huggingface.co/Adarsh123rv12/DialoGPT-large-medical-chat


In [24]:
from transformers import pipeline

test_pipe = pipeline(
    "text-generation",
    model=repo_name,
    tokenizer=repo_name,
    max_length=200
)

test_prompt = "What are the symptoms of diabetes?<|endoftext|>"
result = test_pipe(test_prompt)[0]['generated_text']

print(f" Sample output: {result[:300]}...")


config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/77.0 [00:00<?, ?B/s]

 Sample output: What are the symptoms of diabetes?<|endoftext|>What are the signs and symptoms of diabetes? The Human Phenotype Ontology provides the following list of signs and symptoms for diabetic syndrome. If the information is available, the table below includes how often the symptom is seen in people with thi...


In [26]:

!pip install evaluate rouge-score --quiet

import torch, math, time, os
from tqdm import tqdm
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM

rouge = evaluate.load("rouge")

tokenizer = AutoTokenizer.from_pretrained(new_model)
model = AutoModelForCausalLM.from_pretrained(
    new_model,
    torch_dtype=torch.float16,
    device_map="auto"
)
model.eval()

sample_eval = eval_dataset.select(range(100))

references, predictions, latencies = [], [], []

total_loss = 0

for sample in tqdm(sample_eval, desc="Evaluating"):
    input_ids = sample["input_ids"].unsqueeze(0).cuda()
    labels = sample["labels"].unsqueeze(0).cuda()

    start = time.time()
    with torch.no_grad():
        loss = model(input_ids=input_ids, labels=labels).loss
        output = model.generate(input_ids, max_new_tokens=50)
    latencies.append(time.time() - start)

    pred_text = tokenizer.decode(output[0], skip_special_tokens=True)
    ref_text = tokenizer.decode(labels[0], skip_special_tokens=True)

    predictions.append(pred_text)
    references.append(ref_text)
    total_loss += loss.item()

avg_loss = total_loss / len(predictions)
perplexity = math.exp(avg_loss)
rouge_score = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
avg_latency = sum(latencies) / len(latencies)

def get_model_size(path):
    total_bytes = sum(
        os.path.getsize(os.path.join(dp, f))
        for dp, _, fn in os.walk(path) for f in fn
    )
    return total_bytes / 1e6

model_size_mb = get_model_size(new_model)


print("\n Evaluation Results")
print("=" * 30)
print(f" Perplexity:      {perplexity:.2f}")
print(f" ROUGE-L Score:   {rouge_score['rougeL']:.4f}")
print(f" Avg Latency:     {avg_latency:.3f} sec")
print(f" Model Size:      {model_size_mb:.2f} MB")


Downloading builder script: 0.00B [00:00, ?B/s]

Evaluating: 100%|██████████| 100/100 [04:31<00:00,  2.71s/it]



 Evaluation Results
 Perplexity:      56.38
 ROUGE-L Score:   0.9893
 Avg Latency:     2.706 sec
 Model Size:      1647.34 MB
