# Load

In [None]:
!pip install -U bitsandbytes
!pip install evaluate peft
!pip install rouge_score trl
!pip install -U accelerate


In [8]:
from transformers import  TrainingArguments
import torch 
import time 
import evaluate  ## for calculating rouge score
import pandas as pd
import numpy as np

## Load Model in full precesion

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

base_model="Qwen/Qwen2.5-1.5B-Instruct"

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
)

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

## Load Data

In [3]:
from datasets import load_dataset

ds = load_dataset("BoghdadyJR/chatbot_medical")


README.md:   0%|          | 0.00/609 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/27.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/4.66M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/4.58M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['Description', 'Patient', 'Doctor', 'question'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['Description', 'Patient', 'Doctor', 'question'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['Description', 'Patient', 'Doctor', 'question'],
        num_rows: 5000
    })
})

# Tokenize

In [6]:
def tokenize_function(example):
    # Construct the prompt using the `input` column
    
    # Tokenize input and output
    example['input_ids'] = tokenizer(example['Patient'], padding="max_length",  # Add padding
        truncation=True,       
        max_length=512,        
        return_tensors="pt").input_ids
    example['labels'] = tokenizer(example['Doctor'], padding="max_length",  # Add padding
        truncation=True,       
        max_length=512,        
        return_tensors="pt").input_ids
    return example

tokenized_datasets = ds.map(tokenize_function, batched=True,remove_columns=ds["train"].column_names)


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

# Before LORA

In [9]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'


print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 1543714304
             all model parameters: 1543714304 
             percentage of trainable model parameters: 100.0 %


In [13]:
for name, module in model.named_modules():
    print(f"Module Name: {name}")

Module Name: 
Module Name: model
Module Name: model.embed_tokens
Module Name: model.layers
Module Name: model.layers.0
Module Name: model.layers.0.self_attn
Module Name: model.layers.0.self_attn.q_proj
Module Name: model.layers.0.self_attn.k_proj
Module Name: model.layers.0.self_attn.v_proj
Module Name: model.layers.0.self_attn.o_proj
Module Name: model.layers.0.self_attn.rotary_emb
Module Name: model.layers.0.mlp
Module Name: model.layers.0.mlp.gate_proj
Module Name: model.layers.0.mlp.up_proj
Module Name: model.layers.0.mlp.down_proj
Module Name: model.layers.0.mlp.act_fn
Module Name: model.layers.0.input_layernorm
Module Name: model.layers.0.post_attention_layernorm
Module Name: model.layers.1
Module Name: model.layers.1.self_attn
Module Name: model.layers.1.self_attn.q_proj
Module Name: model.layers.1.self_attn.k_proj
Module Name: model.layers.1.self_attn.v_proj
Module Name: model.layers.1.self_attn.o_proj
Module Name: model.layers.1.self_attn.rotary_emb
Module Name: model.layers.1

In [14]:
from peft import LoraConfig, get_peft_model, TaskType

peft_config = LoraConfig(r=8,
                         lora_alpha=32, 
                         target_modules=['q_proj','k_proj'], 
                         lora_dropout = 0.05,
                         bias='none',
                         task_type=TaskType.CAUSAL_LM 
)

# After LORA

In [15]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, peft_config)

print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 1089536
             all model parameters: 1544803840 
             percentage of trainable model parameters: 0.07052908413277896 %


# Train

In [34]:
import time
output_dir = f'./chatbot-dialogue-training-{str(int(time.time()))}'
from trl import SFTTrainer

total_training_steps = 5000 

log_steps = 100
save_steps = 500
eval_steps = 100

# TrainingArguments
args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,    
    gradient_checkpointing=True,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    max_steps=3000,
    weight_decay=0.01,
    logging_steps=500,
    save_strategy='steps',
    evaluation_strategy="steps",
    report_to='none'
)




In [35]:
# First, define the metric functions as you shared
def preprocess_logits_for_metrics(logits, labels):
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [36]:
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    args=args,
    train_dataset=tokenized_datasets['train'],# Train split
    eval_dataset=tokenized_datasets['validation'],
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)



In [37]:
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
500,4.4908,0.702286
1000,0.6957,0.669651
1500,0.6716,0.661402
2000,0.675,0.656818
2500,0.6613,0.654578
3000,0.6542,0.653953


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=3000, training_loss=1.3081114501953126, metrics={'train_runtime': 14793.1525, 'train_samples_per_second': 0.811, 'train_steps_per_second': 0.203, 'total_flos': 4.8344560828416e+16, 'train_loss': 1.3081114501953126, 'epoch': 0.4})

In [46]:
trainer.push_to_hub()

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/BoghdadyJR/chatbot-dialogue-training-1734790378/commit/404ee0690dff75387b81dce21dc2f54634b913e7', commit_message='End of training', commit_description='', oid='404ee0690dff75387b81dce21dc2f54634b913e7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/BoghdadyJR/chatbot-dialogue-training-1734790378', endpoint='https://huggingface.co', repo_type='model', repo_id='BoghdadyJR/chatbot-dialogue-training-1734790378'), pr_revision=None, pr_num=None)

# Evaluate

In [49]:
# Load model directly
from transformers import pipeline
generator = pipeline("text-generation", model="BoghdadyJR/chatbot-dialogue-training-1734790378", device="cuda")


Device set to use cuda


In [51]:
generator.model.push_to_hub("med")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/BoghdadyJR/med/commit/2a3ccf689417653477fe03cf9f8a977727edbe3d', commit_message='Upload Qwen2ForCausalLM', commit_description='', oid='2a3ccf689417653477fe03cf9f8a977727edbe3d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/BoghdadyJR/med', endpoint='https://huggingface.co', repo_type='model', repo_id='BoghdadyJR/med'), pr_revision=None, pr_num=None)

In [53]:
generator.tokenizer.push_to_hub("med")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/BoghdadyJR/med/commit/6c8ce0c4f95850a279e9b08949199ef40f864164', commit_message='Upload tokenizer', commit_description='', oid='6c8ce0c4f95850a279e9b08949199ef40f864164', pr_url=None, repo_url=RepoUrl('https://huggingface.co/BoghdadyJR/med', endpoint='https://huggingface.co', repo_type='model', repo_id='BoghdadyJR/med'), pr_revision=None, pr_num=None)

In [60]:
model=generator.model
tokenizer=generator.tokenizer

In [67]:
test_samples = ds["test"].shuffle().select(range(20))
test_samples

Dataset({
    features: ['Description', 'Patient', 'Doctor', 'question'],
    num_rows: 20
})

In [73]:
from tqdm import tqdm
import torch
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the device
model = model.to(device)

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Smoothing function for BLEU
smooth_fn = SmoothingFunction().method4

# Function to generate predictions
def generate_prediction(input_text, model, tokenizer):
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same device
    outputs = model.generate(**inputs, max_new_tokens=50, num_beams=5)  # Use max_new_tokens for better handling
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Perplexity calculation
def calculate_perplexity(prediction, model, tokenizer):
    inputs = tokenizer(prediction, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same device
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    return torch.exp(loss).item()

# Select 20 samples from the dataset
test_samples = ds["test"].shuffle().select(range(1000))

# Evaluation metrics
rouge_scores = []
bleu_scores = []
perplexity_scores = []

# Iterate through the selected dataset with tqdm for progress tracking
for example in tqdm(test_samples, desc="Evaluating 20 Samples"):
    patient_input = example["Patient"]
    reference = example["Doctor"]
    
    # Generate prediction
    prediction = generate_prediction(patient_input, model, tokenizer)
    
    # Calculate ROUGE, BLEU, and Perplexity
    scores = rouge.score(reference, prediction)
    rouge_scores.append(scores)
    
    reference_tokens = reference.split()
    prediction_tokens = prediction.split()
    bleu_scores.append(sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smooth_fn))
    
    perplexity_scores.append(calculate_perplexity(prediction, model, tokenizer))

# Calculate averages
average_rouge = {metric: np.mean([score[metric].fmeasure for score in rouge_scores]) for metric in ['rouge1', 'rouge2', 'rougeL']}
average_bleu = np.mean(bleu_scores)
average_perplexity = np.mean(perplexity_scores)

# Print results
print("\nEvaluation Results on 20 Samples:")
print(f"Final Average ROUGE Scores: {average_rouge}")
print(f"Final Average BLEU Score: {average_bleu}")
print(f"Final Average Perplexity: {average_perplexity}")


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


Evaluating 20 Samples: 100%|██████████| 1000/1000 [35:14<00:00,  2.11s/it]


Evaluation Results on 20 Samples:
Final Average ROUGE Scores: {'rouge1': 0.2267385387469335, 'rouge2': 0.025187516285403107, 'rougeL': 0.11651081621718712}
Final Average BLEU Score: 0.13466384466782538
Final Average Perplexity: 9.996193381547927





In [74]:
print("\nEvaluation Results on 1000 Samples:")
print(f"Final Average ROUGE Scores: {average_rouge}")
print(f"Final Average BLEU Score: {average_bleu}")
print(f"Final Average Perplexity: {average_perplexity}")



Evaluation Results on 1000 Samples:
Final Average ROUGE Scores: {'rouge1': 0.2267385387469335, 'rouge2': 0.025187516285403107, 'rougeL': 0.11651081621718712}
Final Average BLEU Score: 0.13466384466782538
Final Average Perplexity: 9.996193381547927
