In [1]:
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
TRAIN_DATASET_PATH = "new_dataset\medicare_110k_train.json"  # Update this path
TEST_DATASET_PATH = "new_dataset/medicare_110k_test.json"  # Update this path
OUTPUT_DIR = "./qwen2.5-medical-finetuned"

In [3]:
# CUDA verification
print("üîç Checking CUDA availability...")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name()}")
else:
    print("‚ùå CUDA not available! Training will be very slow on CPU.")

üîç Checking CUDA availability...
CUDA available: True
CUDA device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3050 Laptop GPU


In [4]:
# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
)

Loading model and tokenizer...


`torch_dtype` is deprecated! Use `dtype` instead!


In [5]:
# Verify model is on GPU
print(f"Model device: {model.device}")
if next(model.parameters()).is_cuda:
    print("‚úÖ Model is running on GPU!")
else:
    print("‚ö†Ô∏è  Model is on CPU - training will be slow!")

Model device: cuda:0
‚úÖ Model is running on GPU!


In [6]:
# Load dataset function
def load_conversations(file_path, max_samples=None):
    """Load conversations from JSON file"""
    formatted_conversations = []

    try:
        # Try reading as JSON array first
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        print(f"Loaded {len(data)} samples from {os.path.basename(file_path)}")

        samples_to_process = data[:max_samples] if max_samples else data

        for i, conversation in enumerate(samples_to_process):
            if "Conversation" in conversation:
                conv_text = conversation["Conversation"]

                # Extract human and AI parts
                if "[|Human|]" in conv_text and "[|AI|]" in conv_text:
                    parts = conv_text.split("[|Human|]")[1]
                    human_part, ai_part = parts.split("[|AI|]")

                    human_part = human_part.strip()
                    ai_part = ai_part.strip()

                    # Format for Qwen2.5 chat
                    formatted_text = f"<|im_start|>user\n{human_part}<|im_end|>\n<|im_start|>assistant\n{ai_part}<|im_end|>"
                    formatted_conversations.append({"text": formatted_text})

            if i % 10000 == 0 and i > 0:
                print(f"Processed {i} samples...")

    except json.JSONDecodeError:
        # If JSON array fails, try reading as JSONL (one JSON per line)
        print(f"Trying JSONL format for {os.path.basename(file_path)}...")
        formatted_conversations = []
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if max_samples and i >= max_samples:
                    break

                line = line.strip()
                if line:
                    try:
                        conversation = json.loads(line)
                        if "Conversation" in conversation:
                            conv_text = conversation["Conversation"]

                            if "[|Human|]" in conv_text and "[|AI|]" in conv_text:
                                parts = conv_text.split("[|Human|]")[1]
                                human_part, ai_part = parts.split("[|AI|]")

                                human_part = human_part.strip()
                                ai_part = ai_part.strip()

                                formatted_text = f"<|im_start|>user\n{human_part}<|im_end|>\n<|im_start|>assistant\n{ai_part}<|im_end|>"
                                formatted_conversations.append({"text": formatted_text})

                        if i % 10000 == 0 and i > 0:
                            print(f"Processed {i} samples...")

                    except json.JSONDecodeError as e:
                        print(f"Skipping line {i+1}: {e}")
                        continue

    print(
        f"Successfully formatted {len(formatted_conversations)} conversations from {os.path.basename(file_path)}"
    )
    return formatted_conversations


# Load train and test datasets
print("Loading training dataset...")
train_conversations = load_conversations(
    TRAIN_DATASET_PATH, max_samples=None
)  # Use all training samples

print("Loading test dataset...")
test_conversations = load_conversations(
    TEST_DATASET_PATH, max_samples=5000
)  # Limit test samples if needed

if not train_conversations:
    print("No valid training conversations found!")
    exit()

Loading training dataset...
Trying JSONL format for medicare_110k_train.json...
Processed 10000 samples...
Processed 20000 samples...
Processed 30000 samples...
Processed 40000 samples...
Processed 50000 samples...
Processed 60000 samples...
Processed 70000 samples...
Processed 80000 samples...
Processed 90000 samples...
Processed 100000 samples...
Successfully formatted 106556 conversations from medicare_110k_train.json
Loading test dataset...
Trying JSONL format for medicare_110k_test.json...
Successfully formatted 5000 conversations from medicare_110k_test.json


In [7]:
# Create datasets
train_dataset = Dataset.from_list(train_conversations)
eval_dataset = Dataset.from_list(test_conversations) if test_conversations else None

print(f"üìö Training samples: {len(train_dataset)}")
print(f"üß™ Test samples: {len(eval_dataset) if eval_dataset else 0}")

üìö Training samples: 106556
üß™ Test samples: 5000


In [8]:
# Tokenize function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized


print("Tokenizing datasets...")
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    batch_size=1000,
)

if eval_dataset:
    tokenized_eval = eval_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=eval_dataset.column_names,
        batch_size=1000,
    )
else:
    tokenized_eval = None

print("Tokenization completed!")

Tokenizing datasets...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 106556/106556 [00:44<00:00, 2421.39 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [00:01<00:00, 2575.34 examples/s]

Tokenization completed!





In [9]:
# Setup LoRA for efficient fine-tuning
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare model
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 1,081,344 || all params: 495,114,112 || trainable%: 0.2184


In [10]:
# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=500,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=100,
    eval_steps=1000,
    save_steps=2000,
    eval_strategy="steps" if eval_dataset else "no",
    save_strategy="steps",
    load_best_model_at_end=True if eval_dataset else False,
    report_to="none",
    dataloader_pin_memory=False,
    no_cuda= False
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
# Start training
print("Starting training...")
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


Starting training...


Step,Training Loss,Validation Loss


In [None]:
# Save model
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

In [None]:
# Test function
def generate_medical_findings(conversation_text):
    """Generate medical findings from conversation"""
    # Extract just the human part for testing
    if "[|Human|]" in conversation_text and "[|AI|]" in conversation_text:
        parts = conversation_text.split("[|Human|]")[1]
        human_part = parts.split("[|AI|]")[0].strip()
    else:
        human_part = conversation_text

    prompt = f"<|im_start|>user\nExtract essential medical findings and generate a clinical summary from this patient conversation: {human_part}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Extract assistant response
    if "<|im_start|>assistant" in response:
        assistant_response = response.split("<|im_start|>assistant")[-1]
        if "<|im_end|>" in assistant_response:
            assistant_response = assistant_response.split("<|im_end|>")[0]
        return assistant_response.strip()
    else:
        return response[len(prompt) :].strip()

In [None]:
# Test with sample from test dataset
print("\n" + "=" * 50)
print("TESTING FINE-TUNED MODEL")
print("=" * 50)

if test_conversations:
    test_sample = test_conversations[0]["text"]
    findings = generate_medical_findings(test_sample)
    print(f"Generated Findings:\n{findings}")
else:
    print("No test dataset available for testing")