In [1]:
import pandas as pd
from datasets import Dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, EvalPrediction
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
import torch
from peft import LoraConfig
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from tqdm import tqdm
from peft import get_peft_model, LoraConfig, TaskType


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [2]:
# -------------------------------------------------------------------
# LOAD DATA
# -------------------------------------------------------------------
train_df = pd.read_csv("medical_cases_train/medical_cases_train.csv")
val_df = pd.read_csv("medical_cases_validation/medical_cases_validation.csv")
test_df = pd.read_csv("medical_cases_test/medical_cases_test.csv")

train_set = Dataset.from_pandas(train_df)
val_set = Dataset.from_pandas(val_df)
test_set = Dataset.from_pandas(test_df)

In [3]:
torch.cuda.empty_cache()

In [4]:
def format_prompt(example):
    return {
        "text": f"<start_of_turn>user\nDescription:{example['description']}<end_of_turn> \
        \n<start_of_turn>model\n{example['medical_specialty']}<end_of_turn>"
    }

train_dataset = train_set.map(format_prompt)
val_dataset = val_set.map(format_prompt)
test_dataset = test_set.map(format_prompt)

# -------------------------------------------------------------------
# 3. LOAD MODEL & TOKENIZER (from Hugging Face Transformers)
# -------------------------------------------------------------------
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False  # <-- This helps avoid TypeError in some cases
)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    load_in_4bit=True,
    device_map="auto"
)
# -------------------------------------------------------------------
# 4. APPLY LoRA using PEFT
# -------------------------------------------------------------------
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

# -------------------------------------------------------------------
# 5. TOKENIZATION
# -------------------------------------------------------------------
def tokenize(example):
    tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

train_dataset = train_dataset.map(tokenize, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(tokenize, remove_columns=val_dataset.column_names)
test_dataset = test_dataset.map(tokenize, remove_columns=test_dataset.column_names)

# -------------------------------------------------------------------
# 6. TRAINING ARGUMENTS
# -------------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./deepseek-lora-medical",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    num_train_epochs=6,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none"
)

# -------------------------------------------------------------------
# 7. TRAINER
# -------------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()
# -------------------------------------------------------------------
# 9. SAVE MODEL
# -------------------------------------------------------------------
model.save_pretrained("./deepseek-lora-medical")
tokenizer.save_pretrained("./deepseek-lora-medical")


Map:   0%|          | 0/1724 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Map:   0%|          | 0/1724 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,6.1168
20,1.0872
30,0.6106
40,0.5011
50,0.3799
60,0.3523
70,0.2905
80,0.2688
90,0.266
100,0.2913



Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B.

Internal Error - We're working hard to fix this as soon as possible! - silently ignoring the lookup for the file config.json in deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B.


('./deepseek-lora-medical/tokenizer_config.json',
 './deepseek-lora-medical/special_tokens_map.json',
 './deepseek-lora-medical/tokenizer.json')

In [5]:
# -------------------------------------------------------------------
# SETUP
# -------------------------------------------------------------------
target_classes = sorted(np.unique(test_df["medical_specialty"]))
target_classes_str = "\n".join(target_classes)

model.eval()

y_pt = []
y_gt = []

# Clear logs
open("deepseek.txt", "w").close()
open("deepseek_unknown.txt", "w").close()

print("\n=== Predictions on Test Set ===\n")

# -------------------------------------------------------------------
# MATCHING FUNCTION
# -------------------------------------------------------------------
def match_class(prediction_raw, target_classes):
    pred = prediction_raw.lower().strip()

    # Exact match
    for cls in target_classes:
        if pred == cls.lower():
            return cls

    # Substring match
    for cls in target_classes:
        if cls.lower() in pred:
            return cls

    # Word overlap
    pred_words = set(pred.split())
    for cls in target_classes:
        cls_words = set(cls.lower().split())
        if pred_words & cls_words:
            return cls

    return "Unknown"

# -------------------------------------------------------------------
# INFERENCE LOOP
# -------------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

for i in tqdm(range(len(test_df))):
    true_label = test_df.iloc[i]["medical_specialty"]
    description = test_df.iloc[i]["description"]

    prompt = f"""Classify the following medical case description into one of the following medical specialties.

Respond with only the name of the specialty. One-word answer. No explanations.

Choices:
{target_classes_str}

Description:
{description}

Medical Specialty:"""

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=20,
            do_sample=False
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    prediction_raw = decoded.split("Medical Specialty:")[-1].strip()

    matched_class = match_class(prediction_raw, target_classes)

    if matched_class == "Unknown":
        with open("deepseek_unknown.txt", "a") as f:
            f.write(f"[Unknown] Raw prediction: {prediction_raw}\nDescription: {description}\n\n")

    y_pt.append(matched_class)
    y_gt.append(true_label)

    with open("deepseek.txt", "a") as f:
        f.write(f"Prediction: {matched_class}\n")
        f.write(f"True Label: {true_label}\n\n")

# -------------------------------------------------------------------
# EVALUATION
# -------------------------------------------------------------------
filtered_preds = [p for p in y_pt if p != "Unknown"]
filtered_truth = [t for p, t in zip(y_pt, y_gt) if p != "Unknown"]

print("\n=== Evaluation Metrics (Excluding 'Unknown') ===")
print(f"Total predictions: {len(y_pt)}")
print(f"Unknown predictions: {y_pt.count('Unknown')}")
print("Accuracy:", accuracy_score(filtered_truth, filtered_preds))
print("Precision:", precision_score(filtered_truth, filtered_preds, average='macro', zero_division=0))
print("Recall:", recall_score(filtered_truth, filtered_preds, average='macro', zero_division=0))
print("F1 Score:", f1_score(filtered_truth, filtered_preds, average='macro', zero_division=0))



=== Predictions on Test Set ===



100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 370/370 [02:23<00:00,  2.57it/s]


=== Evaluation Metrics (Excluding 'Unknown') ===
Total predictions: 370
Unknown predictions: 330
Accuracy: 0.35
Precision: 0.25666666666666665
Recall: 0.22743589743589743
F1 Score: 0.2244122383252818



