In [1]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [2]:
#!pip install --upgrade transformers
#!pip install peft

In [3]:
#!pip install --upgrade datasets

## Load DataSet

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    use_fast=True  # Optional, but usually better performance
)
# Fix: set pad_token
tokenizer.pad_token = tokenizer.eos_token

In [6]:
dataset = load_dataset('json', data_files='TrainingData.json')

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 52
    })
})

In [8]:
# Split into train (80%), validation (10%), and test (10%)
train_dataset = dataset['train'].train_test_split(test_size=0.2)  # 80% train, 20% test
val_test_dataset = train_dataset['test'].train_test_split(test_size=0.5)  # Split the 20% into 50% validation and 50% test

# Now you have train, validation, and test datasets
train_dataset = train_dataset['train']
validation_dataset = val_test_dataset['train']
test_dataset = val_test_dataset['test']

In [9]:
# ----- ADD THIS LABEL MAPPING DICTIONARY -----
label2id = {
    "O": 0,
    "B-Disease": 1,
    "I-Disease": 2,
    "B-Medication": 3,
    "I-Medication": 4,
    "B-Person": 5,
    "I-Person": 6,
    "B-Location": 7,
    "I-Location": 8,
}

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        [ [label['word'] for label in labels] for labels in examples['labels'] ], 
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=16
    )

    all_labels = []

    for i, labels in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            else:
                entity_label = labels[word_id]['entity']
                label_ids.append(label2id.get(entity_label, 0))
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs


In [11]:
# Apply the tokenization and label alignment to the entire dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
# You can check if everything is correct by inspecting a sample
print(tokenized_datasets['train'][0])  # Print the first example of the train dataset

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

{'text': 'The patient was diagnosed with type 2 diabetes and prescribed metformin.', 'labels': [-100, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 3, 3, 3], 'input_ids': [1, 415, 7749, 403, 26629, 395, 1212, 28705, 28750, 22794, 304, 20791, 13284, 1424, 674, 262], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 52
    })
})

In [13]:
validation_dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 5
})

In [14]:
# Apply tokenization and label alignment to dataset splits
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
validation_dataset = validation_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# Optional: inspect one sample to verify
print(train_dataset[0])

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

{'text': 'Michael Brown experienced severe headaches due to hypertension.', 'labels': [-100, -100, -100, -100, 5, 6, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0], 'input_ids': [2, 2, 2, 1, 5459, 8364, 8304, 13645, 1335, 5131, 2940, 298, 6521, 2482, 2585, 842], 'attention_mask': [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [15]:
print(tokenized_datasets['train'][0]['input_ids'])
print(len(tokenized_datasets['train'][0]['input_ids']))  # Should be 16

print(tokenized_datasets['train'][0]['labels'])
print(len(tokenized_datasets['train'][0]['labels']))  # Should also be 16


[1, 415, 7749, 403, 26629, 395, 1212, 28705, 28750, 22794, 304, 20791, 13284, 1424, 674, 262]
16
[-100, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 3, 3, 3]
16


### Model Loading, Qunatization

In [16]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


CUDA available: True
GPU name: NVIDIA GeForce RTX 3050 6GB Laptop GPU


In [17]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification, BitsAndBytesConfig
import torch

In [18]:
# Configure quantization for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16  # fallback to float16 for broader support
)

In [19]:
# Load model with quantization (this makes it QLoRA)
model = AutoModelForTokenClassification.from_pretrained(
    'mistralai/Mistral-7B-Instruct-v0.1',
    quantization_config=bnb_config,  # This enables quantization
    device_map="auto",
    trust_remote_code=True,
    num_labels=len(label2id )  # Set your number of NER labels
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForTokenClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Checking the whether model is Quantized successully or not

In [20]:
# Check dtype of first parameter
for name, param in model.named_parameters():
    print(f"Parameter: {name}")
    print(f"Data type: {param.dtype}")
    print(f"Shape: {param.shape}")
    print(f"First few values: {param.data.flatten()[:5]}")
    break  # Just show first parameter

Parameter: model.embed_tokens.weight
Data type: torch.float16
Shape: torch.Size([32000, 4096])
First few values: tensor([-0., 0., -0., 0., 0.], device='cuda:0', dtype=torch.float16)


In [21]:
import torch
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**3} GB")
print(f"Memory cached: {torch.cuda.memory_reserved() / 1024**3} GB")

Memory allocated: 3.6002368927001953 GB
Memory cached: 3.83203125 GB


In [22]:
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):  # Check if any linear layers are quantized
        print(f"Layer {name} is of type {module.__class__.__name__}")
        print(f"Data type: {module.weight.dtype}")
        break

Layer model.layers.0.self_attn.q_proj is of type Linear4bit
Data type: torch.uint8


Based on the Linear4bit type and the torch.uint8 data type, it is clear that the model has been successfully quantized, specifically with 4-bit precision for some of the layers (such as the query projection in the attention mechanism, i.e., q_proj). The memory usage is also consistent with this, as quantization reduces memory requirements.

Thus, your model is quantized and working as expected, with specific layers like attention heads (query projection) using 4-bit precision.

### 🔧 Importance of `prepare_model_for_kbit_training(model)`

This line is **essential when using quantized models (4-bit / 8-bit)** with LoRA or QLoRA fine-tuning.

#### ✅ What it does:

- Ensures **gradient flow** from inputs to LoRA layers (needed for learning).
- Converts sensitive layers (like LayerNorm) to **float32** for stability.
- Prepares the model for **adapter-based training** (e.g., LoRA).
- Makes the quantized model compatible with **PEFT (Parameter-Efficient Fine-Tuning)**.

#### 🤔 Why it's important:

- Without this line, **LoRA layers may not get updated** (no learning).
- Training may become **unstable or ineffective**.
- Essential step to make quantized models **trainable**.

#### 📌 When to use it:

- After loading a model in **4-bit / 8-bit precision** using `BitsAndBytesConfig`.
- Before applying **LoRA adapters** using `get_peft_model()`.

#### 💡 Example usage:

```python
from peft import prepare_model_for_kbit_training

# Step: Prepare quantized model for training
model = prepare_model_for_kbit_training(model)


In [23]:
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

In [24]:
model

MistralForTokenClassification(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (

In [25]:
# Configure LoRA (same as before, but now applied to quantized model)
lora_config = LoraConfig(
    r=8,  # rank
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Mistral uses these names
    lora_dropout=0.1,
    bias="none",
    task_type="TOKEN_CLS"
)

In [26]:
peft_model = get_peft_model(model, lora_config)

In [27]:
peft_model.print_trainable_parameters()

trainable params: 6,852,617 || all params: 7,117,549,586 || trainable%: 0.0963


In [28]:
# Training Arguments remain the same
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,  # Often needs to be slightly higher for QLoRA
    per_device_train_batch_size=10,  # May need to reduce due to quantization overhead
    per_device_eval_batch_size=10,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=5,  # log every step
    gradient_checkpointing=True,  # Helps save memory
    fp16=True,  # Mixed precision training
    save_strategy="epoch",
)

In [44]:
# Trainer setup
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=validation_dataset,
)

In [45]:
# Start training
trainer.train()

Step,Training Loss
5,0.9443
10,0.3074
15,0.3107


TrainOutput(global_step=18, training_loss=0.46627191702524823, metrics={'train_runtime': 25.7452, 'train_samples_per_second': 6.059, 'train_steps_per_second': 0.699, 'total_flos': 104629488327936.0, 'train_loss': 0.46627191702524823, 'epoch': 3.0})

In [46]:
metrics = trainer.evaluate(eval_dataset=validation_dataset)
print(metrics)

{'eval_loss': 0.004685596562922001, 'eval_runtime': 0.5449, 'eval_samples_per_second': 9.176, 'eval_steps_per_second': 1.835, 'epoch': 3.0}


🔍 Fine-tuning Summary (Epoch 3)
────────────────────────────────────────────────────────

📉 Training Loss:
   → 0.4662
   This is low and suggests the model has learned the training data well.

🧪 Evaluation Loss:
   → 0.0046
   A very low eval loss, indicating strong generalization to unseen data.

⚙️ Training Metrics:
   - Epochs completed: 3
   - Global steps: 18
   - Training runtime: 25.74s
   - Samples/sec: 6.059
   - Steps/sec: 0.69

📊 Evaluation Metrics:
   - Runtime: 0.5449s
   - Eval samples/sec: 9.176
   - Eval steps/sec: 1.835

✅ Key Takeaways:
────────────────────────────────────────────────────────
- ✔️ Training and evaluation losses are both low — this is a strong sign that:
    - The model is **learning the task well**.
    - There’s **no sign of overfitting** (low train and eval loss).
    - Your **data preprocessing and token-label alignment** are likely working correctly.

### Testing

In [47]:
# Define your labels in order
labels = [
    "O",
    "B-Disease",
    "I-Disease",
    "B-Medication",
    "I-Medication",
    "B-Person",
    "I-Person",
    "B-Location",
    "I-Location"
]

# Create label ↔ ID mappings
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}


In [48]:
id2label

{0: 'O',
 1: 'B-Disease',
 2: 'I-Disease',
 3: 'B-Medication',
 4: 'I-Medication',
 5: 'B-Person',
 6: 'I-Person',
 7: 'B-Location',
 8: 'I-Location'}

In [49]:
from transformers import pipeline

ner_pipeline = pipeline(
    "ner",
    model=trainer.model,       # No need to reload
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

Device set to use cuda:0


In [50]:
def GenerateNER(text,pipeline,id2label):
    output = ner_pipeline(text)
    for entity in output:
        label_id = int(entity['entity_group'].split("_")[-1])
        print(f"{entity['word']} -> {id2label[label_id]} ({entity['score']:.2f})")

In [51]:
text = "Dr. John gave amoxicillin for severe pneumonia."
GenerateNER(text,pipeline,id2label)

Dr. -> B-Person (1.00)
John -> I-Person (1.00)
gave am -> O (0.95)
oxicillin -> B-Medication (1.00)
for severe -> O (0.82)
pneum -> B-Disease (1.00)
onia -> I-Disease (0.99)
. -> O (1.00)


## Evaluation

In [52]:
test_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 5
})

In [53]:
from sklearn.metrics import precision_recall_fscore_support

# Get predictions on the test set
predictions = trainer.predict(test_dataset)

# Get the predicted and true labels
y_pred = predictions.predictions.argmax(axis=-1)
y_true = predictions.label_ids

# Flatten predictions and labels, skipping padding tokens (-100)
true_labels = []
pred_labels = []

for pred, true in zip(y_pred, y_true):
    for p, t in zip(pred, true):
        if t != -100:
            true_labels.append(t)
            pred_labels.append(p)

# Compute precision, recall, and F1
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted')

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


Precision: 0.8107, Recall: 0.8358, F1: 0.8156


  _warn_prf(average, modifier, msg_start, len(result))


## Random New Test Dataset

In [54]:
sample_data = [
  {
    "text": "Michael Johnson was treated with paracetamol at Mercy Hospital.",
    "labels": [
      {"word": "Michael", "entity": "B-Person"},
      {"word": "Johnson", "entity": "I-Person"},
      {"word": "was", "entity": "O"},
      {"word": "treated", "entity": "O"},
      {"word": "with", "entity": "O"},
      {"word": "paracetamol", "entity": "B-Medication"},
      {"word": "at", "entity": "O"},
      {"word": "Mercy", "entity": "B-Location"},
      {"word": "Hospital", "entity": "I-Location"},
      {"word": ".", "entity": "O"}
    ]
  },
  {
    "text": "She was diagnosed with COVID 19 and given remdesivir.",
    "labels": [
      {"word": "She", "entity": "O"},
      {"word": "was", "entity": "O"},
      {"word": "diagnosed", "entity": "O"},
      {"word": "with", "entity": "O"},
      {"word": "COVID", "entity": "B-Disease"},
      {"word": "19", "entity": "I-Disease"},
      {"word": "and", "entity": "O"},
      {"word": "given", "entity": "O"},
      {"word": "remdesivir", "entity": "B-Medication"},
      {"word": ".", "entity": "O"}
    ]
  },
  {
    "text": "The patient complained of chest pain and was given nitroglycerin.",
    "labels": [
      {"word": "The", "entity": "O"},
      {"word": "patient", "entity": "O"},
      {"word": "complained", "entity": "O"},
      {"word": "of", "entity": "O"},
      {"word": "chest", "entity": "B-Disease"},
      {"word": "pain", "entity": "I-Disease"},
      {"word": "and", "entity": "O"},
      {"word": "was", "entity": "O"},
      {"word": "given", "entity": "O"},
      {"word": "nitroglycerin", "entity": "B-Medication"},
      {"word": ".", "entity": "O"}
    ]
  },
  {
    "text": "Dr. Laura White reviewed the MRI scan for spinal injury.",
    "labels": [
      {"word": "Dr.", "entity": "B-Person"},
      {"word": "Laura", "entity": "I-Person"},
      {"word": "White", "entity": "I-Person"},
      {"word": "reviewed", "entity": "O"},
      {"word": "the", "entity": "O"},
      {"word": "MRI", "entity": "O"},
      {"word": "scan", "entity": "O"},
      {"word": "for", "entity": "O"},
      {"word": "spinal", "entity": "B-Disease"},
      {"word": "injury", "entity": "I-Disease"},
      {"word": ".", "entity": "O"}
    ]
  },
  {
    "text": "They visited Stanford Medical Center for surgery.",
    "labels": [
      {"word": "They", "entity": "O"},
      {"word": "visited", "entity": "O"},
      {"word": "Stanford", "entity": "B-Location"},
      {"word": "Medical", "entity": "I-Location"},
      {"word": "Center", "entity": "I-Location"},
      {"word": "for", "entity": "O"},
      {"word": "surgery", "entity": "O"},
      {"word": ".", "entity": "O"}
    ]
  }
]


In [55]:
from datasets import Dataset

# Create Dataset from list
dataset = Dataset.from_list(sample_data)

# Check
print(dataset)


Dataset({
    features: ['text', 'labels'],
    num_rows: 5
})


In [56]:
test_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [57]:
test_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 5
})

In [58]:
from sklearn.metrics import precision_recall_fscore_support

# Get predictions on the test set
predictions = trainer.predict(test_dataset)

# Get the predicted and true labels
y_pred = predictions.predictions.argmax(axis=-1)
y_true = predictions.label_ids

# Flatten predictions and labels, skipping padding tokens (-100)
true_labels = []
pred_labels = []

for pred, true in zip(y_pred, y_true):
    for p, t in zip(pred, true):
        if t != -100:
            true_labels.append(t)
            pred_labels.append(p)

# Compute precision, recall, and F1
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted')

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


Precision: 0.8107, Recall: 0.8358, F1: 0.8156


  _warn_prf(average, modifier, msg_start, len(result))


## Pushing model to huggingface

In [59]:
!pip install huggingface_hub



In [62]:
from peft import PeftModel
from huggingface_hub import notebook_login

notebook_login()  # Or CLI login

peft_model.push_to_hub("Aghori/mistral-medical-ner-qlora")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

adapter_model.safetensors:   0%|          | 0.00/27.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Aghori/mistral-medical-ner-qlora/commit/42dcca8720e53a3a01d2e801bbbd9d8f7483f6a8', commit_message='Upload model', commit_description='', oid='42dcca8720e53a3a01d2e801bbbd9d8f7483f6a8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Aghori/mistral-medical-ner-qlora', endpoint='https://huggingface.co', repo_type='model', repo_id='Aghori/mistral-medical-ner-qlora'), pr_revision=None, pr_num=None)

In [63]:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("Aghori/mistral-medical-ner-qlora", torch_dtype="auto")

adapter_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/27.4M [00:00<?, ?B/s]

Loading adapter weights from Aghori/mistral-medical-ner-qlora led to unexpected keys not found in the model: model.layers.0.self_attn.k_proj.lora_A.default.weight, model.layers.0.self_attn.k_proj.lora_B.default.weight, model.layers.0.self_attn.o_proj.lora_A.default.weight, model.layers.0.self_attn.o_proj.lora_B.default.weight, model.layers.0.self_attn.q_proj.lora_A.default.weight, model.layers.0.self_attn.q_proj.lora_B.default.weight, model.layers.0.self_attn.v_proj.lora_A.default.weight, model.layers.0.self_attn.v_proj.lora_B.default.weight, model.layers.1.self_attn.k_proj.lora_A.default.weight, model.layers.1.self_attn.k_proj.lora_B.default.weight, model.layers.1.self_attn.o_proj.lora_A.default.weight, model.layers.1.self_attn.o_proj.lora_B.default.weight, model.layers.1.self_attn.q_proj.lora_A.default.weight, model.layers.1.self_attn.q_proj.lora_B.default.weight, model.layers.1.self_attn.v_proj.lora_A.default.weight, model.layers.1.self_attn.v_proj.lora_B.default.weight, model.layer

AttributeError: 'MistralModel' object has no attribute 'predict'