<a href="https://colab.research.google.com/github/Dhwani123p/MediScripts/blob/main/ML-model/ner/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install torch torchcrf scikit-learn



In [34]:
def load_conll(path):
    sentences, labels = [], []
    words, tags = [], []

    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
            else:
                w, t = line.split()
                words.append(w)
                tags.append(t)
    return sentences, labels


train_sents, train_tags = load_conll("/content/synthetic_train (1).conll")
valid_sents, valid_tags = load_conll("/content/valid.conll.txt")

print(len(train_sents), len(valid_sents))


115 11


In [35]:
from collections import Counter

def build_vocab(sentences):
    vocab = {"<PAD>":0, "<UNK>":1}
    for s in sentences:
        for w in s:
            if w not in vocab:
                vocab[w] = len(vocab)
    return vocab

word_vocab = build_vocab(train_sents + valid_sents)

label_list = sorted({t for seq in train_tags + valid_tags for t in seq})
label_to_id = {l:i for i,l in enumerate(label_list)}
id_to_label = {i:l for l,i in label_to_id.items()}


In [36]:
import torch

def encode(sentences, tags, vocab, label_map):
    X, Y, M = [], [], []
    max_len = max(len(s) for s in sentences)

    for s, t in zip(sentences, tags):
        x = [vocab.get(w, vocab["<UNK>"]) for w in s]
        y = [label_map[tag] for tag in t]
        mask = [1]*len(x)

        while len(x) < max_len:
            x.append(vocab["<PAD>"])
            y.append(0)
            mask.append(0)

        X.append(x)
        Y.append(y)
        M.append(mask)

    return (
        torch.tensor(X),
        torch.tensor(Y),
        torch.tensor(M, dtype=torch.bool)
    )


X_train, Y_train, M_train = encode(train_sents, train_tags, word_vocab, label_to_id)
X_valid, Y_valid, M_valid = encode(valid_sents, valid_tags, word_vocab, label_to_id)


In [37]:
!pip install pytorch-crf


import torch.nn as nn
from torchcrf import CRF

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim=100, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(
            emb_dim,
            hidden_dim // 2,
            batch_first=True,
            bidirectional=True
        )
        self.fc = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        emb = self.embedding(x)
        out, _ = self.lstm(emb)
        emissions = self.fc(out)

        if tags is not None:
            return -self.crf(emissions, tags, mask=mask)
        else:
            return self.crf.decode(emissions, mask=mask)



In [39]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = BiLSTM_CRF(
    vocab_size=len(word_vocab),
    tagset_size=len(label_to_id)
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0

    optimizer.zero_grad()
    loss = model(
        X_train.to(device),
        Y_train.to(device),
        mask=M_train.to(device)
    )
    loss.backward()
    optimizer.step()
    train_loss = loss.item()

    # Validation
    model.eval()
    with torch.no_grad():
        val_loss = model(
            X_valid.to(device),
            Y_valid.to(device),
            mask=M_valid.to(device)
        ).item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


Epoch 1/10 | Train Loss: 2371.1060 | Val Loss: 197.8741
Epoch 2/10 | Train Loss: 2303.7832 | Val Loss: 193.7629
Epoch 3/10 | Train Loss: 2237.2571 | Val Loss: 189.6720
Epoch 4/10 | Train Loss: 2170.7812 | Val Loss: 185.5614
Epoch 5/10 | Train Loss: 2103.6477 | Val Loss: 181.3959
Epoch 6/10 | Train Loss: 2035.2355 | Val Loss: 177.1446
Epoch 7/10 | Train Loss: 1965.0325 | Val Loss: 172.7823
Epoch 8/10 | Train Loss: 1892.6619 | Val Loss: 168.2902
Epoch 9/10 | Train Loss: 1817.9137 | Val Loss: 163.6576
Epoch 10/10 | Train Loss: 1740.7722 | Val Loss: 158.8833


In [40]:
torch.save({
    "model_state": model.state_dict(),
    "vocab": word_vocab,
    "label_to_id": label_to_id
}, "mediscript_ner.pt")


In [46]:
test_sents, test_tags = load_conll("/content/test.conll.txt")


In [47]:
X_test, Y_test, M_test = encode(
    test_sents,
    test_tags,
    word_vocab,
    label_to_id
)


In [48]:
model.eval()

all_true = []
all_pred = []

with torch.no_grad():
    predictions = model(
        X_test.to(device),
        mask=M_test.to(device)
    )

for i in range(len(predictions)):
    seq_len = M_test[i].sum().item()
    true_seq = Y_test[i][:seq_len].tolist()
    pred_seq = predictions[i]

    all_true.extend(true_seq)
    all_pred.extend(pred_seq)


In [49]:
true_labels = [id_to_label[i] for i in all_true]
pred_labels = [id_to_label[i] for i in all_pred]


In [50]:
from sklearn.metrics import classification_report

print(classification_report(
    true_labels,
    pred_labels,
    digits=4
))


              precision    recall  f1-score   support

      B-DOSE     0.8421    0.7273    0.7805        22
      B-DRUG     1.0000    0.5417    0.7027        24
       B-DUR     0.7000    0.5000    0.5833        14
      B-FREQ     0.6000    0.3750    0.4615        24
     B-ROUTE     0.7778    0.7000    0.7368        10
      I-DOSE     0.8000    0.9091    0.8511        22
      I-DRUG     0.0000    0.0000    0.0000         5
       I-DUR     1.0000    0.6923    0.8182        13
      I-FREQ     0.5306    0.7879    0.6341        33
     I-ROUTE     0.2727    1.0000    0.4286        12
           O     0.0000    0.0000    0.0000        14

    accuracy                         0.6166       193
   macro avg     0.5930    0.5667    0.5452       193
weighted avg     0.6523    0.6166    0.6014       193



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
text = "Paracetamol 650 mg twice a day for 3 days after food"



In [53]:
tokens = text.split()

encoded = [
    word_vocab.get(tok, word_vocab["<UNK>"])
    for tok in tokens
]

X = torch.tensor([encoded]).to(device)
mask = torch.tensor([[1]*len(encoded)], dtype=torch.bool).to(device)


In [54]:
model.eval()

with torch.no_grad():
    pred_ids = model(X, mask=mask)[0]


In [55]:
for tok, tag_id in zip(tokens, pred_ids):
    print(f"{tok:12} → {id_to_label[tag_id]}")


Paracetamol  → B-DRUG
650          → B-DOSE
mg           → I-DOSE
twice        → I-FREQ
a            → I-FREQ
day          → I-FREQ
for          → I-FREQ
3            → B-DUR
days         → I-DUR
after        → B-ROUTE
food         → I-ROUTE
