In [1]:
import pandas as pd

df = pd.read_csv("Dataset/ner.csv", sep=",", encoding="latin1", engine="python", on_bad_lines="skip")

# Drop the unwanted unnamed column if it exists
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

print(df.shape)
print(df.columns.tolist())
print(df.head())


(1050795, 24)
['lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos', 'next-next-shape', 'next-next-word', 'next-pos', 'next-shape', 'next-word', 'pos', 'prev-iob', 'prev-lemma', 'prev-pos', 'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape', 'prev-prev-word', 'prev-shape', 'prev-word', 'sentence_idx', 'shape', 'word', 'tag']
      lemma next-lemma next-next-lemma next-next-pos next-next-shape  \
0  thousand         of        demonstr           NNS       lowercase   
1        of   demonstr            have           VBP       lowercase   
2  demonstr       have           march           VBN       lowercase   
3      have      march         through            IN       lowercase   
4     march    through          london           NNP     capitalized   

  next-next-word next-pos next-shape      next-word  pos  ... prev-prev-lemma  \
0  demonstrators       IN  lowercase             of  NNS  ...      __start2__   
1           have      NNS  lowercase  demonstrators  

In [64]:
train_sents = []
for idx, group in df.groupby("sentence_idx"):   # change if your column name is different
    sentence = []
    for _, row in group.iterrows():
        word = str(row["word"]) if pd.notna(row["word"]) else ""   # safe string conversion
        tag = row["tag"] if pd.notna(row["tag"]) else "PAD"
        sentence.append((word, tag))   # append only once
    train_sents.append(sentence)

print("Number of sentences:", len(train_sents))
print("First sentence example:", train_sents[0][:10])


Number of sentences: 35177
First sentence example: [('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O')]


In [93]:
# Create character vocabulary
all_chars = {c for sent in train_sents for word, tag in sent for c in word}

char2idx = {"PAD": 0, "UNK": 1}
char2idx.update({c: i for i, c in enumerate(sorted(all_chars), start=2)})

print("Char vocab size:", len(char2idx))


Char vocab size: 98


In [94]:
from sklearn.model_selection import train_test_split

# train_sents is your full list of sentences (word, tag pairs)
train_sents_train, train_sents_test = train_test_split(
    train_sents, test_size=0.2, random_state=42
)

# Now collect all unique words from both splits
all_words_from_train_and_test = set(
    [str(w) for sent in train_sents_train + train_sents_test for w, t in sent]
)


# Build word2idx mapping
word2idx = {"PAD": 0, "UNK": 1}
word2idx.update({w: i+2 for i, w in enumerate(sorted(all_words_from_train_and_test))})

print("Training sentences:", len(train_sents_train))
print("Test sentences:", len(train_sents_test))
print("Vocab size:", len(word2idx))


Training sentences: 28141
Test sentences: 7036
Vocab size: 30173


In [95]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = max(len(s) for s in train_sents)
max_word_len = 15  # max chars per word

# Build word2idx and tag2idx
words = sorted(set([str(w) for sent in train_sents for w, t in sent]))
word2idx = {"PAD": 0, "UNK": 1}
word2idx.update({w: i+2 for i, w in enumerate(words)})




tags = sorted(set(df["tag"].dropna().values))
tag2idx = {"PAD": 0}
for i, t in enumerate(tags, start=1):
    tag2idx[t] = i

# Encode one sentence
def encode_sentence(sent, word2idx, tag2idx, char2idx, max_len=max_len, max_word_len=max_word_len):
    word_ids, char_ids, tag_ids = [], [], []
    
    for word, tag in sent[:max_len]:
        word_ids.append(word2idx.get(word, word2idx["UNK"]))
        tag_ids.append(tag2idx.get(tag, tag2idx["PAD"]))
        
        # char-level indices
        chars = [char2idx.get(c, char2idx["UNK"]) for c in word[:max_word_len]]
        # pad chars
        while len(chars) < max_word_len:
            chars.append(char2idx["PAD"])
        char_ids.append(chars)
    
    # pad sentence-level
    while len(word_ids) < max_len:
        word_ids.append(word2idx["PAD"])
        tag_ids.append(tag2idx["PAD"])
        char_ids.append([char2idx["PAD"]] * max_word_len)
    
    return word_ids, char_ids, tag_ids

# Encode all sentences
X_words, X_chars, y_tags = [], [], []
for sent in train_sents:
    w, c, t = encode_sentence(sent, word2idx, tag2idx, char2idx)
    X_words.append(w)
    X_chars.append(c)
    y_tags.append(t)

import numpy as np
X_words = np.array(X_words)
X_chars = np.array(X_chars)
y_tags  = np.array(y_tags)

print("X_words:", X_words.shape)
print("X_chars:", X_chars.shape)
print("y_tags:", y_tags.shape)


X_words: (35177, 140)
X_chars: (35177, 140, 15)
y_tags: (35177, 140)


In [96]:
from collections import defaultdict

# Extract words and tags from dataset
words = list(set(df["word"].dropna().values))   # dropna() ensures no NaN values
tags = sorted(set(df["tag"].dropna().values))   # sorted for consistency

# Create word2idx mapping (+2 for PAD and UNK)
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

# Create tag2idx mapping
tag2idx = {"PAD": 0}
tag2idx.update({t: i+1 for i, t in enumerate(tags)})

# Reverse mapping for decoding predictions
idx2tag = {i: t for t, i in tag2idx.items()}

print("Vocab size:", len(word2idx))
print("Number of tags:", len(tag2idx))


Vocab size: 30172
Number of tags: 18


In [97]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

# Ensure PAD and UNK exist in word2idx
if "PAD" not in word2idx:
    word2idx["PAD"] = len(word2idx)
if "UNK" not in word2idx:
    word2idx["UNK"] = len(word2idx)

# Split sentences
train_sents_train, train_sents_test = train_test_split(train_sents, test_size=0.2, random_state=42)

# Maximum sentence length for padding
max_len = max(len(s) for s in train_sents_train + train_sents_test)
print("Max sentence length:", max_len)

# Encode word indices
X_train = [[word2idx.get(w, word2idx["UNK"]) for w, t in s] for s in train_sents_train]
X_test  = [[word2idx.get(w, word2idx["UNK"]) for w, t in s] for s in train_sents_test]

# Pad sequences to max_len
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', value=word2idx["PAD"])
X_test  = pad_sequences(X_test,  maxlen=max_len, padding='post', value=word2idx["PAD"])

print("Embedding vocab size:", len(word2idx))
print("X_train shape:", X_train.shape, "Max index in X_train:", X_train.max())
print("X_test shape:", X_test.shape, "Max index in X_test:", X_test.max())


Max sentence length: 140
Embedding vocab size: 30172
X_train shape: (28141, 140) Max index in X_train: 30172
X_test shape: (7036, 140) Max index in X_test: 30172


In [None]:
import numpy as np
import torch  # 

def load_glove_embeddings(glove_path, word2idx, embedding_dim=100):
    embeddings_index = {}
    with open(glove_path, "r", encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = vector

    #  Add +1 to handle max index properly
    embedding_matrix = np.zeros((len(word2idx) + 1, embedding_dim))

    for word, idx in word2idx.items():
        vec = embeddings_index.get(word.lower())
        if vec is not None:
            embedding_matrix[idx] = vec

    return torch.tensor(embedding_matrix, dtype=torch.float32)



In [None]:
import torch

glove_path = "glove.6B.100d.txt"  # make sure this file exists

# Check if embeddings are loaded successfully
embedding_matrix = load_glove_embeddings(glove_path, word2idx, embedding_dim=100)

if embedding_matrix is not None:
    print("Embedding matrix shape:", embedding_matrix.shape)
else:
    print("Embedding matrix could not be created. Check GloVe path and word2idx.")


Embedding matrix shape: torch.Size([30173, 100])


In [100]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = max(len(s) for s in train_sents)

# Ensure PAD and UNK tokens exist in vocab
if "PAD" not in word2idx:
    word2idx["PAD"] = len(word2idx)
if "UNK" not in word2idx:
    word2idx["UNK"] = len(word2idx)

# Rebuild X with safe mapping
X = [[word2idx.get(w, word2idx["UNK"]) for w, t in s] for s in train_sents]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["PAD"])

# Build tag dictionary with PAD
tags = sorted(set(df["tag"].dropna().values))
tag2idx = {"PAD": 0}
tag2idx.update({t: i+1 for i, t in enumerate(tags)})
idx2tag = {i: t for t, i in tag2idx.items()}

# Rebuild y
y = [[tag2idx[t] for w, t in s] for s in train_sents]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["PAD"])

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Vocab size:", len(word2idx))
print("Number of tags:", len(tag2idx))
print("Max index in X:", X.max())
print("Max index in y:", y.max())


X shape: (35177, 140)
y shape: (35177, 140)
Vocab size: 30172
Number of tags: 18
Max index in X: 30172
Max index in y: 17


In [101]:
X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)


In [102]:
# Encode dataset into word, char, and tag ids
X_words, X_chars, y_tags = [], [], []

for sent in train_sents:  # <---- make sure you’re using train_sents
    w, c, t = encode_sentence(sent, word2idx, tag2idx, char2idx, max_len=140, max_word_len=max_word_len)
    X_words.append(w)
    X_chars.append(c)
    y_tags.append(t)

X_words = torch.tensor(X_words, dtype=torch.long)
X_chars = torch.tensor(X_chars, dtype=torch.long)   # no need for np.array
y_tags  = torch.tensor(y_tags, dtype=torch.long)

print("Shapes -> Words:", X_words.shape, "Chars:", X_chars.shape, "Tags:", y_tags.shape)


Shapes -> Words: torch.Size([35177, 140]) Chars: torch.Size([35177, 140, 15]) Tags: torch.Size([35177, 140])


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

#  Make sure everything is numpy first
X_words = np.array(X_words)
X_chars = np.array(X_chars)   # (num_sents, seq_len, max_word_len)
y_tags  = np.array(y_tags)

# Dataset
class NERDataset(Dataset):
    def __init__(self, X_words, X_chars, y_tags, pad_tag_idx=0):
        self.words = torch.tensor(X_words, dtype=torch.long)
        self.chars = torch.tensor(X_chars, dtype=torch.long)
        self.tags  = torch.tensor(y_tags, dtype=torch.long)
        self.pad_tag_idx = pad_tag_idx

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        word_ids = self.words[idx]
        char_ids = self.chars[idx]
        tag_ids  = self.tags[idx]

        #  mask for real tokens only
        mask = (tag_ids != self.pad_tag_idx)

        return word_ids, char_ids, tag_ids, mask


# Split numpy arrays first
X_words_train, X_words_test, X_chars_train, X_chars_test, y_tags_train, y_tags_test = train_test_split(
    X_words, X_chars, y_tags, test_size=0.2, random_state=42
)

# ✅ Create datasets (conversion to torch happens inside NERDataset)
train_dataset = NERDataset(X_words_train, X_chars_train, y_tags_train, pad_tag_idx=tag2idx["PAD"])
test_dataset  = NERDataset(X_words_test,  X_chars_test,  y_tags_test,  pad_tag_idx=tag2idx["PAD"])

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=32, shuffle=False)


# 🔍 Check one batch
for word_ids, char_ids, tag_ids, mask in train_loader:
    print("Words:", word_ids.shape)   # (B, seq_len)
    print("Chars:", char_ids.shape)   # (B, seq_len, max_word_len)
    print("Tags:", tag_ids.shape)     # (B, seq_len)
    print("Mask:", mask.shape)        # (B, seq_len)
    break


Words: torch.Size([32, 140])
Chars: torch.Size([32, 140, 15])
Tags: torch.Size([32, 140])
Mask: torch.Size([32, 140])


In [104]:
assert X.shape[0] == y.shape[0], "Mismatch: number of sentences differs!"
assert X.shape[1] == y.shape[1], "Mismatch: sequence lengths differ!"


In [105]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes:", X_test.shape, y_test.shape)


Train shapes: torch.Size([31659, 140]) torch.Size([31659, 140])
Test shapes: torch.Size([3518, 140]) torch.Size([3518, 140])


In [106]:
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]
assert X_train.shape[1] == y_train.shape[1] == X_test.shape[1] == y_test.shape[1]


In [119]:
import torch
import torch.nn as nn
from torchcrf import CRF

class BiGRU_CRF(nn.Module):
    def __init__(self, vocab_size, char_vocab_size, word_emb_dim, char_emb_dim, 
                 hidden_dim, char_hidden_dim, num_tags, pad_idx):
        super(BiGRU_CRF, self).__init__()
        
        # Word embeddings
        self.word_emb = nn.Embedding(vocab_size, word_emb_dim, padding_idx=pad_idx)

        # Char embeddings + Char-level BiGRU
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=pad_idx)
        self.char_gru = nn.GRU(char_emb_dim, char_hidden_dim, batch_first=True, bidirectional=True)

        # Dropout layer (⚡ Add this)
        self.dropout = nn.Dropout(0.5)

        # Word-level BiGRU
        self.bigru = nn.GRU(word_emb_dim + 2*char_hidden_dim, hidden_dim, 
                            batch_first=True, bidirectional=True)

        # Linear layer to tag space
        self.fc = nn.Linear(2*hidden_dim, num_tags)

        # CRF layer
        self.crf = CRF(num_tags, batch_first=True)



    def forward(self, word_ids, char_ids):
        """
        word_ids: (batch, seq_len)
        char_ids: (batch, seq_len, word_len)
        """
        # Word embeddings
        word_emb = self.word_emb(word_ids)

        # Char embeddings
        batch_size, seq_len, word_len = char_ids.size()
        char_ids = char_ids.long()

        char_ids = char_ids.view(batch_size * seq_len, word_len)  # flatten
        char_emb = self.char_emb(char_ids)  # (batch*seq_len, word_len, char_emb_dim)

        _, char_hidden = self.char_gru(char_emb)  # last hidden states
        char_repr = torch.cat([char_hidden[0], char_hidden[1]], dim=-1)  # (batch*seq_len, 2*char_hidden_dim)
        char_repr = char_repr.view(batch_size, seq_len, -1)  # reshape back

        # Combine word + char
        embeddings = torch.cat([word_emb, char_repr], dim=-1)
        embeddings = self.dropout(embeddings)

        # Word-level BiGRU
        gru_out, _ = self.bigru(embeddings)
        gru_out = self.dropout(gru_out)

        emissions = self.fc(gru_out)
        return emissions

    def loss(self, word_ids, char_ids, tags, mask):
        emissions = self.forward(word_ids, char_ids)
        return -self.crf(emissions, tags, mask=mask, reduction='mean')

    def predict(self, word_ids, char_ids, mask):
        emissions = self.forward(word_ids, char_ids)
        return self.crf.decode(emissions, mask=mask)


In [124]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiGRU_CRF(
    vocab_size=len(word2idx),
    num_tags=len(tag2idx),
    char_vocab_size=len(char2idx),
    word_emb_dim=100,      # size of word embeddings
    char_emb_dim=30,       # size of character embeddings
    char_hidden_dim=50,    # hidden dim for char-level BiGRU
    hidden_dim=128,        # BiGRU hidden size
    pad_idx=word2idx["PAD"]
).to(device)


In [125]:
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [126]:
print("Max index in X_train:", X_train.max())
print("Vocab size:", len(word2idx))


Max index in X_train: tensor(30172)
Vocab size: 30172


In [127]:
from sklearn.model_selection import train_test_split

# Split the dataset
X_words_train, X_words_test, X_chars_train, X_chars_test, y_train, y_test = train_test_split(
    X_words, X_chars, y_tags, test_size=0.1, random_state=42
)

# Convert to torch tensors and create mask
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_words_train, dtype=torch.long),
    torch.tensor(X_chars_train, dtype=torch.long),
    torch.tensor(y_train, dtype=torch.long),
    torch.tensor((y_train != tag2idx["PAD"]), dtype=torch.bool)
)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_words_test, dtype=torch.long),
    torch.tensor(X_chars_test, dtype=torch.long),
    torch.tensor(y_test, dtype=torch.long),
    torch.tensor((y_test != tag2idx["PAD"]), dtype=torch.bool)
)

# DataLoaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = torch.utils.data.DataLoader(test_dataset, batch_size=32)


In [128]:
print("Max char index in train:", X_chars_train.max())
print("Max char index in test:", X_chars_test.max())
print("Char vocab size:", len(char2idx))


Max char index in train: 96
Max char index in test: 97
Char vocab size: 98


In [129]:
batch = next(iter(train_loader))
print(len(batch))  # should be 4
for x in batch:
    print(x.shape)



4
torch.Size([32, 140])
torch.Size([32, 140, 15])
torch.Size([32, 140])
torch.Size([32, 140])


In [130]:
from sklearn.model_selection import train_test_split

# Assuming you already have these prepared:
# X_words, X_chars, y_tags, sentences (all same length)

(
    X_words_train, X_words_test,
    X_chars_train, X_chars_test,
    y_tags_train, y_tags_test
) = train_test_split(
    X_words, X_chars, y_tags,
    test_size=0.1,
    random_state=42
)

print("Number of train samples:", len(X_words_train))
print("Number of test samples:", len(X_words_test))



Number of train samples: 31659
Number of test samples: 3518


In [131]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Maximum sequence length for padding
max_len = 140  

# ✅ Convert words to indices with UNK fallback
X_words_train = [
    [word2idx.get(w, word2idx["UNK"]) for w, t in sent]
    for sent in train_sents_train  # <-- using the correct training sentences list
]

X_words_test = [
    [word2idx.get(w, word2idx["UNK"]) for w, t in sent]
    for sent in train_sents_test   # <-- using the correct test sentences list
]

# ✅ Pad sequences
X_words_train = pad_sequences(
    X_words_train, maxlen=max_len, padding="post", value=word2idx["PAD"]
)
X_words_test = pad_sequences(
    X_words_test, maxlen=max_len, padding="post", value=word2idx["PAD"]
)

# ✅ Debugging info
print("X_words_train shape:", X_words_train.shape)
print("X_words_test shape:", X_words_test.shape)


X_words_train shape: (28141, 140)
X_words_test shape: (7036, 140)


In [132]:
from tqdm import tqdm
import torch

n_epochs = 50
patience = 5   # stop if no improvement for 5 epochs
best_val_acc = 0
patience_counter = 0

for epoch in range(n_epochs):
    model.train()
    total_loss = 0

    # --- Training ---
    for word_ids, char_ids, tag_ids, _ in train_loader:  # ignore precomputed mask
        word_ids = word_ids.to(device)
        char_ids = char_ids.to(device)
        tag_ids = tag_ids.to(device)

        # recompute mask
        mask = (tag_ids != tag2idx["PAD"])

        optimizer.zero_grad()
        loss = model.loss(word_ids, char_ids, tag_ids, mask=mask)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {avg_loss:.4f}")

    # --- Validation ---
    model.eval()
    with torch.no_grad():
        correct, total = 0, 0
        for word_ids, char_ids, tag_ids, _ in test_loader:
            word_ids = word_ids.to(device)
            char_ids = char_ids.to(device)
            tag_ids = tag_ids.to(device)

            # recompute mask
            mask = (tag_ids != tag2idx["PAD"])

            predictions = model.predict(word_ids, char_ids, mask=mask)

            for pred_seq, true_seq, mask_seq in zip(predictions, tag_ids, mask):
                true_seq = true_seq[mask_seq].tolist()  # only real tokens
                assert len(pred_seq) == len(true_seq)

                total += len(true_seq)
                correct += sum(p == t for p, t in zip(pred_seq, true_seq))

        val_acc = correct / total if total > 0 else 0
        print(f"Validation Accuracy: {val_acc:.4f}")

        # --- Early Stopping ---
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), "best_model.pt")
            print("✅ Model improved. Saved checkpoint.")
        else:
            patience_counter += 1
            print(f"⚠️ No improvement. Patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print("⏹ Early stopping triggered.")
                break


IndexError: index out of range in self

In [55]:
from seqeval.metrics import classification_report, f1_score

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for X_batch, y_batch, mask in test_loader:
        X_batch, y_batch, mask = X_batch.to(device), y_batch.to(device), mask.to(device)
        preds = model.predict(X_batch, mask)
        for p, y, m in zip(preds, y_batch, mask):
            # keep only non-PAD tokens
            true = [idx2tag[idx.item()] for idx, mask_val in zip(y, m) if mask_val == 1]
            pred = [idx2tag[idx] for idx, mask_val in zip(p, m) if mask_val == 1]
            all_labels.append(true)
            all_preds.append(pred)

print(classification_report(all_labels, all_preds))
print("Overall F1:", f1_score(all_labels, all_preds))


              precision    recall  f1-score   support

         art       0.12      0.10      0.11        40
         eve       0.28      0.33      0.30        30
         geo       0.82      0.84      0.83      3792
         gpe       0.89      0.93      0.91      1608
         nat       0.40      0.36      0.38        22
         org       0.66      0.64      0.65      1989
         per       0.71      0.70      0.70      1653
         tim       0.86      0.84      0.85      2040

   micro avg       0.79      0.79      0.79     11174
   macro avg       0.59      0.59      0.59     11174
weighted avg       0.79      0.79      0.79     11174

Overall F1: 0.7888695341462325


In [56]:
def show_predictions(sentence_idx=0):
    words = [w for w, t in train_sents[sentence_idx]]
    true_tags = [t for w, t in train_sents[sentence_idx]]
    X_sample = torch.tensor([X_test[sentence_idx]], dtype=torch.long).to(device)
    mask = (X_sample != word2idx["PAD"]).to(device)

    pred_tags = model.predict(X_sample, mask)[0]
    pred_tags = [idx2tag[idx] for idx in pred_tags]

    for w, t, p in zip(words, true_tags, pred_tags):
        print(f"{w:15}  True: {t:5}  Pred: {p:5}")


In [57]:
torch.save(model.state_dict(), "bigru_crf_ner.pth")



In [58]:
def predict_sentence(sentence):
    # convert words to indices
    x = [word2idx.get(w, word2idx["UNK"]) for w in sentence]
    x = torch.tensor([x], dtype=torch.long).to(device)

    mask = (x != word2idx["PAD"]).to(device)
    preds = model.predict(x, mask)[0]

    return list(zip(sentence, [idx2tag[idx] for idx in preds]))

# Example usage
print(predict_sentence(["Akash", "is", "working", "in", "Senscript", "located","in","Kochi", "."]))


[('Akash', 'O'), ('is', 'O'), ('working', 'O'), ('in', 'O'), ('Senscript', 'O'), ('located', 'O'), ('in', 'O'), ('Kochi', 'O'), ('.', 'O')]


In [41]:
def predict_sentence(sentence, model, word2idx, idx2tag, device="cpu"):
    """
    Predict NER tags for a given input sentence.
    sentence: list of words (already tokenized).
    """
    model.eval()

    # Convert words to indices (UNK for unknown)
    x = [word2idx.get(w, word2idx["UNK"]) for w in sentence]
    x = torch.tensor([x], dtype=torch.long).to(device)

    # Mask (1 for real tokens, 0 for PAD)
    mask = (x != word2idx["PAD"]).to(device)

    # Predict
    preds = model.predict(x, mask)[0]
    pred_tags = [idx2tag[idx] for idx in preds]

    return list(zip(sentence, pred_tags))


# 🔹 Example usage with user input
while True:
    user_input = input("\nEnter a sentence (or 'quit' to exit): ")
    if user_input.lower() == "quit":
        break

    # Very basic tokenization (split by spaces)
    words = user_input.strip().split()
    predictions = predict_sentence(words, model, word2idx, idx2tag, device)

    print("\nPredicted NER tags:")
    for w, t in predictions:
        print(f"{w:15} -> {t}")



Predicted NER tags:
Akash           -> O
is              -> O
working         -> O
in              -> O
senscript       -> O
technologies    -> O
located         -> O
in              -> O
kochi           -> B-geo

Predicted NER tags:
exit            -> O


In [5]:
import torch

# Load the mappings from the .pt file
mappings = torch.load("label_mappings.pt", map_location="cpu")

print("Keys in mappings:", mappings.keys())

# Extract actual mappings
id2label = mappings.get("id2label")
label2id = mappings.get("label2id")

print("id2label:", type(id2label), "entries:", len(id2label) if id2label else None)
print("label2id:", type(label2id), "entries:", len(label2id) if label2id else None)

# Optional: inspect first few entries
if id2label:
    print("Sample id2label:", dict(list(id2label.items())[:5]))
if label2id:
    print("Sample label2id:", dict(list(label2id.items())[:5]))


Keys in mappings: dict_keys(['id2label', 'label2id'])
id2label: <class 'dict'> entries: 17
label2id: <class 'dict'> entries: 17
Sample id2label: {0: '0', 1: '1', 2: '2', 3: '3', 4: '4'}
Sample label2id: {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4}


In [2]:
import pickle

with open("label_mappings.pt", "rb") as f:
    mappings = pickle.load(f)

# Handle both dict and tuple/list cases
if isinstance(mappings, dict):
    char2idx = mappings.get("char2idx")
    tag2idx = mappings.get("tag2idx")
    idx2tag = mappings.get("idx2tag") or {v: k for k, v in tag2idx.items()}
elif isinstance(mappings, (tuple, list)):
    if len(mappings) == 3:
        char2idx, tag2idx, idx2tag = mappings
    elif len(mappings) == 2:
        char2idx, tag2idx = mappings
        idx2tag = {v: k for k, v in tag2idx.items()}
    else:
        raise ValueError(f"Unexpected mappings length: {len(mappings)}")
else:
    raise ValueError("Unsupported format in label_mappings.pt")


UnpicklingError: A load persistent id instruction was encountered,
but no persistent_load function was specified.