## RNN Model

In [16]:
import pandas as pd

# Load the raw NER corpus (adjust encoding if needed)
df = pd.read_csv(r"C:\Users\Tian\Desktop\NLP_Task7\Datasets\NER dataset.csv", encoding='ISO-8859-1')

# Step 1: Rows without a 'Sentence #' value inherit the label of the closest
# labelled row above them. Series.ffill() is the supported spelling; the older
# fillna(method='ffill') form is deprecated and emits a FutureWarning.
df['Sentence #'] = df['Sentence #'].ffill()

# Step 2: Drop any rows with missing values in the columns used downstream
df = df.dropna(subset=['Word', 'POS', 'Tag'])

# Step 3: Strip surrounding whitespace from every text column
for column in ('Word', 'POS', 'Tag'):
    df[column] = df[column].str.strip()

# Step 4: Preview the cleaned dataset
print("✅ Cleaned dataset shape:", df.shape)
print(df.head(10))

# Persist the cleaned data so later cells can reload it without re-cleaning
df.to_csv(r"C:\Users\Tian\Desktop\NLP_Task7\NER_Recog\cleaned_NER_dataset.csv", index=False)


  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


✅ Cleaned dataset shape: (1048565, 4)
    Sentence #           Word  POS    Tag
0  Sentence: 1      Thousands  NNS      O
1  Sentence: 1             of   IN      O
2  Sentence: 1  demonstrators  NNS      O
3  Sentence: 1           have  VBP      O
4  Sentence: 1        marched  VBN      O
5  Sentence: 1        through   IN      O
6  Sentence: 1         London  NNP  B-geo
7  Sentence: 1             to   TO      O
8  Sentence: 1        protest   VB      O
9  Sentence: 1            the   DT      O


In [17]:
import pandas as pd

# Load the cleaned CSV written by the cleaning cell.
# keep_default_na=False keeps every field as the literal text found in the
# file, so tokens such as "NA" or "null" stay strings instead of being
# converted to float NaN by pandas' default missing-value parsing.
df = pd.read_csv(
    r"C:\Users\Tian\Desktop\NLP_Task7\NER_Recog\cleaned_NER_dataset.csv",
    keep_default_na=False,
)

# Group tokens by sentence. sort=False keeps the sentences in the order they
# appear in the file; the default sorts the string keys lexicographically
# ("Sentence: 1", "Sentence: 10", "Sentence: 100", ...).
grouped = df.groupby("Sentence #", sort=False).agg(list)

# Extract parallel sequences: sentences[k] and ner_tags[k] describe the same sentence
sentences = grouped['Word'].tolist()
ner_tags = grouped['Tag'].tolist()


In [18]:
def word2features(sent, i):
    """Return the feature dictionary for the token at position ``i`` of ``sent``.

    The dictionary describes the token itself (lower-cased form, two- and
    three-character suffixes, casing and digit flags) plus the lower-cased
    form and casing flags of its immediate left and right neighbours.
    A missing left neighbour is marked with ``'BOS'`` and a missing right
    neighbour with ``'EOS'``.

    Args:
        sent: Sequence of string tokens making up one sentence.
        i: Index of the token to describe.

    Returns:
        dict mapping feature names to values.
    """
    token = sent[i]
    final_position = len(sent) - 1

    # Features of the token itself
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left context
    if i <= 0:
        feats['BOS'] = True  # Beginning of sentence
    else:
        left = sent[i - 1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right context
    if i >= final_position:
        feats['EOS'] = True  # End of sentence
    else:
        right = sent[i + 1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats

def extract_features(sent):
    """Return one feature dictionary per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token


In [19]:
# Coerce every token that is not already a str into its str() form so the
# string methods used during feature extraction are always available.
cleaned_sentences = [
    [word if isinstance(word, str) else str(word) for word in sentence]
    for sentence in sentences
]

X = list(map(extract_features, cleaned_sentences))  # one list of feature dicts per sentence
y = ner_tags  # one list of tag strings per sentence

In [20]:
# import sklearn_crfsuite

# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     c1=0.1,
#     c2=0.1,
#     max_iterations=100,
#     all_possible_transitions=True
# )

# crf.fit(X, y)

In [21]:
# from sklearn_crfsuite import metrics

# y_pred = crf.predict(X)

# print(metrics.flat_classification_report(y, y_pred))


In [22]:
from collections import Counter

# Flatten the corpus into one list of words and one list of tags
all_words = [word for sentence in sentences for word in sentence]
all_tags = [tag for tag_list in ner_tags for tag in tag_list]

# Words are numbered from 2 in order of first appearance; 0 and 1 are reserved
# for the special tokens assigned below.
word_vocab = {word: idx for idx, word in enumerate(Counter(all_words), start=2)}

# Tags are numbered in sorted order. Iterating a bare set() of strings yields
# an order that can change from one interpreter run to the next, which would
# make tag indices (and any saved model weights) irreproducible.
tag_vocab = {tag: idx for idx, tag in enumerate(sorted(set(all_tags)))}
tag_vocab['<PAD>'] = len(tag_vocab)  # padding label takes the last index

# Add special tokens
word_vocab['<PAD>'] = 0
word_vocab['<UNK>'] = 1

# Reverse vocab for decoding indices back to words / tags
reverse_word_vocab = {idx: word for word, idx in word_vocab.items()}
reverse_tag_vocab = {idx: tag for tag, idx in tag_vocab.items()}


In [23]:
def sentence_to_indices(sentence, word_vocab):
    """Map each word of ``sentence`` to its index in ``word_vocab``.

    Words missing from the vocabulary are mapped to the ``'<UNK>'`` index.
    """
    indices = []
    for token in sentence:
        indices.append(word_vocab.get(token, word_vocab['<UNK>']))
    return indices

def tags_to_indices(tags, tag_vocab):
    """Map each tag of ``tags`` to its index in ``tag_vocab``.

    Raises:
        KeyError: If a tag is not present in ``tag_vocab``.
    """
    indices = []
    for label in tags:
        indices.append(tag_vocab[label])
    return indices

# Encode the whole corpus: one list of word ids and one list of tag ids per sentence
X_data = list(sentence_to_indices(words, word_vocab) for words in sentences)
y_data = list(tags_to_indices(labels, tag_vocab) for labels in ner_tags)


In [24]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.nn import functional as F

# Convert every encoded sentence to a tensor and pad all of them to the length
# of the longest one, using the '<PAD>' index of the matching vocabulary.
X_data_padded = pad_sequence(
    list(map(torch.tensor, X_data)),
    batch_first=True,
    padding_value=word_vocab['<PAD>'],
)
y_data_padded = pad_sequence(
    list(map(torch.tensor, y_data)),
    batch_first=True,
    padding_value=tag_vocab['<PAD>'],
)

In [25]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNN_NER(nn.Module):
    """Bidirectional vanilla-RNN sequence tagger.

    Word indices are embedded, passed through a bidirectional ``nn.RNN`` and
    projected onto the tag set independently at every time step.

    Args:
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output classes per token.
        embedding_dim: Size of each word embedding.
        hidden_dim: Hidden size of the RNN per direction.
        padding_idx: Word index used for padding; its embedding row is
            initialised to zeros and receives no gradient. Defaults to 0,
            the index this notebook assigns to ``'<PAD>'``. Taking it as a
            parameter removes the constructor's dependence on the
            module-level ``word_vocab`` dictionary.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        # Forward and backward hidden states are concatenated, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, x, lengths):
        """Compute per-token tag logits.

        Args:
            x: Tensor of word indices, shape (batch, seq_len).
            lengths: 1-D tensor with the true (unpadded) length of each row of ``x``.

        Returns:
            Tensor of logits, shape (batch, seq_len, tagset_size).
        """
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        # Pack so the RNN skips padded positions (lengths must live on the CPU)
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed)
        # Unpack back to the input's seq_len so logits line up with the padded targets
        output, _ = pad_packed_sequence(packed_output, batch_first=True, total_length=x.size(1))
        logits = self.fc(output)
        return logits


In [26]:
# Seed PyTorch's RNG so the random weight initialisation, and therefore the
# reported loss/accuracy figures, are repeatable from a fresh kernel.
torch.manual_seed(42)

model = RNN_NER(vocab_size=len(word_vocab), tagset_size=len(tag_vocab))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Padded target positions hold the '<PAD>' tag index and are excluded from the loss
loss_fn = nn.CrossEntropyLoss(ignore_index=tag_vocab['<PAD>'])

In [27]:
import time

epochs = 5
model.train()

# Loop invariants, looked up once instead of on every training step
pad_word_idx = word_vocab['<PAD>']
pad_tag_idx = tag_vocab['<PAD>']
num_tags = len(tag_vocab)

# True (unpadded) length of every sentence: the number of non-<PAD> word ids
# per row, computed for the whole matrix in a single call.
lengths = torch.count_nonzero(X_data_padded != pad_word_idx, dim=1).tolist()

for epoch in range(epochs):
    start_time = time.time()
    total_loss = 0
    total_correct = 0
    total_tokens = 0

    # One optimisation step per sentence (batch size 1)
    for i in range(len(X_data_padded)):
        n_tokens = lengths[i]
        # Padding sits at the end of each row, so slicing to the true length
        # drops it. The padded positions are ignored by the loss and the
        # accuracy mask anyway; trimming just avoids embedding and projecting them.
        inputs = X_data_padded[i, :n_tokens].unsqueeze(0)   # (1, n_tokens)
        targets = y_data_padded[i, :n_tokens].unsqueeze(0)  # (1, n_tokens)
        seq_len = torch.tensor([n_tokens])

        optimizer.zero_grad()
        outputs = model(inputs, seq_len)  # (1, n_tokens, num_tags)

        # CrossEntropyLoss expects (N, C) logits and (N,) targets
        outputs_flat = outputs.reshape(-1, num_tags)
        targets_flat = targets.reshape(-1)

        loss = loss_fn(outputs_flat, targets_flat)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Token-level accuracy over non-padding positions
        predictions = torch.argmax(outputs, dim=-1)  # (1, n_tokens)
        mask = targets != pad_tag_idx
        correct = (predictions == targets) & mask
        total_correct += correct.sum().item()
        total_tokens += mask.sum().item()

    epoch_accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0
    elapsed_time = time.time() - start_time
    # Loss is the sum of the per-sentence losses over the epoch, not an average
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, Time: {elapsed_time:.2f}s")

Epoch 1/5, Loss: 9992.3239, Accuracy: 0.9410, Time: 787.60s
Epoch 2/5, Loss: 6137.5454, Accuracy: 0.9611, Time: 778.81s
Epoch 3/5, Loss: 5360.0123, Accuracy: 0.9654, Time: 782.75s
Epoch 4/5, Loss: 4949.4424, Accuracy: 0.9680, Time: 770.17s
Epoch 5/5, Loss: 4725.9290, Accuracy: 0.9693, Time: 755.59s


In [28]:
from sklearn.metrics import classification_report
import numpy as np

# NOTE(review): this loop scores the same X_data_padded / y_data_padded tensors
# the training loop iterates over, so the report reflects fit to the training
# data rather than held-out performance.
model.eval()
all_preds = []
all_labels = []

pad_tag_idx = tag_vocab['<PAD>']

with torch.no_grad():
    for row, (padded_words, padded_tags) in enumerate(zip(X_data_padded, y_data_padded)):
        inputs = padded_words.unsqueeze(0)   # (1, seq_len)
        targets = padded_tags.unsqueeze(0)   # (1, seq_len)
        seq_len = torch.tensor([lengths[row]])

        outputs = model(inputs, seq_len)
        predictions = torch.argmax(outputs, dim=-1)  # (1, seq_len)

        # Keep only the positions whose gold label is a real tag
        real_positions = padded_tags != pad_tag_idx
        all_preds.extend(predictions.squeeze(0)[real_positions].tolist())
        all_labels.extend(padded_tags[real_positions].tolist())

# Convert indices back to tag names
all_preds_tags = [reverse_tag_vocab[tag_idx] for tag_idx in all_preds]
all_labels_tags = [reverse_tag_vocab[tag_idx] for tag_idx in all_labels]

# Per-tag precision / recall / F1
print("Classification Report:")
print(classification_report(all_labels_tags, all_preds_tags, digits=4))


Classification Report:
              precision    recall  f1-score   support

       B-art     0.2132    0.1045    0.1402       402
       B-eve     0.7043    0.2630    0.3830       308
       B-geo     0.8398    0.9281    0.8818     37644
       B-gpe     0.9734    0.9270    0.9497     15870
       B-nat     0.7079    0.3134    0.4345       201
       B-org     0.7874    0.7232    0.7540     20143
       B-per     0.8483    0.8435    0.8459     16990
       B-tim     0.9374    0.8649    0.8997     20333
       I-art     0.1031    0.0774    0.0885       297
       I-eve     0.5909    0.1028    0.1751       253
       I-geo     0.7704    0.8183    0.7936      7414
       I-gpe     0.9623    0.5152    0.6711       198
       I-nat     0.8000    0.3137    0.4507        51
       I-org     0.8293    0.7914    0.8099     16784
       I-per     0.9182    0.8329    0.8735     17251
       I-tim     0.7830    0.8194    0.8008      6528
           O     0.9910    0.9939    0.9925    887898

   