## Data Cleaning

In [11]:
import pandas as pd

# Load the CSV file (adjust encoding if needed)
df = pd.read_csv(r"C:\Users\Tian\Desktop\NLP_Task7\Datasets\NER dataset.csv", encoding='ISO-8859-1')

# Step 1: Fill missing 'Sentence #' by forward filling
df['Sentence #'] = df['Sentence #'].fillna(method='ffill')

# Step 2: Drop any rows with missing values in required columns
df = df.dropna(subset=['Word', 'POS', 'Tag'])

# Step 3: Strip whitespace from strings in all relevant columns
df['Word'] = df['Word'].str.strip()
df['POS'] = df['POS'].str.strip()
df['Tag'] = df['Tag'].str.strip()

# Step 4: Preview the cleaned dataset
print("✅ Cleaned dataset shape:", df.shape)
print(df.head(10))

# Optional: Save cleaned data to new CSV
df.to_csv(r"C:\Users\Tian\Desktop\NLP_Task7\NER Recog\cleaned_NER_dataset.csv", index=False)


  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


✅ Cleaned dataset shape: (1048565, 4)
    Sentence #           Word  POS    Tag
0  Sentence: 1      Thousands  NNS      O
1  Sentence: 1             of   IN      O
2  Sentence: 1  demonstrators  NNS      O
3  Sentence: 1           have  VBP      O
4  Sentence: 1        marched  VBN      O
5  Sentence: 1        through   IN      O
6  Sentence: 1         London  NNP  B-geo
7  Sentence: 1             to   TO      O
8  Sentence: 1        protest   VB      O
9  Sentence: 1            the   DT      O


In [12]:
import pandas as pd

# Load the cleaned CSV (already cleaned based on your previous code)
df = pd.read_csv(r"C:\Users\Tian\Desktop\NLP_Task7\NER Recog\cleaned_NER_dataset.csv")

# Group by sentence
grouped = df.groupby("Sentence #").agg(list)

# Extract sequences
sentences = grouped['Word'].tolist()
ner_tags = grouped['Tag'].tolist()


In [13]:
def word2features(sent, i):
    word = sent[i]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent)-1:
        word1 = sent[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

def extract_features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


In [14]:
# Ensure all elements in sentences are valid strings
cleaned_sentences = [[str(word) if not isinstance(word, str) else word for word in sentence] for sentence in sentences]

X = [extract_features(s) for s in cleaned_sentences]  # List of list of dicts
y = ner_tags  # List of list of tags (already done)

In [15]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X, y)

In [16]:
from sklearn_crfsuite import metrics

y_pred = crf.predict(X)

print(metrics.flat_classification_report(y, y_pred))


              precision    recall  f1-score   support

       B-art       0.92      0.66      0.77       402
       B-eve       0.85      0.70      0.77       308
       B-geo       0.91      0.95      0.93     37644
       B-gpe       0.98      0.95      0.96     15870
       B-nat       0.92      0.60      0.73       201
       B-org       0.91      0.85      0.88     20143
       B-per       0.95      0.92      0.93     16990
       B-tim       0.96      0.92      0.94     20333
       I-art       0.93      0.76      0.84       297
       I-eve       0.86      0.70      0.77       253
       I-geo       0.91      0.92      0.91      7414
       I-gpe       0.95      0.69      0.80       198
       I-nat       0.90      0.69      0.78        51
       I-org       0.93      0.93      0.93     16784
       I-per       0.94      0.96      0.95     17251
       I-tim       0.93      0.89      0.91      6528
           O       1.00      1.00      1.00    887898

    accuracy              

In [17]:
from collections import Counter

# Build word and tag vocabularies
all_words = [word for sentence in sentences for word in sentence]
all_tags = [tag for tag_list in ner_tags for tag in tag_list]

word_vocab = {word: i + 2 for i, (word, _) in enumerate(Counter(all_words).items())}
tag_vocab = {tag: i for i, tag in enumerate(set(all_tags))}
tag_vocab['<PAD>'] = len(tag_vocab)

# Add special tokens
word_vocab['<PAD>'] = 0
word_vocab['<UNK>'] = 1

# Reverse vocab for later use
reverse_word_vocab = {v: k for k, v in word_vocab.items()}
reverse_tag_vocab = {v: k for k, v in tag_vocab.items()}


In [18]:
def sentence_to_indices(sentence, word_vocab):
    return [word_vocab.get(word, word_vocab['<UNK>']) for word in sentence]

def tags_to_indices(tags, tag_vocab):
    return [tag_vocab[tag] for tag in tags]

X_data = [sentence_to_indices(sentence, word_vocab) for sentence in sentences]
y_data = [tags_to_indices(tags, tag_vocab) for tags in ner_tags]


In [22]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.nn import functional as F

X_data_padded = pad_sequence([torch.tensor(sentence) for sentence in X_data], batch_first=True, padding_value=word_vocab['<PAD>'])
y_data_padded = pad_sequence([torch.tensor(tags) for tags in y_data], batch_first=True, padding_value=tag_vocab['<PAD>'])


In [23]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNN_NER(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128):
        super(RNN_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word_vocab['<PAD>'])
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, x, lengths):
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        # ✅ Pack padded sequence (lengths must be on CPU!)
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed)
        output, _ = pad_packed_sequence(packed_output, batch_first=True, total_length=x.size(1))  # ✅ Correct unpack
        logits = self.fc(output)
        return logits


In [24]:
model = RNN_NER(vocab_size=len(word_vocab), tagset_size=len(tag_vocab))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss(ignore_index=tag_vocab['<PAD>'])


In [25]:
epochs = 5
model.train()

# Make sure lengths are calculated from padded input
lengths = [torch.count_nonzero(seq != word_vocab['<PAD>']).item() for seq in X_data_padded]

for epoch in range(epochs):
    total_loss = 0
    for i in range(len(X_data_padded)):
        inputs = X_data_padded[i].unsqueeze(0)  # (1, seq_len)
        targets = y_data_padded[i].unsqueeze(0)  # (1, seq_len)
        seq_len = torch.tensor([lengths[i]])

        optimizer.zero_grad()
        outputs = model(inputs, seq_len)
        loss = loss_fn(outputs.view(-1, len(tag_vocab)), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")


Epoch 1/5, Loss: 10039.8118
Epoch 2/5, Loss: 6138.1237
Epoch 3/5, Loss: 5343.0849
Epoch 4/5, Loss: 4934.2215
Epoch 5/5, Loss: 4705.9500
