# HW4-DL for Name Entity Recognition (NER)

- Detravious Jamari Brinkley
- CSCI-544: Applied Natural Language Processing
- python version: 3

---

- Name Entity Recognition []

In [1]:
# imports
import torch

import numpy as np
import pandas as pd


import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm

from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Load Data

In [2]:
data = pd.read_csv("../datasets/ner_practice/ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")
data

  data = data.fillna(method="ffill")


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [3]:
def update_index(df):
    current_col = None
    new_index = 0  # Initial new index
    # empty_row = pd.DataFrame(columns=df.columns)

    for i, row in df.iterrows():
        # Check if column name changes
        if current_col != row['Sentence #']:
            current_col = row['Sentence #']
            new_index = 1  # Reset new index to 1
            
            empty_row = pd.Series([None] * len(df.columns), index=df.columns)
            
        df.at[i, 'New Sentence Index'] = new_index
        new_index += 1  # Increment new index
        
    return df

In [4]:
# Add a new column 'Index' and update it
data['New Sentence Index'] = None
data = update_index(data)
data.drop(columns=["Sentence #", "POS"], inplace=True)
data.rename(columns={'Tag': 'NER Tag'}, inplace=True)

In [5]:
def add_empty_row(df):
    new_rows = []
    for index, row in df.iterrows():
        if row['New Sentence Index'] == 1:
            empty_row = pd.Series([''] * len(row), index=row.index)
            new_rows.append(empty_row)
        new_rows.append(row)
    return pd.DataFrame(new_rows)

In [6]:
data = add_empty_row(data)
data.reset_index(inplace=True)
data.drop(columns=["index"], inplace=True)
data = data.iloc[1:]

In [7]:
data

Unnamed: 0,Word,NER Tag,New Sentence Index
1,Thousands,O,1
2,of,O,2
3,demonstrators,O,3
4,have,O,4
5,marched,O,5
...,...,...,...
1096529,they,O,4
1096530,responded,O,5
1096531,to,O,6
1096532,the,O,7


In [8]:
def get_stats(df, col_name):
    print(f"Unique {col_name} in corpus:", df[col_name].nunique())

    values_in_col_name = list(set(data[col_name].values))[1:]
    len_of_col_name = len(values_in_col_name)

    return len_of_col_name, values_in_col_name

In [9]:
def create_mappings(input_to_map):
    """Pair word with corresponding index"""

    word_to_index = {}

    for idx, word in tqdm(enumerate(input_to_map)):
        # If the word is not already in the dictionary, add it with its index
        # if word not in word_to_index:
        # word_to_index[word] = input_to_map.index(word)
        word_to_index[word] = idx + 1
    return word_to_index


In [10]:
num_words, words = get_stats(data, "Word")
num_words, words 

Unique Word in corpus: 35178


(35177,
 ['rampage',
  '405',
  'pouring',
  'appear',
  'Casamance',
  'incensed',
  'Alzouma',
  'Dayton',
  'snag',
  'Bernanke',
  'grounds',
  'allot',
  'outperformed',
  'Xinhua',
  'boastful',
  'Sarawak',
  'estimated',
  'apprehended',
  'Pondicherry',
  'Carreno',
  'assassins',
  'exemption',
  'realizes',
  'Teir',
  'Kirkuk',
  'sketch',
  'caverns',
  'Scilingo',
  'grip',
  'gridlock',
  'fatigues',
  'Salafist',
  'good-bye',
  'Sadat',
  'war-damaged',
  'involving',
  'Congressman',
  'Cleric',
  'Dalian',
  'notional',
  'Anabel',
  'relied',
  'tamed',
  'Dobrynska',
  'Payne',
  'behaviour',
  'go-betweens',
  'sleeve',
  'arthritis',
  'westward',
  'Blanco',
  'Saparmurat',
  'overburdened',
  'pontificate',
  'Caspian',
  'constraints',
  '170-meter',
  'Fadilah',
  'supervising',
  'Si',
  'uncontrolled',
  'wall',
  'hospitalized',
  'hotel',
  'shafts',
  'Zabul',
  'deteriorates',
  'cancers',
  'Tito',
  'al-Maliky',
  'ringleader',
  'N.J.',
  'Arabiya',


In [34]:
words.append("ENDPAD")
num_words = len(words)
num_words

35178

In [35]:
word2idx = create_mappings(words)
word2idx

35178it [00:00, 1798479.11it/s]


{'rampage': 1,
 '405': 2,
 'pouring': 3,
 'appear': 4,
 'Casamance': 5,
 'incensed': 6,
 'Alzouma': 7,
 'Dayton': 8,
 'snag': 9,
 'Bernanke': 10,
 'grounds': 11,
 'allot': 12,
 'outperformed': 13,
 'Xinhua': 14,
 'boastful': 15,
 'Sarawak': 16,
 'estimated': 17,
 'apprehended': 18,
 'Pondicherry': 19,
 'Carreno': 20,
 'assassins': 21,
 'exemption': 22,
 'realizes': 23,
 'Teir': 24,
 'Kirkuk': 25,
 'sketch': 26,
 'caverns': 27,
 'Scilingo': 28,
 'grip': 29,
 'gridlock': 30,
 'fatigues': 31,
 'Salafist': 32,
 'good-bye': 33,
 'Sadat': 34,
 'war-damaged': 35,
 'involving': 36,
 'Congressman': 37,
 'Cleric': 38,
 'Dalian': 39,
 'notional': 40,
 'Anabel': 41,
 'relied': 42,
 'tamed': 43,
 'Dobrynska': 44,
 'Payne': 45,
 'behaviour': 46,
 'go-betweens': 47,
 'sleeve': 48,
 'arthritis': 49,
 'westward': 50,
 'Blanco': 51,
 'Saparmurat': 52,
 'overburdened': 53,
 'pontificate': 54,
 'Caspian': 55,
 'constraints': 56,
 '170-meter': 57,
 'Fadilah': 58,
 'supervising': 59,
 'Si': 60,
 'uncontroll

In [36]:
num_ner_tags, ner_tags = get_stats(data, "NER Tag")
num_ner_tags, ner_tags 

Unique NER Tag in corpus: 18


(17,
 ['B-geo',
  'I-tim',
  'B-eve',
  'I-gpe',
  'I-geo',
  'I-art',
  'O',
  'I-eve',
  'B-org',
  'B-art',
  'I-nat',
  'B-tim',
  'I-org',
  'B-per',
  'B-nat',
  'B-gpe',
  'I-per'])

In [37]:
tag2idx = create_mappings(ner_tags)
tag2idx

17it [00:00, 236103.21it/s]


{'B-geo': 1,
 'I-tim': 2,
 'B-eve': 3,
 'I-gpe': 4,
 'I-geo': 5,
 'I-art': 6,
 'O': 7,
 'I-eve': 8,
 'B-org': 9,
 'B-art': 10,
 'I-nat': 11,
 'B-tim': 12,
 'I-org': 13,
 'B-per': 14,
 'B-nat': 15,
 'B-gpe': 16,
 'I-per': 17}

In [38]:
def sentence_integrate(df: pd.DataFrame, sentence_col_name: str, word_col_name: str, ner_col_name: str):
    """Map word within the sentence to the associated ner tag
    
    Parameters
    ----------
    df: `pd.DataFrame`
        DataFrame containing sentences with consecutive indices and starting with "Index" = 1
    sentence_col_name
    word_col_name
    ner_col_name

    Returns
    -------
    sentences: `list`
        2D list [# sentences, pairing ((word, ner tag))]
    """
    sentences = []
    sentence_info = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Sentences"):
        if row[sentence_col_name] == 1:
            if len(sentence_info) > 0:  # If there are words in the sentence
                sentences.append(sentence_info)

                sentence_info = []  # Reset for the next sentence
            
        word = row[word_col_name]
        ner_tag = row[ner_col_name]
        sentence_info.append((word, ner_tag))
        
    # Process the last sentence
    if sentence_info:
        sentences.append(sentence_info)

    return sentences


In [39]:
new_sentence_col_name = 'New Sentence Index'
word_col_name = 'Word'
ner_tag_col_name = 'NER Tag'
sentences = sentence_integrate(data, new_sentence_col_name, word_col_name, ner_tag_col_name)


Processing Sentences:   0%|          | 0/1096533 [00:00<?, ?it/s]

Processing Sentences: 100%|██████████| 1096533/1096533 [00:47<00:00, 23133.92it/s]


In [40]:
len(sentences)

47959

In [41]:
sentences

[[('Thousands', 'O'),
  ('of', 'O'),
  ('demonstrators', 'O'),
  ('have', 'O'),
  ('marched', 'O'),
  ('through', 'O'),
  ('London', 'B-geo'),
  ('to', 'O'),
  ('protest', 'O'),
  ('the', 'O'),
  ('war', 'O'),
  ('in', 'O'),
  ('Iraq', 'B-geo'),
  ('and', 'O'),
  ('demand', 'O'),
  ('the', 'O'),
  ('withdrawal', 'O'),
  ('of', 'O'),
  ('British', 'B-gpe'),
  ('troops', 'O'),
  ('from', 'O'),
  ('that', 'O'),
  ('country', 'O'),
  ('.', 'O'),
  ('', '')],
 [('Families', 'O'),
  ('of', 'O'),
  ('soldiers', 'O'),
  ('killed', 'O'),
  ('in', 'O'),
  ('the', 'O'),
  ('conflict', 'O'),
  ('joined', 'O'),
  ('the', 'O'),
  ('protesters', 'O'),
  ('who', 'O'),
  ('carried', 'O'),
  ('banners', 'O'),
  ('with', 'O'),
  ('such', 'O'),
  ('slogans', 'O'),
  ('as', 'O'),
  ('"', 'O'),
  ('Bush', 'B-per'),
  ('Number', 'O'),
  ('One', 'O'),
  ('Terrorist', 'O'),
  ('"', 'O'),
  ('and', 'O'),
  ('"', 'O'),
  ('Stop', 'O'),
  ('the', 'O'),
  ('Bombings', 'O'),
  ('.', 'O'),
  ('"', 'O'),
  ('', '')],

In [42]:
sentences[0]

[('Thousands', 'O'),
 ('of', 'O'),
 ('demonstrators', 'O'),
 ('have', 'O'),
 ('marched', 'O'),
 ('through', 'O'),
 ('London', 'B-geo'),
 ('to', 'O'),
 ('protest', 'O'),
 ('the', 'O'),
 ('war', 'O'),
 ('in', 'O'),
 ('Iraq', 'B-geo'),
 ('and', 'O'),
 ('demand', 'O'),
 ('the', 'O'),
 ('withdrawal', 'O'),
 ('of', 'O'),
 ('British', 'B-gpe'),
 ('troops', 'O'),
 ('from', 'O'),
 ('that', 'O'),
 ('country', 'O'),
 ('.', 'O'),
 ('', '')]

In [43]:
sentences[1]

[('Families', 'O'),
 ('of', 'O'),
 ('soldiers', 'O'),
 ('killed', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('conflict', 'O'),
 ('joined', 'O'),
 ('the', 'O'),
 ('protesters', 'O'),
 ('who', 'O'),
 ('carried', 'O'),
 ('banners', 'O'),
 ('with', 'O'),
 ('such', 'O'),
 ('slogans', 'O'),
 ('as', 'O'),
 ('"', 'O'),
 ('Bush', 'B-per'),
 ('Number', 'O'),
 ('One', 'O'),
 ('Terrorist', 'O'),
 ('"', 'O'),
 ('and', 'O'),
 ('"', 'O'),
 ('Stop', 'O'),
 ('the', 'O'),
 ('Bombings', 'O'),
 ('.', 'O'),
 ('"', 'O'),
 ('', '')]

In [44]:
def pad_sequences(sentences, word_idx, ner_tag_idx, num_words):
    """Pad sequences of sentences to the length of the longest sequence
    """

    X = []
    y = []
    max_len = max(len(sentence) for sentence in sentences) - 1
    print(max_len)
    # max_len = 50

    for sentence in tqdm(sentences):
        word_indices = []
        ner_tag_indices = []
        for word in sentence:
            idx_of_word = word_idx.get(word[0], num_words - 1)
            word_indices.append(idx_of_word)

            idx_of_ner_tag = ner_tag_idx.get(word[1], ner_tag_idx["O"])
            ner_tag_indices.append(idx_of_ner_tag)

        # Padding to the length of the longest sequence
        padded_word_indices = word_indices + [num_words - 1] * (max_len - len(word_indices))
        padded_ner_tag_indices = ner_tag_indices + [ner_tag_idx["O"]] * (max_len - len(ner_tag_indices))

        X.append(torch.tensor(padded_word_indices))
        y.append(torch.tensor(padded_ner_tag_indices))

    X_padded = pad_sequence(X, batch_first=True, padding_value=num_words - 1)
    y_padded = pad_sequence(y, batch_first=True, padding_value=ner_tag_idx["O"])
    
    return X_padded, y_padded


In [45]:
# Example usage:
X_padded, y_padded = pad_sequences(sentences, word2idx, tag2idx, num_words)


104


100%|██████████| 47959/47959 [00:02<00:00, 21202.35it/s]


In [46]:
X_padded

tensor([[34083, 11431, 23012,  ..., 35177, 35177, 35177],
        [29690, 11431, 18666,  ..., 35177, 35177, 35177],
        [29233, 34823, 29919,  ..., 35177, 35177, 35177],
        ...,
        [20640, 15242, 32800,  ..., 35177, 35177, 35177],
        [29233, 11419, 16632,  ..., 35177, 35177, 35177],
        [ 4164, 11120, 15633,  ..., 35177, 35177, 35177]])

In [47]:
y_padded

tensor([[ 7,  7,  7,  ...,  7,  7,  7],
        [ 7,  7,  7,  ...,  7,  7,  7],
        [ 7,  7,  7,  ...,  7,  7,  7],
        ...,
        [ 7,  7,  7,  ...,  7,  7,  7],
        [ 7,  7,  7,  ...,  7,  7,  7],
        [16,  7,  7,  ...,  7,  7,  7]])

# Outline of Tasks

1. Simple Bidirectional LSTM model
2. Using GloVe word embeddings
3. BONUS: LSTM-CNN model


# 1. Simple Bidirectional LSTM model

- LSTM embedding should be per sentence
- I currently have each embedding per word

In [70]:
def split_train_test_data(X, y, train_size: float):
    test_size = 1 - train_size
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    x_train_tensor = torch.tensor(x_train)
    x_test_tensor = torch.tensor(x_test)

    y_train_tensor = torch.tensor(y_train)
    y_test_tensor = torch.tensor(y_test)

    return x_train_tensor, x_test_tensor, y_train_tensor, y_test_tensor


In [71]:
train_size = 0.8
x_train, x_test, y_train, y_test = split_train_test_data(X_padded, y_padded, train_size)

  x_train_tensor = torch.tensor(x_train)
  x_test_tensor = torch.tensor(x_test)
  y_train_tensor = torch.tensor(y_train)
  y_test_tensor = torch.tensor(y_test)


In [72]:
len(x_train), len(x_test), len(y_train), len(y_test)

(38367, 9592, 38367, 9592)

## Define BLSTM

In [None]:
# Define your model
class BiLSTM(nn.Module):
    def __name__(self):
        return "BiLSTM"
    
    def __init__(self, num_words, embedding_dim, hidden_dim, output_dim, dropout):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(num_words, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out)
        return out

In [74]:
embedding_dim = 100
hidden_dim = 256
output_dim = 105
dropout = 0.33

model = BiLSTM(num_words, embedding_dim, hidden_dim, output_dim, dropout)

In [75]:
# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
batch_size = 32 

# Initialize DataLoader for batching
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [77]:
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    for batch_x, batch_y in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(x_test)
        val_loss = criterion(val_outputs, y_test)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

100%|██████████| 1199/1199 [02:07<00:00,  9.42it/s]


Epoch [1/1], Loss: 0.1777, Val Loss: 0.1746


In [65]:
# Evaluate the model on test data
with torch.no_grad():
    outputs = model(x_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)
    print('Test Accuracy: {:.7f}%'.format(accuracy))

# Randomly select a sentence for prediction
i = np.random.randint(0, len(x_test))
print("This is sentence:", i)
with torch.no_grad():
    output = model(x_test_tensor[i].unsqueeze(0))
    _, predicted = torch.max(output, 2)
    predicted = predicted.squeeze().numpy()

Test Accuracy: 101.6607590%
This is sentence: 4069


In [66]:
print("{:7}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" * 30)
for w, true, pred in zip(x_test[i], y_test[i], predicted):
    print("{:15}{}\t{}".format(words[w], ner_tags[true], ner_tags[pred]))

Word   True 	 Pred

------------------------------
anti-Bush      I-eve	B-art
Jiang          I-eve	I-eve
64th           I-eve	I-eve


IndexError: list index out of range