## <center> ID5002W:Industrial AI Laboratory
### <center> Week-12 (part-2)
### <center> Recurrent Neural Networks

**Contents:** 
1. Data preprocessing
2. Creating tensor dataset
3. Defining and training RNN model 
4. Test dataset performance evaluation


In [None]:
# !pip install torchmetrics

In [2]:
# Importing necessary libraries
import matplotlib.pyplot as plt # For data visulization
import pandas as pd # For data import and manipulation
import numpy as np # For linear algebra
from sklearn.model_selection import train_test_split # For training and validating

import torch
import torch.optim as optim # To choose the optimizer
from torch.autograd import Variable # To define the hidden state in the RNN to be a variable
import torch.nn as nn # To build the neural network layers
import torch.nn.functional as F # To use non-linear function (Eg: Relu)
from torch.utils.data import DataLoader, TensorDataset
from torchmetrics import Accuracy

# Data pre-processing

# Data source: https://www.kaggle.com/datasets/namanj27/ner-dataset

In [3]:
# Loading the dataset
df = pd.read_csv('ner_datasetreference.csv',encoding='latin1')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
n_sentences = np.sum(df['Sentence #'].isna()==0) # total number of sentences
idx = np.where(df['Sentence #'].isna().values==0) # index of first word in a sentence
# Sentences is a list of lists (inner list will be the words in a sentence; outer list will be all the sentences)
# entities is a list of lists (inner list will be the tags in a sentence; outer list will be all the sentences)
sentences=[]
entities=[]
for i,j in enumerate(idx[0]):
  if j==idx[0][-1]:
    sentence = [k for k in df['Word'][j:]]
    entity = [k for k in df['Tag'][j:]]
  else:
    sentence = [k for k in df['Word'][j:idx[0][i+1]]]
    entity = [k for k in df['Tag'][j:idx[0][i+1]]]
  sentences.append(sentence)
  entities.append(entity)

In [9]:
# Creating a dictionary that carry word to index and index to word info
uniq_words = np.sort(df['Word'].unique()) # all unique words
uniq_words = np.concatenate((uniq_words,np.array(['end_pad']))) # appending with end tag

word_2_idx = {j:i for i,j in enumerate(uniq_words)}
idx_2_word = {i:j for i,j in enumerate(uniq_words)}

In [10]:
# Creating a dictionary that carry entity to index and index to entity info
uniq_tags = np.sort(df['Tag'].unique()) # all unique tags
uniq_tags = np.concatenate((uniq_tags,np.array(['end_pad']))) # appeding with end tag

tag_2_idx = {j:i for i,j in enumerate(uniq_tags)}
idx_2_tag = {i:j for i,j in enumerate(uniq_tags)}

In [17]:
# replacing the words and tags with the indices
X,y = [],[]
for sent,ent in zip(sentences,entities):
  item1 = [word_2_idx[i] for i in sent]
  item2 = [tag_2_idx[i] for i in ent]
  X.append(item1)
  y.append(item2)

In [20]:
X = [torch.tensor(i) for i in X] # Converting the list to tensor
X = nn.utils.rnn.pad_sequence(X,batch_first =True, padding_value=word_2_idx['end_pad']) # Padding all the sentences with end pad
print(X.shape)

torch.Size([47959, 104])


In [22]:
y = [torch.tensor(i) for i in y] # Converting the list to tensor
y = nn.utils.rnn.pad_sequence(y,batch_first =True, padding_value=tag_2_idx['end_pad']) # Padding all the sentences with end pad
print(y.shape)

torch.Size([47959, 104])


# Summary of data preprocessing

1. Created a variable 'X' of shape --> (number of sentences, max_length)
2. Created a variable 'y' of shape --> (number of sentences, max_length, number of entities)
3. Created variables 'word_2_idx' and 'idx_2_word' that maps word to index and index to word respectively
4. Created variables 'tag_2_idx' and 'idx_2_tag' that maps entities to index and index to entities respectively

# Creating tensor dataset

In [23]:
# train test split
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.2,random_state = 42) 

In [24]:
# Pytorch train and test sets
train = TensorDataset(Xtrain,ytrain)
test = TensorDataset(Xtest,ytest)

batch_size =16

# data loader
train_loader = DataLoader(train, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test, batch_size = batch_size, shuffle = True)

# Defining and training RNN model 

In [25]:
# Defining the model
class RNN_model(nn.Module):
  def __init__(self,n_words,embedding_size,padding_idx,output_size,hidden_dim,n_rnn_layers):
    super().__init__()
    # RNN layers
    self.emb_1 = nn.Embedding(num_embeddings=n_words, embedding_dim=embedding_size, padding_idx=padding_idx)
    self.rnn_2 = nn.RNN(input_size=embedding_size, hidden_size=hidden_dim, num_layers=n_rnn_layers, nonlinearity='relu',batch_first=True) 
    self.lin_3 = nn.Linear(hidden_dim, output_size)
    
  def forward(self, x):
    output = self.emb_1(x)
    output, hs = self.rnn_2(output)
    output = self.lin_3(output)
    return output

In [26]:
# number of words
n_words = len(word_2_idx.keys())
embedding_size = 50
padding_idx = word_2_idx['end_pad']
output_size = 18
hidden_dim = 50
n_rnn_layers =2

# Check if GPU is available so that the model can be trained on a GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initializing the model
model = RNN_model(n_words,embedding_size,padding_idx,output_size,hidden_dim,n_rnn_layers)

model.to(device)

RNN_model(
  (emb_1): Embedding(35179, 50, padding_idx=35178)
  (rnn_2): RNN(50, 50, num_layers=2, batch_first=True)
  (lin_3): Linear(in_features=50, out_features=18, bias=True)
)

In [27]:
# defining the loss and the optimizer
loss_metric = nn.CrossEntropyLoss(ignore_index=padding_idx)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [28]:
# Training the model
for epoch in range(5): # Number of epochs
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device) # to gpu
        optimizer.zero_grad() # zero the parameter gradients
        outputs = model(inputs) # Forward pass
        loss = loss_metric(outputs.view(-1,output_size), labels.view(-1)) # Cross entropy loss
        loss.backward() # backward pass
        optimizer.step() # Optimizing the parameters
        running_loss += loss.item() # calculating the loss
    print(f'For epoch {epoch+1}: The loss is:{running_loss/i}')
    running_loss = 0.0

For epoch 1: The loss is:0.11399002670526853
For epoch 2: The loss is:0.0407255743737078
For epoch 3: The loss is:0.03147547689651543
For epoch 4: The loss is:0.02677877865456632
For epoch 5: The loss is:0.02379323736308737


# Test dataset performance evaluation

In [31]:
accuracy = Accuracy(task='multiclass', num_classes=18,ignore_index=tag_2_idx['end_pad']).to(device)
# Test accuracy
acc= []
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        acc_metric = accuracy(outputs.view(-1,output_size), labels.view(-1))
        acc.append(acc_metric)
print('The accuracy of the model in test dataset is',torch.mean(torch.tensor(acc)).item())

The accuracy of the model in test dataset is 0.9567702412605286
