<a href="https://colab.research.google.com/github/AnirudhVIyer/NLP/blob/main/POS_lstm_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create an LSTM-based sequence-labelling model for part-of-speech (POS) tagging.

We will use pre-trained GloVe embeddings with dimension = 100.

In [1]:
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as Func # All functions that don't have any parameters
from torch.utils.data import DataLoader # Gives easier dataset managment and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice and easy way
import torchvision.transforms as transforms # Transformations we can perform

In [3]:
# Download the GloVe 6B archive, unpack it into the current working directory,
# and list the directory contents to confirm the files are present.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!ls -lat

--2023-02-04 15:58:54--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-02-04 15:58:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-02-04 15:58:55--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [4]:
# Parse the 100-dimensional GloVe text file into two parallel lists:
# vocab[i] is the i-th word and embeddings[i] is its vector as a list of floats.
vocab, embeddings = [], []
# An explicit encoding keeps parsing independent of the platform's default
# locale encoding; iterating over the file avoids holding the whole text in memory.
with open('glove.6B.100d.txt', 'rt', encoding='utf-8') as fi:
    for line in fi:
        line = line.rstrip('\n')
        if not line:
            # Skip blank lines so they cannot create an empty, ragged row.
            continue
        # Each record is "<word> <v1> <v2> ... <vN>", separated by single spaces.
        i_word, *i_values = line.split(' ')
        vocab.append(i_word)
        embeddings.append([float(val) for val in i_values])

In [5]:
import numpy as np

# Turn the parsed GloVe lists into arrays.
vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)

# Put the special tokens at the front of the vocabulary:
# index 0 -> '<pad>', index 1 -> '<unk>'.
vocab_npa = np.insert(vocab_npa, 0, ['<pad>', '<unk>'])
print(vocab_npa[:10])

# '<pad>' gets the all-zero vector; '<unk>' gets the mean of all pre-trained vectors.
pad_emb_npa = np.zeros((1, embs_npa.shape[1]))
unk_emb_npa = embs_npa.mean(axis=0, keepdims=True)

# Stack the two special rows on top so row i still corresponds to vocab_npa[i].
embs_npa = np.vstack([pad_emb_npa, unk_emb_npa, embs_npa])
print(embs_npa.shape)

['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']
(400002, 100)


In [None]:
# Write the augmented vocabulary and embedding matrix to .npy files
# in the current working directory.
np.save('vocab_np_100.npy', vocab_npa)
np.save('embeddings_np_100.npy', embs_npa)

In [2]:
# Mount Google Drive at /content/drive (this import is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the saved .npy files from the Colab working directory into Google Drive
# (-a preserves attributes, -v prints each copy).
!cp -av '/content/embeddings_np_100.npy' '/content/drive/MyDrive'
!cp -av '/content/vocab_np_100.npy' '/content/drive/MyDrive'

'/content/embeddings_np_100.npy' -> '/content/drive/MyDrive/embeddings_np_100.npy'
'/content/vocab_np_100.npy' -> '/content/drive/MyDrive/vocab_np_100.npy'


In [6]:
import torch

# Wrap the GloVe matrix (with the '<pad>' and '<unk>' rows on top) in an
# nn.Embedding; the float64 NumPy array is cast to float32 first.
glove_weights = torch.from_numpy(embs_npa).to(torch.float32)
my_embedding_layer = torch.nn.Embedding.from_pretrained(glove_weights)

# Sanity check: one embedding row per vocabulary entry, same vector width.
assert tuple(my_embedding_layer.weight.shape) == embs_npa.shape
print(my_embedding_layer.weight.shape)

torch.Size([400002, 100])


Here all unknown words share the same embedding in the pre-trained layer.
This might affect the POS tags for words that are not present in the GloVe vocabulary.

Later, train an embedding layer to check the effect.

In [7]:
# NOTE(review): this cell repeats, statement for statement, the embedding-layer
# construction already performed in the cell executed as In [6]; running it
# simply rebuilds `my_embedding_layer` from the same `embs_npa` matrix.
import torch
my_embedding_layer = torch.nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float())

# Sanity check: the layer's weight has the same (rows, dim) shape as the NumPy matrix.
assert my_embedding_layer.weight.shape == embs_npa.shape
print(my_embedding_layer.weight.shape)

torch.Size([400002, 100])


In [8]:
import pandas as pd

# Load the token-level dataset from Google Drive.
df = pd.read_csv('/content/drive/MyDrive/ner_dataset.csv', encoding='latin-1')
print(type(df))

# Forward-fill missing values so each row takes the sentence name from its
# predecessor. Note that ffill applies to every column, not only "Sentence #".
df = df.ffill(axis=0)

# Unique POS tags, collected before the rows are grouped into sentences.
unique_POS = df['POS'].unique()

# Collapse to one row per sentence; every remaining column becomes a list
# holding that sentence's per-token values.
df_new = df.groupby("Sentence #").agg(list)
df_new.head()


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,Word,POS,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]"
Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."


In [9]:
# Display 10 randomly chosen sentences; no random_state is passed, so the
# selection differs from run to run.
df_new.sample(10)

Unnamed: 0_level_0,Word,POS,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sentence: 46576,"[U.S., Defense, Secretary, Donald, Rumsfeld, s...","[NNP, NNP, NNP, NNP, NNP, VBD, NNP, DT, NNP, N...","[B-org, I-org, B-per, I-per, I-per, O, B-tim, ..."
Sentence: 12745,"[Church, and, state, leaders, have, joined, or...","[NN, CC, NN, NNS, VBP, VBN, JJ, NNS, IN, NNP, ...","[O, O, O, O, O, O, O, O, O, B-geo, O, O, B-geo..."
Sentence: 2615,"[The, Israeli, army, says, troops, raided, the...","[DT, JJ, NN, VBZ, NNS, VBD, DT, NNP, NNP, NN, ...","[O, B-gpe, O, O, O, O, O, B-geo, I-geo, O, O, ..."
Sentence: 22551,"[Australia, is, making, its, first, World, Cup...","[NNP, VBZ, VBG, PRP$, JJ, NNP, NNP, NN, IN, CD...","[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
Sentence: 37134,"[They, crossed, the, Kowal, with, a, Muskie, a...","[PRP, VBD, DT, NNP, IN, DT, NN, CC, VBD, PRP, ...","[O, O, O, B-org, O, O, B-org, O, O, O, O, B-ge..."
Sentence: 32058,"[The, president, 's, action, followed, a, parl...","[DT, NN, POS, NN, VBD, DT, JJ, NN, VBG, NNP, N...","[O, O, O, O, O, O, O, O, O, B-per, I-per, O, O..."
Sentence: 40779,"[Hurricane, Dean, struck, the, island, in, Aug...","[NN, NNP, VBD, DT, NN, IN, NNP, CD, VBG, NNS, ...","[B-eve, I-eve, O, O, O, O, B-tim, I-tim, O, O,..."
Sentence: 40370,"[Al-Manar, headquarters, in, southern, Lebanon...","[NNP, NN, IN, JJ, NNP, VBP, VBN, VBN, IN, DT, ...","[B-org, O, O, B-geo, I-geo, O, O, O, O, O, O, ..."
Sentence: 25131,"[Earlier, U.S., officials, said, Washington, p...","[RBR, NNP, NNS, VBD, NNP, VBZ, TO, VB, NN, IN,...","[O, B-geo, O, O, B-geo, O, O, O, O, O, O, O, O..."
Sentence: 42883,"[President, Bush, has, asked, the, business, l...","[NNP, NNP, VBZ, VBN, DT, NN, NNS, TO, VB, NNP,...","[B-per, I-per, O, O, O, O, O, O, O, B-geo, O, ..."


In [10]:
# Keep only the token and POS-tag lists and replace the "Sentence #" index
# with a plain 0..n-1 index (returns a new frame instead of mutating in place).
datasets_new = df_new[['Word', 'POS']].reset_index(drop=True)

# One [words, pos_tags] pair per sentence, as plain Python lists.
data = datasets_new.values.tolist()
data[1]

[['Iranian',
  'officials',
  'say',
  'they',
  'expect',
  'to',
  'get',
  'access',
  'to',
  'sealed',
  'sensitive',
  'parts',
  'of',
  'the',
  'plant',
  'Wednesday',
  ',',
  'after',
  'an',
  'IAEA',
  'surveillance',
  'system',
  'begins',
  'functioning',
  '.'],
 ['JJ',
  'NNS',
  'VBP',
  'PRP',
  'VBP',
  'TO',
  'VB',
  'NN',
  'TO',
  'JJ',
  'JJ',
  'NNS',
  'IN',
  'DT',
  'NN',
  'NNP',
  ',',
  'IN',
  'DT',
  'NNP',
  'NN',
  'NN',
  'VBZ',
  'VBG',
  '.']]

In [11]:
## create a custom dataloader and dataset later
class TaggerNet(nn.Module):
    """LSTM tagger mapping a sequence of token indices to per-token tag log-probabilities.

    Args:
        embedding_dim: width of each word-embedding vector (also the LSTM input size).
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of distinct output tags.
        pre_embeds: accepted but not used; the embedding table is always a
            freshly initialised, trainable ``nn.Embedding``.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, pre_embeds):
        super().__init__()

        # Kept as attributes for later inspection.
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # Token index -> dense vector (learned from scratch; `pre_embeds` is ignored).
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # Sequence encoder.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)

        # Projects each LSTM output vector onto the tag set.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, sentence):
        """Return log-softmax tag scores with one row per element of ``sentence``."""
        seq_len = len(sentence)
        word_vectors = self.embeddings(sentence)

        # The LSTM is fed (seq_len, batch=1, features); the final state is not needed.
        lstm_output, _ = self.lstm(word_vectors.view(seq_len, 1, -1))

        # Drop the singleton batch axis before the linear projection.
        raw_scores = self.fc(lstm_output.view(seq_len, -1))

        return nn.functional.log_softmax(raw_scores, dim=1)

In [15]:

# def prepare_sequence(seq, to_ix):
#     idxs = [to_ix[w] for w in seq]
#     return torch.tensor(idxs, dtype=torch.long)

def prepare_sequence(seq, to_ix, unk_ix=0):
    """Convert a sequence of items (words or tags) into a 1-D LongTensor of indices.

    Args:
        seq: iterable of hashable items to look up.
        to_ix: dict mapping item -> integer index.
        unk_ix: index substituted for items missing from ``to_ix``. Defaults to 0,
            the fallback this function has always used. Index 0 is only a real
            "unknown" id if the caller reserved it as such in ``to_ix``.

    Returns:
        ``torch.Tensor`` of dtype ``torch.long`` with one index per item of ``seq``.

    Raises:
        TypeError: if an item is unhashable. Only missing keys fall back to
            ``unk_ix``; other errors are no longer silently swallowed.
    """
    idxs = [to_ix.get(w, unk_ix) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)



# The first 20,000 entries of `data` form the training set.
training_data = data[:20000]

# POS tag -> integer id, numbered in the order the tags appear in `unique_POS`.
tag_to_ix = {tag: ix for ix, tag in enumerate(unique_POS)}

# Word -> integer id, numbered in order of first appearance.
# NOTE: the vocabulary is built from all of `data`, not only `training_data`.
word_to_ix = {}
for sent, tags in data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))

# Model hyper-parameters.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256

In [16]:
# Spot-check the lookup tables: the index assigned to the word 'there'
# and the complete tag -> index mapping.
print(word_to_ix['there'])
print(tag_to_ix)


1107
{'NNS': 0, 'IN': 1, 'VBP': 2, 'VBN': 3, 'NNP': 4, 'TO': 5, 'VB': 6, 'DT': 7, 'NN': 8, 'CC': 9, 'JJ': 10, '.': 11, 'VBD': 12, 'WP': 13, '``': 14, 'CD': 15, 'PRP': 16, 'VBZ': 17, 'POS': 18, 'VBG': 19, 'RB': 20, ',': 21, 'WRB': 22, 'PRP$': 23, 'MD': 24, 'WDT': 25, 'JJR': 26, ':': 27, 'JJS': 28, 'WP$': 29, 'RP': 30, 'PDT': 31, 'NNPS': 32, 'EX': 33, 'RBS': 34, 'LRB': 35, 'RRB': 36, '$': 37, 'RBR': 38, ';': 39, 'UH': 40, 'FW': 41}


In [17]:
# Train on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# `my_embedding_layer` is passed as `pre_embeds`, but TaggerNet's constructor
# does not use that argument.
my_model = TaggerNet(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
    pre_embeds=my_embedding_layer,
)

# The model emits log-probabilities, so negative log-likelihood is the matching loss.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(my_model.parameters(), lr=0.1)

In [18]:
num_epochs = 5

# Make sure the parameters live on the selected device before training starts.
my_model.to(device)

for epoch in range(num_epochs):
    # Training mode for this epoch.
    my_model.train()

    # One optimisation step per sentence.
    for sentence, tags in training_data:
        # Reset gradients accumulated by the previous step.
        my_model.zero_grad()

        # Encode words and tags as index tensors on the training device.
        input_sentence = prepare_sequence(sentence, word_to_ix).to(device)
        targets = prepare_sequence(tags, tag_to_ix).to(device)

        # Forward pass and loss.
        tag_scores = my_model(input_sentence)
        loss = loss_function(tag_scores, targets)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

In [29]:
# Save only the learned parameters (the state_dict) to model_POS.pt in the working directory.
torch.save(my_model.state_dict(), 'model_POS.pt')

In [19]:
# Qualitative check on one sentence taken from outside the training slice data[0:20000].
N = 21100
my_model.eval()

inputs = prepare_sequence(data[N][0], word_to_ix).to(device)

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    tag_scores = my_model(inputs)

# Reverse lookup: tag index -> tag string (also used by later cells).
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

# Highest-scoring tag index for every token.
preds = tag_scores.argmax(dim=1).tolist()
correct = prepare_sequence(data[N][1], tag_to_ix)

original_sentence = data[N][0]
correct_tags = [ix_to_tag[c] for c in correct.tolist()]
predicted_tags = [ix_to_tag[p] for p in preds]

print('{:<15}|{:<15}|{:<15}\n'.format(*['Original', 'Correct', 'Predicted']))

for item in zip(original_sentence, correct_tags, predicted_tags):
    print('{:<15}|{:<15}|{:<15}'.format(*item))

Original       |Correct        |Predicted      

Competitive    |JJ             |NNP            
diving         |NN             |NNP            
is             |VBZ            |VBZ            
one            |CD             |CD             
of             |IN             |IN             
the            |DT             |DT             
sports         |NNS            |NNS            
guaranteed     |VBN            |NN             
to             |TO             |TO             
attract        |VB             |VB             
a              |DT             |DT             
large          |JJ             |JJ             
audience       |NN             |NN             
at             |IN             |IN             
the            |DT             |DT             
Summer         |NNP            |NN             
Olympics       |NNPS           |NNPS           
in             |IN             |IN             
Beijing        |NNP            |NNP            
.              |.              |.      

In [20]:
# Everything after the first 20,000 entries (the training slice) is held out for testing.
test_data = data[20000:]
len(test_data)

27959

# Create API

In [83]:
# my_model.eval()
# sent = 'My name is Anirudh Iyer, I live in India'.split(' ')
# inputs = prepare_sequence(sent, word_to_ix)

# inputs = inputs.to(device)

# tag_scores = my_model(inputs)

# ix_to_tag = dict((v, k) for k, v in tag_to_ix.items())

# preds = [torch.max(x, 0)[1].item() for x in tag_scores]
# correct = prepare_sequence(data[N][1], tag_to_ix)

# predicted_tags = [ix_to_tag[p] for p in preds]


# print(sent)
# print(correct)
# print(preds)
# print(predicted_tags)


def Tag_API(sentence):
    """Tag a raw sentence string with POS labels using the trained model.

    The sentence is split on whitespace only, so punctuation stays attached to
    its neighbouring word. Words that are missing from ``word_to_ix`` are mapped
    by ``prepare_sequence`` to its fallback index.

    Args:
        sentence: the text to tag.

    Returns:
        List of predicted tag strings, one per whitespace-separated token.
        An empty or all-whitespace sentence yields an empty list.
    """
    my_model.eval()

    # split() without arguments collapses runs of whitespace, so repeated,
    # leading or trailing spaces no longer produce empty-string tokens.
    sent = sentence.split()

    if sent:
        new_inp = prepare_sequence(sent, word_to_ix).to(device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            tag_scores = my_model(new_inp)
        preds = tag_scores.argmax(dim=1).tolist()
        predicted_tags = [ix_to_tag[p] for p in preds]
    else:
        # Nothing to tag; the model cannot process a zero-length sequence.
        predicted_tags = []

    print(sentence)
    print(predicted_tags)
    return predicted_tags


# Smoke test on a custom sentence. The sentence is split on whitespace only,
# so 'Hello,' is looked up with its trailing comma attached.
Tag_API("Hello, my name is Anirudh Iyer")


Hello, my name is Anirudh Iyer
['NNS', 'PRP$', 'NNP', 'VBZ', 'NNS', 'NNS']


['NNS', 'PRP$', 'NNP', 'VBZ', 'NNS', 'NNS']

In [45]:
# Show the index -> tag mapping used to decode model predictions.
print(ix_to_tag)

{0: 'NNS', 1: 'IN', 2: 'VBP', 3: 'VBN', 4: 'NNP', 5: 'TO', 6: 'VB', 7: 'DT', 8: 'NN', 9: 'CC', 10: 'JJ', 11: '.', 12: 'VBD', 13: 'WP', 14: '``', 15: 'CD', 16: 'PRP', 17: 'VBZ', 18: 'POS', 19: 'VBG', 20: 'RB', 21: ',', 22: 'WRB', 23: 'PRP$', 24: 'MD', 25: 'WDT', 26: 'JJR', 27: ':', 28: 'JJS', 29: 'WP$', 30: 'RP', 31: 'PDT', 32: 'NNPS', 33: 'EX', 34: 'RBS', 35: 'LRB', 36: 'RRB', 37: '$', 38: 'RBR', 39: ';', 40: 'UH', 41: 'FW'}


# Evaluate Test Data

In [76]:

def eval(my_model,test,device=device):
    """Compute token-level tagging accuracy of ``my_model`` on ``test``.

    NOTE: this function shadows Python's built-in ``eval``; the name is kept
    because existing cells call it.

    Args:
        my_model: model called on a 1-D index tensor and returning one row of
            tag scores per token.
        test: sequence of entries where ``entry[0]`` is the list of words and
            ``entry[1]`` the list of gold tag strings.
        device: device the input tensors are moved to. The default is the
            value of the global ``device`` at the time this function is defined.

    Returns:
        Tuple ``(acc, correct_pred, total_pred)``: the accuracy ratio, the number
        of correctly tagged tokens, and the number of tokens scored. ``acc`` is
        0.0 when no tokens were scored.
    """
    print(len(test))
    my_model.eval()
    correct_pred = 0
    total_pred = 0

    # Inference only: skip autograd bookkeeping for the whole pass.
    with torch.no_grad():
        for sent in test:
            if not sent[0]:
                # A sentence with no tokens contributes nothing to the counts.
                continue
            inputs = prepare_sequence(sent[0], word_to_ix).to(device)
            tag_scores = my_model(inputs)
            preds = tag_scores.argmax(dim=1).tolist()
            predicted_tags = [ix_to_tag[p] for p in preds]

            # Compare predicted tag strings with the gold tags position by position.
            correct_pred += sum(
                pred == gold for pred, gold in zip(predicted_tags, sent[1])
            )
            total_pred += len(predicted_tags)

    # Guard against an empty test set instead of raising ZeroDivisionError.
    acc = correct_pred / total_pred if total_pred else 0.0

    return acc, correct_pred, total_pred


# Score the held-out sentences, keeping the raw counts alongside the accuracy ratio.
acc, correct_pred, total_pred = eval(my_model,test_data)


    


27959


In [84]:
## Token-level accuracy on the held-out test data
acc

0.9359546306361253

**Note:** When tagging custom sentences, we have to take care of out-of-vocabulary (OOV) words.