<a href="https://colab.research.google.com/github/DelmiroDaladier/NLP-studies/blob/master/POS_Tagging%20(Pytorch)/pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

import nltk
import string
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f1a82f10f60>

In [2]:
# Download the NLTK 'treebank' package; the notebook reads it via `treebank.tagged_sents()`.
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [3]:
def prepare_sequence(seq, to_ix):
    """Encode the items of ``seq`` as a 1-D ``torch.long`` tensor of indices.

    Args:
        seq: iterable of items (words or tags) to encode.
        to_ix: mapping from item to integer index.

    Returns:
        A ``torch.long`` tensor holding ``to_ix[item]`` for every item, in order.

    Raises:
        KeyError: if ``to_ix`` is a dict and an item of ``seq`` is missing from it.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)

In [9]:
# Build the lookup tables used to encode the NLTK treebank corpus:
#   word_to_index: word -> integer id (assigned in order of first appearance)
#   tag_to_index:  tag  -> integer id (assigned in order of first appearance)
#   index_to_tag:  integer id -> tag  (inverse of tag_to_index)
# Words and tags that are a single punctuation character are left out.
word_to_index = {}
tag_to_index = {}
index_to_tag = {}
tagged_sentences = treebank.tagged_sents()
punctuation = string.punctuation
# `token in punctuation` on the raw string is a *substring* test, so it would
# also match '' and character runs such as '()' or './'. A set of the
# individual characters gives exact whole-token membership instead.
punctuation_marks = frozenset(punctuation)

for sentence in tagged_sentences:
  for word, tag in sentence:
    if word not in word_to_index and word not in punctuation_marks:
      word_to_index[word] = len(word_to_index)
    if tag not in tag_to_index and tag not in punctuation_marks:
      tag_index = len(tag_to_index)
      tag_to_index[tag] = tag_index
      index_to_tag[tag_index] = tag

In [5]:
# Turn every tagged sentence into a (words, tags) pair of parallel lists,
# keeping only the tokens whose word AND tag both made it into the lookup
# tables. A sentence with no surviving token yields a pair of empty lists.
data = []
for sentence in tagged_sentences:
  known_pairs = [
      (word, tag)
      for word, tag in sentence
      if word in word_to_index and tag in tag_to_index
  ]
  sentence_words = [word for word, _ in known_pairs]
  sentence_tags = [tag for _, tag in known_pairs]
  data.append((sentence_words, sentence_tags))

In [6]:
def _has_tokens(pair):
  """Return True when both the word list and the tag list of `pair` are non-empty."""
  words, tags = pair
  return len(words) > 0 and len(tags) > 0


# The first 2000 sentences are used for training, the remainder for testing.
# Pairs with an empty word or tag list are dropped from both splits.
train_data = list(filter(_has_tokens, data[:2000]))

test_data = list(filter(_has_tokens, data[2000:]))

In [7]:
class PosTagger(nn.Module):
  """Tagger made of an embedding layer, an LSTM and a linear output layer.

  Args:
    embedding_dim: size of each word embedding vector.
    hidden_dim: size of the LSTM hidden state.
    vocab_size: number of rows in the embedding table.
    target_size: number of output scores produced per word.
  """

  def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
    super().__init__()
    self.hidden_dim = hidden_dim
    # Layers are created in this order: embedding, LSTM, linear projection.
    self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim)
    self.hidden2tag = nn.Linear(hidden_dim, target_size)

  def forward(self, sentence):
    """Return log-softmax tag scores, one row per element of `sentence`.

    Args:
      sentence: tensor of word indices (the notebook passes the 1-D output
        of `prepare_sequence`).

    Returns:
      Tensor with `len(sentence)` rows, log-softmax normalised along dim 1.
    """
    seq_len = len(sentence)
    embeds = self.word_embeddings(sentence)
    # Insert a middle dimension of size 1 before handing the sequence to the LSTM.
    lstm_in = embeds.view(seq_len, 1, -1)
    lstm_out, _ = self.lstm(lstm_in)
    # Flatten back to one row per sentence position for the linear layer.
    flat_out = lstm_out.view(seq_len, -1)
    tag_space = self.hidden2tag(flat_out)
    return F.log_softmax(tag_space, dim=1)

In [None]:
# Model hyperparameters.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

vocab_size, tagset_size = len(word_to_index), len(tag_to_index)
model = PosTagger(EMBEDDING_DIM, HIDDEN_DIM, vocab_size, tagset_size)
# The model emits log-softmax scores, which NLLLoss consumes directly.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# One SGD update per training sentence, for 2000 passes over train_data.
for epoch in range(2000):
  print(f'Epoch:{epoch}')
  for sentence, tags in train_data:
    # Clear the gradients left over from the previous sentence.
    model.zero_grad()

    tag_scores = model(prepare_sequence(sentence, word_to_index))
    loss = loss_function(tag_scores, prepare_sequence(tags, tag_to_index))

    loss.backward()
    optimizer.step()


Epoch:0
Epoch:1
Epoch:2
Epoch:3
Epoch:4
Epoch:5
Epoch:6
Epoch:7
Epoch:8
Epoch:9


In [None]:
# Print the predicted tag next to the real tag for every word of the first
# test sentence. Gradients are not needed for inference.
with torch.no_grad():
  sample_words, sample_tags = test_data[0]
  inputs = prepare_sequence(sample_words, word_to_index)
  tag_scores = model(inputs)
  # Highest-scoring tag index for each row (one row per word).
  predicted_indices = tag_scores.argmax(dim=1).tolist()
  # Iterate predictions and real tags in lockstep. The loop variable is no
  # longer called `id`: at notebook scope that name would stay bound to an int
  # and shadow the builtin `id()` in every later cell.
  for predicted_index, real_tag in zip(predicted_indices, sample_tags):
    print(f'Predicted tag:{index_to_tag[predicted_index]}')
    print(f'Real tag:{real_tag}')

In [34]:
# Show the first tag of the first (words, tags) pair in test_data.
test_data[0][1][0]

'DT'

tensor(6)
tensor(0)
tensor(7)
tensor(24)
tensor(0)
tensor(3)
tensor(14)
tensor(12)
tensor(14)
tensor(16)
tensor(1)
tensor(1)
tensor(7)
tensor(8)
tensor(7)
tensor(2)
tensor(11)
tensor(7)
tensor(0)
tensor(6)
tensor(6)
tensor(7)
tensor(13)
tensor(20)
tensor(8)
tensor(2)
tensor(8)
tensor(0)
tensor(10)
tensor(6)
tensor(9)
tensor(13)
tensor(8)
tensor(6)
tensor(0)
tensor(8)
tensor(2)


In [23]:
# Show how many words the first (words, tags) pair in test_data contains.
len(test_data[0][0])

37