<a href="https://colab.research.google.com/github/DelmiroDaladier/NLP-studies/blob/master/POS_Tagging%20(Pytorch)/pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

import nltk
import string
from nltk.corpus import brown

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global random number generator with a fixed value so that
# randomly initialised weights are repeatable from one run to the next.
torch.manual_seed(1)

<torch._C.Generator at 0x7f3337e36f60>

In [3]:
# Fetch the Brown corpus; the notebook later reads it via brown.tagged_sents().
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [4]:
def prepare_sequence(seq, to_ix):
    """Translate a sequence of symbols into a tensor of integer ids.

    Args:
        seq: iterable of keys (for example words or tags).
        to_ix: mapping from each key to its integer index.

    Returns:
        A 1-D ``torch.long`` tensor holding ``to_ix[item]`` for every item of
        ``seq``, in order.

    Raises:
        KeyError: if ``to_ix`` is a plain dict and an item of ``seq`` is missing.
    """
    lookup = to_ix.__getitem__
    encoded = list(map(lookup, seq))
    return torch.tensor(encoded, dtype=torch.long)

In [5]:
# Build the lookup tables that turn words and tags into integer ids
# (plus the reverse table for tags, used to decode predictions).
tagged_sentences = brown.tagged_sents()
punctuation = string.punctuation

word_to_index = {}
tag_to_index = {}
index_to_tag = {}

for tagged_sentence in tagged_sentences:
  for token, pos_tag in tagged_sentence:
    # NOTE(review): `punctuation` is a str, so `in` below is a substring test,
    # not a per-character set lookup. Confirm that this is the intended filter.
    if not (token in word_to_index or token in punctuation):
      word_to_index[token] = len(word_to_index)

    if pos_tag in tag_to_index or pos_tag in punctuation:
      continue
    # Ids are handed out in order of first appearance.
    next_tag_id = len(tag_to_index)
    tag_to_index[pos_tag] = next_tag_id
    index_to_tag[next_tag_id] = pos_tag

In [6]:
# Re-walk the corpus and keep, per sentence, only the (word, tag) pairs whose
# word and tag both received an id. Each entry of `data` is a
# (list_of_words, list_of_tags) tuple with the two lists aligned position by position.
data = []
for tagged_sentence in tagged_sentences:
  kept_pairs = [
      (token, pos_tag)
      for token, pos_tag in tagged_sentence
      if token in word_to_index and pos_tag in tag_to_index
  ]
  kept_words = [token for token, _ in kept_pairs]
  kept_tags = [pos_tag for _, pos_tag in kept_pairs]
  data.append((kept_words, kept_tags))

In [7]:
# The first 2000 sentences are used for training, everything after them for testing.
# Sentences with no words or no tags are dropped from both splits.
train_data = [pair for pair in data[:2000] if pair[0] and pair[1]]
test_data = [pair for pair in data[2000:] if pair[0] and pair[1]]

In [8]:
class PosTagger(nn.Module):
  """LSTM sequence tagger.

  Word ids are embedded, passed through a single LSTM, and each LSTM output
  is projected onto the tag set; the result is returned as log-probabilities.

  Args:
    embedding_dim: size of each word embedding vector.
    hidden_dim: size of the LSTM hidden state.
    vocab_size: number of rows in the embedding table.
    target_size: number of output tags.
  """

  def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
    super().__init__()
    self.hidden_dim = hidden_dim
    # Layers are created in the order embedding -> LSTM -> projection.
    self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim)
    self.hidden2tag = nn.Linear(hidden_dim, target_size)

  def forward(self, sentence):
    """Score every position of one sentence.

    Args:
      sentence: tensor of word indices; assumed to be 1-D (one sentence),
        since it is reshaped below into a sequence with batch size 1.

    Returns:
      Tensor with one row per position holding log-softmax scores over tags
      (taken along dim 1).
    """
    seq_len = len(sentence)
    # The LSTM receives (seq_len, batch=1, features).
    embedded = self.word_embeddings(sentence).view(seq_len, 1, -1)
    lstm_out, _ = self.lstm(embedded)
    # Collapse the batch axis again before the linear projection.
    tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
    return F.log_softmax(tag_space, dim=1)

In [None]:
# Model hyper-parameters.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

model = PosTagger(
    EMBEDDING_DIM,
    HIDDEN_DIM,
    vocab_size=len(word_to_index),
    target_size=len(tag_to_index),
)
# The tagger returns log-softmax scores, which is the input NLLLoss expects.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(2000):
  print(f'Epoch:{epoch}')
  # One SGD step per training sentence (no mini-batching).
  for words, gold_tags in train_data:
    # Clear gradients accumulated by the previous step.
    model.zero_grad()

    sentence_in = prepare_sequence(words, word_to_index)
    targets = prepare_sequence(gold_tags, tag_to_index)

    loss = loss_function(model(sentence_in), targets)
    loss.backward()
    optimizer.step()


Epoch:0
Epoch:1
Epoch:2
Epoch:3
Epoch:4
Epoch:5
Epoch:6
Epoch:7
Epoch:8
Epoch:9
Epoch:10
Epoch:11
Epoch:12
Epoch:13
Epoch:14
Epoch:15
Epoch:16
Epoch:17
Epoch:18
Epoch:19
Epoch:20
Epoch:21
Epoch:22
Epoch:23
Epoch:24
Epoch:25
Epoch:26
Epoch:27
Epoch:28
Epoch:29
Epoch:30
Epoch:31
Epoch:32
Epoch:33
Epoch:34
Epoch:35
Epoch:36
Epoch:37
Epoch:38
Epoch:39
Epoch:40
Epoch:41
Epoch:42
Epoch:43
Epoch:44
Epoch:45
Epoch:46
Epoch:47
Epoch:48
Epoch:49
Epoch:50
Epoch:51
Epoch:52
Epoch:53
Epoch:54
Epoch:55
Epoch:56
Epoch:57
Epoch:58
Epoch:59
Epoch:60
Epoch:61
Epoch:62
Epoch:63
Epoch:64
Epoch:65
Epoch:66
Epoch:67
Epoch:68
Epoch:69
Epoch:70
Epoch:71
Epoch:72
Epoch:73
Epoch:74
Epoch:75
Epoch:76
Epoch:77
Epoch:78
Epoch:79
Epoch:80
Epoch:81
Epoch:82
Epoch:83
Epoch:84
Epoch:85
Epoch:86
Epoch:87
Epoch:88
Epoch:89
Epoch:90
Epoch:91
Epoch:92
Epoch:93
Epoch:94
Epoch:95
Epoch:96
Epoch:97
Epoch:98
Epoch:99
Epoch:100
Epoch:101
Epoch:102
Epoch:103
Epoch:104
Epoch:105
Epoch:106
Epoch:107
Epoch:108
Epoch:109
Epoch:110


In [None]:
# Tag the first test sentence and print each prediction next to the gold tag.
# Gradients are not needed for inference, hence torch.no_grad().
with torch.no_grad():
  sample_words, sample_tags = test_data[0]
  inputs = prepare_sequence(sample_words, word_to_index)
  tag_scores = model(inputs)
  # One row of scores per token; the highest-scoring column is the predicted tag id.
  predicted_indices = tag_scores.argmax(dim=1).tolist()
  # Pair predictions with gold tags directly. The loop variable is deliberately
  # not called `id`, which would shadow the builtin for every later cell.
  for predicted_index, real_tag in zip(predicted_indices, sample_tags):
    print(f'Predicted tag:{index_to_tag[predicted_index]}')
    print(f'Real tag:{real_tag}')

In [None]:
# Show the gold tag of the first token in the first test sentence.
# NOTE(review): the saved output under this cell (a column of tensor(...) values)
# does not look like the result of this expression — presumably stale; re-run to confirm.
test_data[0][1][0]

tensor(6)
tensor(0)
tensor(7)
tensor(24)
tensor(0)
tensor(3)
tensor(14)
tensor(12)
tensor(14)
tensor(16)
tensor(1)
tensor(1)
tensor(7)
tensor(8)
tensor(7)
tensor(2)
tensor(11)
tensor(7)
tensor(0)
tensor(6)
tensor(6)
tensor(7)
tensor(13)
tensor(20)
tensor(8)
tensor(2)
tensor(8)
tensor(0)
tensor(10)
tensor(6)
tensor(9)
tensor(13)
tensor(8)
tensor(6)
tensor(0)
tensor(8)
tensor(2)


In [None]:
# Number of words kept in the first test sentence.
len(test_data[0][0])

37