<a href="https://colab.research.google.com/github/Bipinoli/AI_Adventure/blob/master/Pytorch_Seq_model_and_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

# Experimentation

In [40]:
# Seed BEFORE building the LSTM: its weights are drawn from the global RNG at
# construction time, so seeding only afterwards leaves them (and every output
# printed below) different on each run.
torch.manual_seed(1)
lstm = nn.LSTM(input_size=3, hidden_size=4)
print(lstm)


# Re-seed so the inputs and the initial state are drawn from the start of the
# seed-1 stream, regardless of how many random numbers the LSTM init consumed.
torch.manual_seed(1)
inputs = [torch.randn(1,3) for _ in range(5)]  # 5 time steps, 3 features each
print(inputs)

# initial (h_0, c_0) pair, each of shape (1, 1, 4)
hidden = (torch.randn(1,1,4), torch.randn(1,1,4))
original_hidden = hidden
print(hidden)

# Feed the sequence one time step at a time, threading the state through and
# keeping every per-step output so it can be compared with the batched call.
step_outs = []
for step_input in inputs:
  out, hidden = lstm(step_input.view(1,1,-1), hidden)
  step_outs.append(out)
  print("short+long memory", hidden)
  print("short/out  memory:  ", out)
stepwise_out = torch.cat(step_outs, dim=0)
stepwise_hidden = hidden

print(inputs)
print(torch.cat(inputs, dim=1))
print(torch.cat(inputs, dim=0))

# instead of for loop we can do this: pass the whole sequence, reshaped to
# (len(inputs), 1, 3), in a single call starting from the same initial state
hidden = original_hidden
out, hidden = lstm(torch.cat(inputs, dim=0).view(len(inputs), 1, -1), hidden)
print(out)
print(hidden)
print("comparing preivous output with this output should clear things out")

# Make the comparison explicit instead of leaving it to the reader's eyes:
# the step-by-step loop and the single batched call must agree.
assert torch.allclose(stepwise_out, out, atol=1e-6)
assert all(torch.allclose(a, b, atol=1e-6) for a, b in zip(stepwise_hidden, hidden))

LSTM(3, 4)
[tensor([[0.6614, 0.2669, 0.0617]]), tensor([[ 0.6213, -0.4519, -0.1661]]), tensor([[-1.5228,  0.3817, -1.0276]]), tensor([[-0.5631, -0.8923, -0.0583]]), tensor([[-0.1955, -0.9656,  0.4224]])]
(tensor([[[ 0.2673, -0.4212, -0.5107, -1.5727]]]), tensor([[[-0.1232,  3.5870, -1.8313,  1.5987]]]))
short+long memory (tensor([[[-0.1627,  0.3379, -0.4808,  0.3262]]], grad_fn=<StackBackward>), tensor([[[-0.3569,  2.4609, -0.6867,  0.6246]]], grad_fn=<StackBackward>))
short/out  memory:   tensor([[[-0.1627,  0.3379, -0.4808,  0.3262]]], grad_fn=<StackBackward>)
short+long memory (tensor([[[-0.1496,  0.5506, -0.3423,  0.0749]]], grad_fn=<StackBackward>), tensor([[[-0.2599,  1.6627, -0.4990,  0.1304]]], grad_fn=<StackBackward>))
short/out  memory:   tensor([[[-0.1496,  0.5506, -0.3423,  0.0749]]], grad_fn=<StackBackward>)
short+long memory (tensor([[[-0.2471,  0.1699, -0.0382,  0.2335]]], grad_fn=<StackBackward>), tensor([[[-0.6025,  0.6072, -0.0659,  0.5003]]], grad_fn=<StackBackward>)

# LSTM for part-of-speech tagging

## Data Preparation

In [68]:
# Toy corpus: each entry pairs a tokenised sentence with one tag per token.
training_data = [
    (
        "My name is bipin oli".split(),
        ["Pronoun", "Noun", "Verb", "Noun", "Noun"],
    ),
    (
        "Buddha was born in Nepal".split(),
        ["Noun", "Verb", "Verb", "Preposition", "Noun"],
    ),
    (
        "A child is a father of a man".split(),
        ["Determiner", "Noun", "Verb", "Determiner", "Noun", "Preposition", "Determiner", "Noun"],
    ),
]

print(training_data)

# construct vocabulary: indices are handed out in order of first appearance
words_vocab = {}  # lower-cased word -> index
pos_vocab = {}    # tag name -> index

for tokens, tags in training_data:
  for token in tokens:
    # setdefault only inserts when the key is new, so existing indices are kept
    words_vocab.setdefault(token.lower(), len(words_vocab))
  for tag in tags:
    pos_vocab.setdefault(tag, len(pos_vocab))

# reverse lookup (index -> tag name), used to print predictions
pos_names = {index: tag for tag, index in pos_vocab.items()}

print(words_vocab)
print(pos_vocab)
print(pos_names)

[(['My', 'name', 'is', 'bipin', 'oli'], ['Pronoun', 'Noun', 'Verb', 'Noun', 'Noun']), (['Buddha', 'was', 'born', 'in', 'Nepal'], ['Noun', 'Verb', 'Verb', 'Preposition', 'Noun']), (['A', 'child', 'is', 'a', 'father', 'of', 'a', 'man'], ['Determiner', 'Noun', 'Verb', 'Determiner', 'Noun', 'Preposition', 'Determiner', 'Noun'])]
{'my': 0, 'name': 1, 'is': 2, 'bipin': 3, 'oli': 4, 'buddha': 5, 'was': 6, 'born': 7, 'in': 8, 'nepal': 9, 'a': 10, 'child': 11, 'father': 12, 'of': 13, 'man': 14}
{'Pronoun': 0, 'Noun': 1, 'Verb': 2, 'Preposition': 3, 'Determiner': 4}
{0: 'Pronoun', 1: 'Noun', 2: 'Verb', 3: 'Preposition', 4: 'Determiner'}


## LSTM model

In [0]:
import torch.nn.functional as F

class LSTMTagger(nn.Module):
  """Part-of-speech tagger: word embedding -> single-layer LSTM -> linear tag scorer.

  Args:
    words_vocab_size: number of distinct word indices the embedding table holds.
    pos_vocab_size: number of tags scored for every word.
    word_vector_dim: size of each word embedding (the LSTM input size).
    hidden_dim: size of the LSTM hidden state. Defaults to ``word_vector_dim``,
      which reproduces the behaviour of the three-argument constructor.
  """

  def __init__(self, words_vocab_size, pos_vocab_size, word_vector_dim, hidden_dim=None):
    super(LSTMTagger, self).__init__()

    if hidden_dim is None:
      hidden_dim = word_vector_dim

    self.wordEmbeds = nn.Embedding(words_vocab_size, word_vector_dim)
    self.lstm = nn.LSTM(input_size = word_vector_dim, hidden_size = hidden_dim)
    self.hidden2tag = nn.Linear(hidden_dim, pos_vocab_size)


  def forward(self, sentence):
    """Score every tag for every word of one sentence.

    Args:
      sentence: 1-D LongTensor of word indices (not the raw word strings).

    Returns:
      Tensor of shape (len(sentence), pos_vocab_size) holding, for each word,
      the log-probability of each tag.
    """
    # it may seem that this network is not using any non-linear activation
    # functions, but inside the LSTM there are sigmoid and tanh activations
    seq_len = len(sentence)
    embeds = self.wordEmbeds(sentence)
    # nn.LSTM expects (seq_len, batch, features); the sentence is a batch of one.
    # No initial state is passed, so the LSTM starts from its default (zeros).
    outs, _ = self.lstm(embeds.view(seq_len, 1, -1))
    # drop the batch axis again before scoring: (seq_len, hidden_dim)
    tags = self.hidden2tag(outs.view(seq_len, -1))
    tags_score = F.log_softmax(tags, dim=1)
    return tags_score

## Training

In [83]:
# Seed before building the model so its initial weights -- and therefore the
# trained tagger -- are the same on every run of this cell.
torch.manual_seed(1)
model = LSTMTagger(len(words_vocab), len(pos_vocab), word_vector_dim = 7)
loss_func = nn.NLLLoss() # it expects log-probabilities and target class indices as input
optimizer = optim.SGD(model.parameters(), lr=0.1)

print("parameters to learn value of:")
for name, param in model.named_parameters():
  print(name)


# Encode each (words, tags) pair as index tensors once, up front, instead of
# rebuilding identical tensors on every one of the 300 epochs.
encoded_training_data = [
    (
        torch.tensor([words_vocab[word.lower()] for word in words], dtype=torch.long),
        torch.tensor([pos_vocab[tag] for tag in tags], dtype=torch.long),
    )
    for words, tags in training_data
]

for epoch in range(300): # just going overboard with toy data
  for sentence, pos in encoded_training_data:
    # gradients accumulate across backward() calls, so clear them per sentence
    model.zero_grad()
    pos_scores = model(sentence)
    loss = loss_func(pos_scores, pos)
    loss.backward()
    optimizer.step()
    

parameters to learn value of:
wordEmbeds.weight
lstm.weight_ih_l0
lstm.weight_hh_l0
lstm.bias_ih_l0
lstm.bias_hh_l0
hidden2tag.weight
hidden2tag.bias


## Testing

In [92]:
# Inference only: no gradients are needed, so run everything under no_grad.
with torch.no_grad():

  # every word must already be in words_vocab; unseen words are not handled
  test_data = ("Buddha was a child of a man").split()

  word_ids = [words_vocab[word.lower()] for word in test_data]
  sentence = torch.tensor(word_ids, dtype=torch.long)

  pred_scores = model(sentence)
  # index of the best-scoring tag for each word (max across dim 1)
  pred_indexes = torch.max(pred_scores, dim=1).indices

  print("Tagging")
  # words on one line, each followed by a space, then a newline
  print(*test_data, end=" \n")
  # predicted tag names on the next line, in the same layout
  predicted_tags = [pos_names[tag_id] for tag_id in pred_indexes.tolist()]
  print(*predicted_tags, end=" ")

Tagging
Buddha was a child of a man 
Noun Verb Determiner Noun Preposition Determiner Noun 