<a href="https://colab.research.google.com/github/DenisOgr/kaggle-experiments/blob/master/NLP_FROM_SCRATCH_GENERATING_NAMES_WITH_A_CHARACTER_LEVEL_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## NLP FROM SCRATCH: GENERATING NAMES WITH A CHARACTER-LEVEL RNN
Source: https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html


In [0]:
import torch
from torch import nn
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
import random

### Getting data

In [2]:
!mkdir /content/data
!wget https://download.pytorch.org/tutorial/data.zip -P /content/data
!unzip -o -d /content/data/ /content/data/data.zip 

--2020-03-07 16:21:31--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 13.224.253.59, 13.224.253.92, 13.224.253.46, ...
Connecting to download.pytorch.org (download.pytorch.org)|13.224.253.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘/content/data/data.zip’


2020-03-07 16:21:31 (27.9 MB/s) - ‘/content/data/data.zip’ saved [2882130/2882130]

Archive:  /content/data/data.zip
   creating: /content/data/data/
  inflating: /content/data/data/eng-fra.txt  
   creating: /content/data/data/names/
  inflating: /content/data/data/names/Arabic.txt  
  inflating: /content/data/data/names/Chinese.txt  
  inflating: /content/data/data/names/Czech.txt  
  inflating: /content/data/data/names/Dutch.txt  
  inflating: /content/data/data/names/English.txt  
  inflating: /content/data/data/names/French.txt  
  inflating: /content/data/data/names/German.txt  
  inflating: 

### Preprocessing

In [0]:
hidden_size = 128
all_letters = string.ascii_letters + " .,;'-"
n_letters = len(all_letters) + 1 # Plus EOS marker
#def Utf2Ascii
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [0]:
#def get dict all_categories
all_categories = {}

for file in glob.glob('/content/data/data/names/*'):
  category = os.path.basename(file).split('.')[0].lower()
  with open(file,encoding='utf-8') as f:
    l = [unicodeToAscii(name) for name in f.read().split("\n")]
    all_categories[category] = list(filter(lambda x: x != "", l))
n_categories = len(all_categories)
all_categories_names = list(all_categories.keys())  

In [5]:
print(all_categories_names)
print(all_categories['dutch'][:10])

['korean', 'french', 'scottish', 'german', 'vietnamese', 'chinese', 'english', 'spanish', 'czech', 'irish', 'russian', 'greek', 'polish', 'japanese', 'portuguese', 'dutch', 'italian', 'arabic']
['Aalsburg', 'Aalst', 'Aarle', 'Achteren', 'Achthoven', 'Adrichem', 'Aggelen', 'Agteren', 'Agthoven', 'Akkeren']


### Training

In [0]:
def randomItem(all_categories):
  '''
  input dict all_categories {'string': <list[string]>}
  return: string category, string name
  '''
  category = random.choice(all_categories_names)
  name = random.choice(all_categories[category])
  return category, name

def cat2tensor(category):
  '''
  input: string category
  return: torch.Tensor((1,n_categories))
  '''
  tensor = torch.zeros((1, n_categories))
  tensor[0][all_categories_names.index(category)] = 1
  return tensor  

def name2tensor(name):
  '''
  input string name
  return torch.Tensor((len(name), n_letters))
  ''' 
  tensor = torch.zeros((len(name), 1, n_letters))
  for idx, ch in enumerate(name):
    tensor[idx][0][all_letters.index(ch)] = 1
  return tensor

def name2outputTensor(name):
  '''
  Compute output with EOS
  input string name
  return torch.Tensor((len(name),n_letters))
  '''
  l = [all_letters.index(ch) for ch in name]
  l.append(n_letters-1) #add EOS
  return torch.LongTensor(l)

def output2letter(output):
  idx = output.topk(1)[1][0][0].item()
  try:
    return all_letters[idx]
  except IndexError:
    return '<EOS>'


#### Model

In [0]:
class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(RNN, self).__init__()
    comb_input_size = n_categories + input_size+hidden_size
    self.i2h = nn.Linear(comb_input_size, hidden_size)
    self.i2o = nn.Linear(comb_input_size, output_size)
    self.o2o = nn.Linear(output_size+hidden_size, output_size)
    self.dropout = nn.Dropout(p=0.1)
    self.softmax = nn.LogSoftmax(dim=1)
  
  def forward(self, caterory, letter, hidden):
    combined = torch.cat((caterory, letter, hidden), dim=1)
    hidden = self.i2h(combined)
    output = self.i2o(combined)

    combined = torch.cat((hidden, output), dim=1)
    output = self.softmax(self.dropout(self.o2o(combined)))

    return hidden, output

In [0]:
# rnn = RNN(n_letters, hidden_size, n_letters)
# c, n = randomItem(all_categories)
# hidden = torch.zeros((1, hidden_size))
# rnn(cat2tensor(c), name2tensor(n)[0], hidden)

#### Train process

In [0]:
def train(category, name):
  '''
  Init hidden state
  Iterate over all characters in word
  Compute loss and gradients
  Apply gradients to network parametrs 
  '''
  rnn.zero_grad()
  all_loss = 0
  hidden = torch.zeros((1, hidden_size)).cuda()
  catTensor = cat2tensor(category).cuda()
  nameTensor = name2tensor(name).cuda()
  targetTensor = name2outputTensor(name).cuda()
  targetTensor.unsqueeze_(-1)

  for i in range(nameTensor.size(0)):
    input = nameTensor[i]
    
    hidden, output = rnn(catTensor, input, hidden)
    loss = criteria(output, targetTensor[i])
    all_loss += loss
  
  all_loss.backward()

  for p in rnn.parameters():
    p.data.add_(-lr, p.grad.data)

  return all_loss.item() / nameTensor.size(0)

In [21]:
#rnn = RNN(n_letters, hidden_size, n_letters)
#rnn.cuda()
lr = 0.0005
n_iteration=100000
criteria = nn.NLLLoss()
all_loss = 0
print_every = 5000

for i in range(1, n_iteration):
  category, name = randomItem(all_categories)
  loss = train(category, name)
  all_loss +=loss
  if i % print_every == 0:
    print("[%d]: loss: %.2f"%(i, all_loss/print_every))
    all_loss=0


[5000]: loss: 0.34
[10000]: loss: 0.33
[15000]: loss: 0.32
[20000]: loss: 0.31
[25000]: loss: 0.31
[30000]: loss: 0.30
[35000]: loss: 0.29
[40000]: loss: 0.28
[45000]: loss: 0.28
[50000]: loss: 0.27
[55000]: loss: 0.26
[60000]: loss: 0.27
[65000]: loss: 0.26
[70000]: loss: 0.25
[75000]: loss: 0.25
[80000]: loss: 0.25
[85000]: loss: 0.25
[90000]: loss: 0.25
[95000]: loss: 0.24


In [22]:
torch.save(rnn, './rnn.model')

  "type " + obj.__name__ + ". It won't be checked "


#### Evaluate process

### Predict/evaluate

In [0]:
def generate(cat, letter, max_size = 10):
  with torch.no_grad():
    result = letter
    categoryTensor = cat2tensor(cat).cuda()
    hidden = torch.zeros((1, hidden_size)).cuda()
    for _ in range(max_size):
      letterTensor = name2tensor(letter).cuda()
      hidden, output = rnn(categoryTensor, letterTensor[0], hidden)
      letter = output2letter(output)
      if letter != '<EOS>':
        result += letter
  return result



In [24]:
generate('russian', 'A')

'AAkkkkkkkkk'