# Recurrent Neural Networks

In [178]:
import numpy as np
import torch
from torch import nn
import requests
import zipfile
from io import BytesIO
import unicodedata
from sklearn.preprocessing import LabelEncoder
import string

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device:', device)

Device: cuda


## Data Processing

### Collecting data

In [245]:
# Extract data
# Download the names archive and build two parallel lists: `data` holds the
# ASCII-folded names and `labels` holds the category of each name.
#
# TLS certificate verification is left at its default (enabled) so the archive
# cannot be silently replaced in transit, a timeout keeps the cell from hanging
# forever, and HTTP errors are raised here instead of surfacing later as a
# confusing `BadZipFile`.
response = requests.get(
    'https://download.pytorch.org/tutorial/data.zip',
    timeout=60,
)
response.raise_for_status()

data, labels = [], []
with zipfile.ZipFile(BytesIO(response.content)) as zfile:
    name_files = [
        f
        for f in zfile.namelist()
        if f.endswith('.txt') and f.startswith('data/names')
    ]
    for filename in name_files:
        with zfile.open(filename) as file:
            lines = file.read().decode('utf-8').strip().split('\n')

        # Fold accents away: decompose with NFKD, then drop every code point
        # that cannot be represented in ASCII.
        names = [
            unicodedata.normalize(
                'NFKD',
                line
            ).encode('ascii', 'ignore').decode('utf-8')
            for line in lines
        ]
        # The category is the file stem, e.g. 'data/names/Arabic.txt' -> 'Arabic'.
        category = filename.split('/')[-1].split('.')[0]
        temp_labels = np.repeat(category, len(names))

        data.extend(names)
        labels.extend(temp_labels)
            



### Label Encoding

In [365]:
# Learn the category -> integer mapping from the raw labels, then apply it.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)
# Category names, in the order of their integer codes.
categories = label_encoder.classes_

def label_to_tensor(label, encoder):
    """Encode one label or a batch of labels as a column of integer codes.

    Args:
        label: A single category name, or any 1-D collection of names
            (list, tuple or NumPy array).
        encoder: A fitted encoder exposing ``transform``; it maps category
            names to integer codes.

    Returns:
        A ``torch.long`` tensor of shape ``(n_labels, 1)`` placed on the
        notebook-level ``device``.
    """
    # np.atleast_1d turns a scalar into a 1-element array and leaves
    # lists/tuples/arrays as 1-D arrays. The previous `isinstance(label, list)`
    # check wrapped tuples and arrays in an extra list, which then failed
    # inside `encoder.transform`.
    label_batch = np.atleast_1d(label)
    codes = np.asarray(encoder.transform(label_batch)).reshape(-1, 1)
    # Build the tensor directly on the target device (no CPU -> device copy).
    return torch.as_tensor(codes, dtype=torch.long, device=device)

# Display each category next to the integer code the encoder assigns to it.
np.vstack([categories, label_encoder.transform(categories)]).T

array([['Arabic', '0'],
       ['Chinese', '1'],
       ['Czech', '2'],
       ['Dutch', '3'],
       ['English', '4'],
       ['French', '5'],
       ['German', '6'],
       ['Greek', '7'],
       ['Irish', '8'],
       ['Italian', '9'],
       ['Japanese', '10'],
       ['Korean', '11'],
       ['Polish', '12'],
       ['Portuguese', '13'],
       ['Russian', '14'],
       ['Scottish', '15'],
       ['Spanish', '16'],
       ['Vietnamese', '17']], dtype='<U11')

### Name Encoding

In [366]:
# Alphabet used for one-hot encoding: ASCII letters, a few punctuation marks
# and the digits. A character's position in this string is the column that
# `name_to_tensor` sets to 1 for it.
dictionary = string.ascii_letters + " .,;'-/:" + '0123456789'

def name_to_tensor(name):
    """One-hot encode a name, one row per character.

    Args:
        name: The string to encode. Every character must occur in the
            notebook-level ``dictionary``.

    Returns:
        A float tensor of shape ``(len(name), len(dictionary))`` on the
        notebook-level ``device``; row ``i`` has a single 1 at the position
        of ``name[i]`` in ``dictionary``.

    Raises:
        ValueError: If a character of ``name`` is not in ``dictionary``.
    """
    # Allocate on the target device directly instead of creating the tensor
    # on the CPU and copying it over.
    tensor = torch.zeros(len(name), len(dictionary), device=device)
    for i, char in enumerate(name):
        try:
            column = dictionary.index(char)
        except ValueError:
            # Catch only the lookup failure: a bare `except` would also turn
            # KeyboardInterrupt and unrelated errors into this message.
            raise ValueError(f'Character {char} not in dictionary') from None
        tensor[i, column] = 1
    return tensor

# Sanity check: encode a short name and display the resulting one-hot rows.
name_to_tensor('Edy')

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')

### Balancing Classes Within Each Batch

In [386]:
def sample_batch(data, labels, encoder, size):
    """Draw a class-balanced batch: `size` names from every category.

    Names are sampled with replacement inside each category, so categories
    with fewer than `size` names are oversampled. The returned batch holds
    ``size * n_categories`` items, grouped by category in sorted order.

    Args:
        data: Sequence of names (strings).
        labels: Sequence of category names, parallel to `data`.
        encoder: Fitted label encoder, forwarded to `label_to_tensor`.
        size: Number of names to draw per category.

    Returns:
        A tuple ``(data_tns, labels_tns)``: a list with one tensor per
        sampled name (from `name_to_tensor`) and the encoded labels (from
        `label_to_tensor`).
    """
    # Convert once, outside the loop. The inputs are only read, so no
    # defensive copies are needed.
    data_arr = np.array(data)
    labels_arr = np.array(labels)

    data_batch, labels_batch = [], []

    for cat in np.unique(labels_arr):
        # Positions of every sample that belongs to this category.
        idxs_cat = np.flatnonzero(labels_arr == cat)
        selected_idx = np.random.choice(idxs_cat, size=size, replace=True)

        data_batch.extend(data_arr[selected_idx])
        labels_batch.extend(labels_arr[selected_idx])

    data_tns = [name_to_tensor(d) for d in data_batch]
    labels_tns = label_to_tensor(labels_batch, encoder=encoder)
    return data_tns, labels_tns

# Smoke-test the sampler: draw 32 names per category.
# (`rotulos` is Portuguese for "labels".)
data_batch, rotulos_batch = sample_batch(
  data, labels, encoder=label_encoder, size=32
)

## RNN Cell

In [457]:
class RNN(nn.Module):
    """Character-level classifier built on a single `nn.RNNCell`.

    The cell consumes a name one character at a time; the final hidden state
    is projected to `output_size` scores and normalised with log-softmax.

    Args:
        input_size: Width of each character vector.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of classes to score.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.recurrent = nn.RNNCell(input_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.activation = nn.LogSoftmax(dim=1)

    def forward(self, name):
        """Classify one name.

        Args:
            name: Iterable of per-character vectors, each of length
                `input_size` (for example a ``(len, input_size)`` tensor).

        Returns:
            Log-probabilities of shape ``(1, output_size)``.
        """
        # Take the device and dtype of the initial hidden state from the
        # model's own weights instead of a notebook global, so the module
        # still works after `.to(...)` / `.double()` and outside the notebook.
        h = self.output.weight.new_zeros(1, self.hidden_size)

        for char in name:
            # RNNCell expects a batch dimension; each character is a batch of 1.
            h = self.recurrent(char.unsqueeze(0), h)

        output = self.output(h)
        output = self.activation(output)

        return output



In [458]:
def train_step(model, criterion, optimizer, data, labels):
    """Run one pass over `data`, updating the model after every sample.

    Args:
        model: Module called on each item of `data`.
        criterion: Loss called as ``criterion(model_output, label)``.
        optimizer: Optimizer stepped once per sample.
        data: Iterable of model inputs.
        labels: Iterable of targets, parallel to `data`.

    Returns:
        ``(mean, std)`` of the per-sample losses seen during the pass.
    """
    model.train()

    step_losses = []
    for sample, target in zip(data, labels):
        prediction = model(sample)
        step_loss = criterion(prediction, target)
        # Record a detached copy so the stored history does not keep the
        # autograd graph alive.
        step_losses.append(step_loss.detach().cpu().numpy())

        optimizer.zero_grad()
        step_loss.backward()
        optimizer.step()

    history = np.array(step_losses)
    return history.mean(), history.std()

In [460]:
# One input unit per dictionary character, a 256-wide hidden state and one
# output unit per category. `Module.to` returns the module itself, so the
# construction and the device move can be chained.
model = RNN(len(dictionary), 256, len(categories)).to(device)

print(model)

# The model emits log-probabilities, hence negative log-likelihood loss.
criterion = nn.NLLLoss().to(device)
optim_parameters = dict(lr=5e-5, weight_decay=1e-7)
optimizer = torch.optim.Adam(model.parameters(), **optim_parameters)

for epoch in range(200):
    # Each iteration trains on a fresh balanced batch of 50 names per category.
    data_batch, labels_batch = sample_batch(
        data, labels, encoder=label_encoder, size=50
    )
    loss_mean, loss_std = train_step(
        model, criterion, optimizer, data_batch, labels_batch
    )
    print(loss_mean, loss_std)

RNN(
  (recurrent): RNNCell(70, 256)
  (output): Linear(in_features=256, out_features=18, bias=True)
  (activation): LogSoftmax(dim=1)
)
2.784439 0.48785064
2.6883578 0.5690623
2.7020793 0.6311553
2.7034342 0.55232114
2.6746027 0.5526362
2.6145952 0.56827426
2.6231782 0.5710179
2.5376334 0.56354606
2.5659747 0.6169963
2.5164654 0.59072524
2.4344406 0.70919186
2.3424273 0.74793226
2.2299218 0.7966887
2.1566365 0.82914597
2.0521865 0.8440369
2.113647 0.9492568
2.169059 0.9362336
2.1218321 0.8855945
2.0815325 0.8750496
2.0778499 0.8690164
2.0237267 0.88405126
1.9807699 0.8873127
2.0654137 0.97155064
2.1196823 0.940999
2.0822706 0.94067717
2.0631886 0.92870706
2.0547864 0.9654269
2.074694 0.96843135
2.023165 0.99979055
2.0120852 0.9161863
1.9230075 0.88972884
1.9693134 0.95069516
1.8894516 0.891388
1.8910749 0.92503774
1.8827585 0.96744275
1.8466097 0.9440686
1.8283675 0.9961492
1.8137956 0.9543905
1.8545247 1.0211815
1.9163531 1.0181613
1.9153602 1.0293472
1.9655445 0.9859263
1.9113257 1.

KeyboardInterrupt: 