In [1]:
import os
import numpy as np

import torch
from torch import nn
from torch import optim

from torch.utils.data import Dataset, DataLoader

In [2]:
### Data files
DIR = os.getcwd().replace('notebooks', 'data')
print(os.listdir(DIR))

['Arabic.txt', 'Chinese.txt', 'Czech.txt', 'Dutch.txt', 'English.txt', 'French.txt', 'German.txt', 'Greek.txt', 'Irish.txt', 'Italian.txt', 'Japanese.txt', 'Korean.txt', 'Polish.txt', 'Portuguese.txt', 'Russian.txt', 'Scottish.txt', 'Spanish.txt', 'Vietnamese.txt']


In [3]:
class TextDataset(Dataset):
    '''
    Text Dataset object.
    '''
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.file_names = list(map(lambda x: os.path.join(self.root_dir, x), os.listdir(self.root_dir)))
        self.classes = list(map(lambda x: x.split('.')[0], os.listdir(self.root_dir)))
        
        self.int2label = dict(enumerate(self.classes))
        self.label2int = {v : k for (k, v) in self.int2label.items()}
        
        self.files = [self.read_file(f) for f in self.file_names]
        self.data, self.labels = list(), list()
        
        for file, label in self.files:
            self.data += file
            self.labels += label
        
        self.unique_characters = self.get_unique_chars()
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, ix):
        string = self.data[ix]
        string_data = torch.tensor([self.unique_characters[s] for s in string], dtype = torch.int32)
        return string_data, self.labels[ix]
    
    def read_file(self, f):
        with open(f, 'rb') as file:
            contents = str(file.read(), encoding = 'utf-8').split('\n')
        
        labels = [self.label2int[f.split('.')[0].split('\\')[-1]]]*len(contents)
        
        return contents[:-1], labels[:-1]
    
    def get_unique_chars(self):
        unique_chars = sorted(list(set(''.join(list(map(lambda x: ''.join(x[0]), self.files))))))
        return {v : k for (k, v) in dict(enumerate(unique_chars)).items()}

In [4]:
data = TextDataset(DIR)

In [5]:
data[0]

(tensor([17, 40, 47, 53, 50, 57], dtype=torch.int32), 0)

In [6]:
data[100][0].shape

torch.Size([5])

In [7]:
data.unique_characters

{' ': 0,
 "'": 1,
 ',': 2,
 '-': 3,
 '/': 4,
 '1': 5,
 ':': 6,
 'A': 7,
 'B': 8,
 'C': 9,
 'D': 10,
 'E': 11,
 'F': 12,
 'G': 13,
 'H': 14,
 'I': 15,
 'J': 16,
 'K': 17,
 'L': 18,
 'M': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'Q': 23,
 'R': 24,
 'S': 25,
 'T': 26,
 'U': 27,
 'V': 28,
 'W': 29,
 'X': 30,
 'Y': 31,
 'Z': 32,
 'a': 33,
 'b': 34,
 'c': 35,
 'd': 36,
 'e': 37,
 'f': 38,
 'g': 39,
 'h': 40,
 'i': 41,
 'j': 42,
 'k': 43,
 'l': 44,
 'm': 45,
 'n': 46,
 'o': 47,
 'p': 48,
 'q': 49,
 'r': 50,
 's': 51,
 't': 52,
 'u': 53,
 'v': 54,
 'w': 55,
 'x': 56,
 'y': 57,
 'z': 58,
 '\xa0': 59,
 'Á': 60,
 'É': 61,
 'ß': 62,
 'à': 63,
 'á': 64,
 'ã': 65,
 'ä': 66,
 'ç': 67,
 'è': 68,
 'é': 69,
 'ê': 70,
 'ì': 71,
 'í': 72,
 'ñ': 73,
 'ò': 74,
 'ó': 75,
 'õ': 76,
 'ö': 77,
 'ù': 78,
 'ú': 79,
 'ü': 80,
 'ą': 81,
 'ł': 82,
 'ń': 83,
 'Ś': 84,
 'Ż': 85,
 'ż': 86}

In [8]:
test = torch.randint(0, 20, size = (32, 5, 10))

In [9]:
embedder = nn.Embedding(100, 3)

In [10]:
embedder(data[0][0])

tensor([[-1.9723, -0.0480, -1.6166],
        [ 0.6346, -0.3220, -1.1672],
        [-0.1794,  0.8474, -1.0163],
        [-0.3584,  0.6559,  0.5314],
        [-0.2146,  1.0896,  0.8415],
        [-0.8657,  1.0268, -1.2825]], grad_fn=<EmbeddingBackward0>)

In [11]:
embedder(data[100][0])

tensor([[ 1.0383,  0.0691,  0.2952],
        [ 0.9406,  2.2510, -1.4453],
        [-0.3726, -1.0804, -0.3300],
        [-0.3726, -1.0804, -0.3300],
        [ 1.7683,  0.5268, -0.9234]], grad_fn=<EmbeddingBackward0>)

In [12]:
r = embedder(test)

In [13]:
r.shape

torch.Size([32, 5, 10, 3])

In [14]:
print(help(nn.RNN))

Help on class RNN in module torch.nn.modules.rnn:

class RNN(RNNBase)
 |  RNN(*args, **kwargs)
 |  
 |  Applies a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}` non-linearity to an
 |  input sequence.
 |  
 |  
 |  For each element in the input sequence, each layer computes the following
 |  function:
 |  
 |  .. math::
 |      h_t = \tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})
 |  
 |  where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
 |  the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
 |  previous layer at time `t-1` or the initial hidden state at time `0`.
 |  If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`.
 |  
 |  Args:
 |      input_size: The number of expected features in the input `x`
 |      hidden_size: The number of features in the hidden state `h`
 |      num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
 |          would mean stacking tw

In [15]:
len(data)

20074

In [16]:
data[0][0]

tensor([32, 62, 21, 77, 33, 17], dtype=torch.int32)

In [17]:
def pad_and_pack(batch):
    data_ = []
    labels = []
    lengths = []
    
    for X, y in batch:
        data_.append(X)
        labels.append(y)
        lengths.append(X.shape[0])
        
    X_pad = torch.nn.utils.rnn.pad_sequence(data_, batch_first = False)
    X_pack = torch.nn.utils.rnn.pack_padded_sequence(X_pad, lengths, batch_first = False, enforce_sorted = False)
    
    return X_pack, torch.tensor(labels, dtype = torch.int64)

In [18]:
d = [(torch.randn(torch.randint(30, 40, (1,)).item(), 16), torch.randint(0, 5, (1,)).item()) for n in range(48)]

In [19]:
for i in d:
    print(i[0].shape)

torch.Size([39, 16])
torch.Size([35, 16])
torch.Size([34, 16])
torch.Size([39, 16])
torch.Size([38, 16])
torch.Size([34, 16])
torch.Size([32, 16])
torch.Size([36, 16])
torch.Size([39, 16])
torch.Size([36, 16])
torch.Size([34, 16])
torch.Size([39, 16])
torch.Size([31, 16])
torch.Size([32, 16])
torch.Size([36, 16])
torch.Size([33, 16])
torch.Size([36, 16])
torch.Size([31, 16])
torch.Size([30, 16])
torch.Size([31, 16])
torch.Size([30, 16])
torch.Size([38, 16])
torch.Size([36, 16])
torch.Size([34, 16])
torch.Size([37, 16])
torch.Size([38, 16])
torch.Size([34, 16])
torch.Size([32, 16])
torch.Size([37, 16])
torch.Size([38, 16])
torch.Size([39, 16])
torch.Size([34, 16])
torch.Size([33, 16])
torch.Size([37, 16])
torch.Size([34, 16])
torch.Size([36, 16])
torch.Size([32, 16])
torch.Size([35, 16])
torch.Size([39, 16])
torch.Size([37, 16])
torch.Size([33, 16])
torch.Size([30, 16])
torch.Size([31, 16])
torch.Size([31, 16])
torch.Size([30, 16])
torch.Size([32, 16])
torch.Size([36, 16])
torch.Size([3

In [20]:
a, b = pad_and_pack(d)

In [21]:
train_sampler, test_sampler = torch.utils.data.random_split(data, lengths = [len(data)-300, 300])

train_dl = DataLoader(train_sampler, batch_size = 8, shuffle = True, collate_fn = pad_and_pack)
test_dl = DataLoader(test_sampler, batch_size = 8, shuffle = False, collate_fn = pad_and_pack)

In [22]:
class PackedEmbedding(nn.Module):
    def __init__(self, embedding_layer):
        super(PackedEmbedding, self).__init__()
        self.embedding = embedding_layer
        
    def forward(self, x):
        if type(x) == torch.nn.utils.rnn.PackedSequence:
            unpacked_sequence, lengths = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first = True)
            y = self.embedding(unpacked_sequence)
            y = torch.nn.utils.rnn.pack_padded_sequence(y, lengths, batch_first = True, enforce_sorted = False)
        else:
            y = self.embedding(x)
        
        return y

In [23]:
class RNNNetwork(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_class, feature_size, p = 0.3):
        super(RNNNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_class = num_class
        self.feature_size = feature_size
        self.p = p
        
        self.embedder = PackedEmbedding(nn.Embedding(self.vocab_size, self.feature_size))
        self.rnn = nn.RNN(input_size = self.feature_size, hidden_size = self.hidden_size, batch_first = True)
        self.fc = nn.Linear(self.hidden_size, self.num_class)
        self.dropout = nn.Dropout(p = self.p)
        
    def forward(self, x):
        x = self.embedder(x)
        output, state = self.rnn(x)
        if type(state) == tuple:
            state = state[0]
        state = state.permute(1, 0, 2).squeeze()
        return torch.log_softmax(self.fc(state), dim = -1)

In [24]:
### Instantiate network
rnn = RNNNetwork(vocab_size = len(data.unique_characters), hidden_size = 256,
                 num_class = len(data.classes), p = 0.45, feature_size = 64)

In [25]:
print(rnn)

RNNNetwork(
  (embedder): PackedEmbedding(
    (embedding): Embedding(87, 64)
  )
  (rnn): RNN(64, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
  (dropout): Dropout(p=0.45, inplace=False)
)


In [26]:
epochs = 250
lr = 1e-5
criterion = nn.NLLLoss()

optimizer = optim.Adam([{'params': rnn.embedder.parameters(), 'lr': lr/2},
                        {'params': rnn.rnn.parameters(), 'lr': lr},
                        {'params': rnn.fc.parameters(), 'lr': lr/2}], lr = lr)

scheduler = optim.lr_scheduler.CyclicLR(optimizer, cycle_momentum = False,
                                        base_lr = lr/10, max_lr = lr*10)

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
train_loss = list()
test_loss = list()
train_acc = list()
test_acc = list()

history = dict()

for epoch in range(1, epochs +1):
    rnn.train()
    for X, y in train_dl:
        y_pred = rnn(X)
        
        loss = criterion(y_pred, y)
        acc = accuracy_score(y, y_pred.max(dim = -1).indices)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())
        train_acc.append(acc)
    
    scheduler.step()
    rnn.eval()
    with torch.no_grad():
        for X_, y_ in test_dl:
            y_p = rnn(X_)
            
            acc = accuracy_score(y_, y_p.max(dim = -1).indices)
            loss = criterion(y_p, y_)
            
            test_loss.append(loss.item())
            test_acc.append(acc)
    
    history[epoch] = dict()
    history[epoch]['train_loss'] = sum(train_loss)/len(train_loss)
    history[epoch]['train_acc'] = sum(train_acc)/len(train_acc)
    
    history[epoch]['test_loss'] = sum(test_loss)/len(test_loss)
    history[epoch]['test_acc'] = sum(test_acc)/len(test_acc)
    
    print(f"Epoch {epoch:02d}:",
          f"\n\tTrain loss -> {history[epoch]['train_loss']: .4f} | Test loss -> {history[epoch]['test_loss']: .4f}")
    print(f"\tTrain accuracy -> {history[epoch]['train_acc']: .4f} | Test accuracy -> {history[epoch]['test_acc']: .4f}")
    
    train_loss.clear()
    test_loss.clear()
    train_acc.clear()
    test_acc.clear()
    

Epoch 01: 
	Train loss ->  2.7137 | Test loss ->  2.5515
	Train accuracy ->  0.2592 | Test accuracy ->  0.4474
Epoch 02: 
	Train loss ->  2.3719 | Test loss ->  2.1325
	Train accuracy ->  0.4575 | Test accuracy ->  0.4967
Epoch 03: 
	Train loss ->  1.9454 | Test loss ->  1.7218
	Train accuracy ->  0.4703 | Test accuracy ->  0.4901
Epoch 04: 
	Train loss ->  1.7270 | Test loss ->  1.5967
	Train accuracy ->  0.4701 | Test accuracy ->  0.4901
Epoch 05: 
	Train loss ->  1.6502 | Test loss ->  1.5351
	Train accuracy ->  0.4775 | Test accuracy ->  0.5000
Epoch 06: 
	Train loss ->  1.6003 | Test loss ->  1.4921
	Train accuracy ->  0.4926 | Test accuracy ->  0.5099
Epoch 07: 
	Train loss ->  1.5588 | Test loss ->  1.4563
	Train accuracy ->  0.5123 | Test accuracy ->  0.5395
Epoch 08: 
	Train loss ->  1.5207 | Test loss ->  1.4223
	Train accuracy ->  0.5286 | Test accuracy ->  0.5625
Epoch 09: 
	Train loss ->  1.4849 | Test loss ->  1.3902
	Train accuracy ->  0.5496 | Test accuracy ->  0.5658
E

Epoch 75: 
	Train loss ->  0.7804 | Test loss ->  0.7949
	Train accuracy ->  0.7709 | Test accuracy ->  0.7730
Epoch 76: 
	Train loss ->  0.7760 | Test loss ->  0.7909
	Train accuracy ->  0.7706 | Test accuracy ->  0.7796
Epoch 77: 
	Train loss ->  0.7714 | Test loss ->  0.7834
	Train accuracy ->  0.7725 | Test accuracy ->  0.7697
Epoch 78: 
	Train loss ->  0.7667 | Test loss ->  0.7803
	Train accuracy ->  0.7735 | Test accuracy ->  0.7730
Epoch 79: 
	Train loss ->  0.7623 | Test loss ->  0.7759
	Train accuracy ->  0.7743 | Test accuracy ->  0.7763
Epoch 80: 
	Train loss ->  0.7578 | Test loss ->  0.7707
	Train accuracy ->  0.7751 | Test accuracy ->  0.7763
Epoch 81: 
	Train loss ->  0.7536 | Test loss ->  0.7674
	Train accuracy ->  0.7769 | Test accuracy ->  0.7730
Epoch 82: 
	Train loss ->  0.7492 | Test loss ->  0.7651
	Train accuracy ->  0.7776 | Test accuracy ->  0.7829
Epoch 83: 
	Train loss ->  0.7448 | Test loss ->  0.7604
	Train accuracy ->  0.7788 | Test accuracy ->  0.7829
E

Epoch 149: 
	Train loss ->  0.5266 | Test loss ->  0.5819
	Train accuracy ->  0.8391 | Test accuracy ->  0.8355
Epoch 150: 
	Train loss ->  0.5248 | Test loss ->  0.5838
	Train accuracy ->  0.8405 | Test accuracy ->  0.8289
Epoch 151: 
	Train loss ->  0.5218 | Test loss ->  0.5765
	Train accuracy ->  0.8413 | Test accuracy ->  0.8355
Epoch 152: 
	Train loss ->  0.5196 | Test loss ->  0.5794
	Train accuracy ->  0.8418 | Test accuracy ->  0.8388
Epoch 153: 
	Train loss ->  0.5170 | Test loss ->  0.5763
	Train accuracy ->  0.8422 | Test accuracy ->  0.8355
Epoch 154: 
	Train loss ->  0.5145 | Test loss ->  0.5901
	Train accuracy ->  0.8425 | Test accuracy ->  0.8322
Epoch 155: 
	Train loss ->  0.5120 | Test loss ->  0.5818
	Train accuracy ->  0.8429 | Test accuracy ->  0.8289
Epoch 156: 
	Train loss ->  0.5091 | Test loss ->  0.5732
	Train accuracy ->  0.8449 | Test accuracy ->  0.8388
Epoch 157: 
	Train loss ->  0.5074 | Test loss ->  0.5756
	Train accuracy ->  0.8442 | Test accuracy -> 

KeyboardInterrupt: 

In [29]:
### Save model checkpoint
checkpoint = {
                'model_state_dict' : rnn.state_dict(),
                'optimizer_state_dict' : optimizer.state_dict(),
                'scheduler_state_dict' : scheduler.state_dict(),
                'epochs' : epochs,
                'history' : history
            }
torch.save(checkpoint, 'rnn-checkpoint.ckpt')