In [1]:
import os
import numpy as np

import torch
from torch import nn
from torch import optim

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [2]:
### Data files
DIR = os.getcwd().replace('notebooks', 'data')
print(os.listdir(DIR))

['Arabic.txt', 'Chinese.txt', 'Czech.txt', 'Dutch.txt', 'English.txt', 'French.txt', 'German.txt', 'Greek.txt', 'Irish.txt', 'Italian.txt', 'Japanese.txt', 'Korean.txt', 'Polish.txt', 'Portuguese.txt', 'Russian.txt', 'Scottish.txt', 'Spanish.txt', 'Vietnamese.txt']


In [3]:
class TextDataset(Dataset):
    '''
    Text Dataset object.
    '''
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.file_names = list(map(lambda x: os.path.join(self.root_dir, x), os.listdir(self.root_dir)))
        self.classes = list(map(lambda x: x.split('.')[0], os.listdir(self.root_dir)))
        
        self.int2label = dict(enumerate(self.classes))
        self.label2int = {v : k for (k, v) in self.int2label.items()}
        
        self.files = [self.read_file(f) for f in self.file_names]
        self.data, self.labels = list(), list()
        
        for file, label in self.files:
            self.data += file
            self.labels += label
        
        self.unique_characters = self.get_unique_chars()
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, ix):
        string = self.data[ix]
        return self.string2tensor(string), self.labels[ix]
    
    def string2tensor(self, string):
        string_data = torch.tensor([self.unique_characters[s] for s in string],
                                   dtype = torch.int32)
        return string_data
    
    def read_file(self, f):
        with open(f, 'rb') as file:
            contents = str(file.read(), encoding = 'utf-8').split('\n')
        
        labels = [self.label2int[f.split('.')[0].split('\\')[-1]]]*len(contents)
        
        return contents[:-1], labels[:-1]
    
    def get_unique_chars(self):
        unique_chars = sorted(list(set(''.join(list(map(lambda x: ''.join(x[0]), self.files))))))
        return {v : k for (k, v) in dict(enumerate(unique_chars)).items()}

In [4]:
data = TextDataset(DIR)

In [5]:
data[0]

(tensor([17, 40, 47, 53, 50, 57], dtype=torch.int32), 0)

In [6]:
data[100][0].shape

torch.Size([5])

In [8]:
data.unique_characters

{' ': 0,
 "'": 1,
 ',': 2,
 '-': 3,
 '/': 4,
 '1': 5,
 ':': 6,
 'A': 7,
 'B': 8,
 'C': 9,
 'D': 10,
 'E': 11,
 'F': 12,
 'G': 13,
 'H': 14,
 'I': 15,
 'J': 16,
 'K': 17,
 'L': 18,
 'M': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'Q': 23,
 'R': 24,
 'S': 25,
 'T': 26,
 'U': 27,
 'V': 28,
 'W': 29,
 'X': 30,
 'Y': 31,
 'Z': 32,
 'a': 33,
 'b': 34,
 'c': 35,
 'd': 36,
 'e': 37,
 'f': 38,
 'g': 39,
 'h': 40,
 'i': 41,
 'j': 42,
 'k': 43,
 'l': 44,
 'm': 45,
 'n': 46,
 'o': 47,
 'p': 48,
 'q': 49,
 'r': 50,
 's': 51,
 't': 52,
 'u': 53,
 'v': 54,
 'w': 55,
 'x': 56,
 'y': 57,
 'z': 58,
 '\xa0': 59,
 'Á': 60,
 'É': 61,
 'ß': 62,
 'à': 63,
 'á': 64,
 'ã': 65,
 'ä': 66,
 'ç': 67,
 'è': 68,
 'é': 69,
 'ê': 70,
 'ì': 71,
 'í': 72,
 'ñ': 73,
 'ò': 74,
 'ó': 75,
 'õ': 76,
 'ö': 77,
 'ù': 78,
 'ú': 79,
 'ü': 80,
 'ą': 81,
 'ł': 82,
 'ń': 83,
 'Ś': 84,
 'Ż': 85,
 'ż': 86}

In [9]:
test = torch.randint(0, 20, size = (32, 5, 10))

In [10]:
embedder = nn.Embedding(100, 3)

In [11]:
embedder(data[0][0])

tensor([[ 0.6601,  0.7327,  0.3939],
        [ 0.7783,  1.2286,  0.0726],
        [ 0.8760,  1.0005,  1.3234],
        [ 2.0991,  0.3014,  0.1014],
        [-0.5784,  0.7654,  1.0474],
        [ 1.0648, -0.1911, -2.0353]], grad_fn=<EmbeddingBackward0>)

In [12]:
embedder(data[100][0])

tensor([[-1.6444,  0.8812, -1.6076],
        [-0.0087, -0.6576, -0.9077],
        [ 0.6525, -0.6735,  1.4135],
        [ 0.6525, -0.6735,  1.4135],
        [-0.2986, -0.9151, -0.1171]], grad_fn=<EmbeddingBackward0>)

In [13]:
r = embedder(test)

In [14]:
r.shape

torch.Size([32, 5, 10, 3])

In [15]:
print(help(nn.RNN))

Help on class RNN in module torch.nn.modules.rnn:

class RNN(RNNBase)
 |  RNN(*args, **kwargs)
 |  
 |  Applies a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}` non-linearity to an
 |  input sequence.
 |  
 |  
 |  For each element in the input sequence, each layer computes the following
 |  function:
 |  
 |  .. math::
 |      h_t = \tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})
 |  
 |  where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
 |  the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
 |  previous layer at time `t-1` or the initial hidden state at time `0`.
 |  If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`.
 |  
 |  Args:
 |      input_size: The number of expected features in the input `x`
 |      hidden_size: The number of features in the hidden state `h`
 |      num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
 |          would mean stacking tw

In [16]:
len(data)

20074

In [17]:
data[0][0]

tensor([17, 40, 47, 53, 50, 57], dtype=torch.int32)

In [18]:
def pad_and_pack(batch):
    data_ = []
    labels = []
    lengths = []
    
    for X, y in batch:
        data_.append(X)
        labels.append(y)
        lengths.append(X.shape[0])
        
    X_pad = torch.nn.utils.rnn.pad_sequence(data_, batch_first = False)
    X_pack = torch.nn.utils.rnn.pack_padded_sequence(X_pad, lengths, batch_first = False, enforce_sorted = False)
    
    return X_pack, torch.tensor(labels, dtype = torch.int64)

In [19]:
d = [(torch.randn(torch.randint(30, 40, (1,)).item(), 16), torch.randint(0, 5, (1,)).item()) for n in range(48)]

In [20]:
for i in d:
    print(i[0].shape)

torch.Size([37, 16])
torch.Size([31, 16])
torch.Size([33, 16])
torch.Size([39, 16])
torch.Size([34, 16])
torch.Size([31, 16])
torch.Size([35, 16])
torch.Size([39, 16])
torch.Size([30, 16])
torch.Size([38, 16])
torch.Size([39, 16])
torch.Size([36, 16])
torch.Size([30, 16])
torch.Size([33, 16])
torch.Size([35, 16])
torch.Size([31, 16])
torch.Size([35, 16])
torch.Size([38, 16])
torch.Size([35, 16])
torch.Size([36, 16])
torch.Size([32, 16])
torch.Size([30, 16])
torch.Size([30, 16])
torch.Size([34, 16])
torch.Size([34, 16])
torch.Size([30, 16])
torch.Size([37, 16])
torch.Size([39, 16])
torch.Size([33, 16])
torch.Size([36, 16])
torch.Size([33, 16])
torch.Size([38, 16])
torch.Size([35, 16])
torch.Size([30, 16])
torch.Size([33, 16])
torch.Size([31, 16])
torch.Size([31, 16])
torch.Size([34, 16])
torch.Size([30, 16])
torch.Size([33, 16])
torch.Size([34, 16])
torch.Size([38, 16])
torch.Size([38, 16])
torch.Size([37, 16])
torch.Size([31, 16])
torch.Size([35, 16])
torch.Size([32, 16])
torch.Size([3

In [21]:
a, b = pad_and_pack(d)

In [22]:
train_sampler, test_sampler = torch.utils.data.random_split(data, lengths = [len(data)-300, 300])

train_dl = DataLoader(train_sampler, batch_size = 8, shuffle = True, collate_fn = pad_and_pack)
test_dl = DataLoader(test_sampler, batch_size = 8, shuffle = False, collate_fn = pad_and_pack)

In [23]:
class PackedEmbedding(nn.Module):
    def __init__(self, embedding_layer):
        super(PackedEmbedding, self).__init__()
        self.embedding = embedding_layer
        
    def forward(self, x):
        if type(x) == torch.nn.utils.rnn.PackedSequence:
            unpacked_sequence, lengths = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first = True)
            y = self.embedding(unpacked_sequence)
            y = torch.nn.utils.rnn.pack_padded_sequence(y, lengths, batch_first = True, enforce_sorted = False)
        else:
            y = self.embedding(x)
        
        return y

In [24]:
class RNNNetwork(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_class, feature_size, p = 0.3):
        super(RNNNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_class = num_class
        self.feature_size = feature_size
        self.p = p
        
        self.embedder = PackedEmbedding(nn.Embedding(self.vocab_size, self.feature_size))
        self.rnn = nn.RNN(input_size = self.feature_size, hidden_size = self.hidden_size, batch_first = True)
        self.fc = nn.Linear(self.hidden_size, self.num_class)
        self.dropout = nn.Dropout(p = self.p)
        
    def forward(self, x):
        x = self.embedder(x)
        output, state = self.rnn(x)
        if type(state) == tuple:
            state = state[0]
        state = state.permute(1, 0, 2).squeeze()
        return torch.log_softmax(self.fc(state), dim = -1)

In [25]:
### Instantiate network
rnn = RNNNetwork(vocab_size = len(data.unique_characters), hidden_size = 256,
                 num_class = len(data.classes), p = 0.3, feature_size = 64)

In [26]:
print(rnn)

RNNNetwork(
  (embedder): PackedEmbedding(
    (embedding): Embedding(87, 64)
  )
  (rnn): RNN(64, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [27]:
from collections import Counter

In [28]:
c = Counter(data.labels)

In [29]:
keys = sorted(dict(c).keys())

In [30]:
keys

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [31]:
w_ = [c[k] for k in keys]
weights = sum(w_)/torch.tensor(w_)

In [32]:
weights

tensor([ 10.0370,  74.9030,  38.6782,  67.5892,   5.4727,  72.4693,  27.7265,
         98.8867,  86.5259,  28.3131,  20.2563, 213.5532, 144.4173, 271.2703,
          2.1337, 200.7400,  67.3624, 274.9863])

In [33]:
torch.tensor(w_)/sum(w_)

tensor([0.0996, 0.0134, 0.0259, 0.0148, 0.1827, 0.0138, 0.0361, 0.0101, 0.0116,
        0.0353, 0.0494, 0.0047, 0.0069, 0.0037, 0.4687, 0.0050, 0.0148, 0.0036])

In [34]:
epochs = 200
lr = 1e-5
criterion = nn.NLLLoss(weight = weights)

In [35]:
### Load checkpoint
CKPT_DIR = os.path.join(os.getcwd().replace('notebooks', 'artefacts'), 'rnn-checkpoint-retrained(2).ckpt')
checkpoint = torch.load(CKPT_DIR)

rnn.load_state_dict(checkpoint['model_state_dict'])

In [36]:
optimizer = optim.Adam([{'params': rnn.embedder.parameters(), 'lr': lr/2},
                        {'params': rnn.rnn.parameters(), 'lr': lr},
                        {'params': rnn.fc.parameters(), 'lr': lr/2}], lr = lr)

scheduler = optim.lr_scheduler.CyclicLR(optimizer, cycle_momentum = False,
                                        base_lr = lr/10, max_lr = lr*10)

optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

scheduler.get_last_lr()

optimizer = optim.SGD([{'params': rnn.embedder.parameters(), 'lr': lr/2},
                       {'params': rnn.rnn.parameters(), 'lr': lr},
                       {'params': rnn.fc.parameters(), 'lr': lr/2}], weight_decay = 0.3,
                      lr = lr)

scheduler = optim.lr_scheduler.CyclicLR(optimizer, cycle_momentum = False,
                                        base_lr = lr/10, max_lr = lr*10)

In [37]:
train_loss = list()
test_loss = list()
train_acc = list()
test_acc = list()

history = dict()

for epoch in range(1, epochs +1):
    rnn.train()
    for X, y in train_dl:
        y_pred = rnn(X)
        
        loss = criterion(y_pred, y)
        acc = accuracy_score(y, torch.exp(y_pred).max(dim = -1).indices)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())
        train_acc.append(acc)
    
    scheduler.step()
    rnn.eval()
    with torch.no_grad():
        for X_, y_ in test_dl:
            y_p = rnn(X_)
            
            acc = accuracy_score(y_, torch.exp(y_p).max(dim = -1).indices)
            loss = criterion(y_p, y_)
            
            test_loss.append(loss.item())
            test_acc.append(acc)
    
    history[epoch] = dict()
    history[epoch]['train_loss'] = sum(train_loss)/len(train_loss)
    history[epoch]['train_acc'] = sum(train_acc)/len(train_acc)
    
    history[epoch]['test_loss'] = sum(test_loss)/len(test_loss)
    history[epoch]['test_acc'] = sum(test_acc)/len(test_acc)
    
    print(f"Epoch {epoch:03d}/{epochs}:",
          f"\n\tTrain loss -> {history[epoch]['train_loss']: .4f} | Test loss -> {history[epoch]['test_loss']: .4f}")
    print(f"\tTrain accuracy -> {history[epoch]['train_acc']: .4f} | Test accuracy -> {history[epoch]['test_acc']: .4f}")
    
    train_loss.clear()
    test_loss.clear()
    train_acc.clear()
    test_acc.clear()
    

Epoch 001/200: 
	Train loss ->  2.9057 | Test loss ->  2.8904
	Train accuracy ->  0.2198 | Test accuracy ->  0.2632
Epoch 002/200: 
	Train loss ->  2.8615 | Test loss ->  2.8442
	Train accuracy ->  0.2770 | Test accuracy ->  0.2993
Epoch 003/200: 
	Train loss ->  2.8162 | Test loss ->  2.7971
	Train accuracy ->  0.3502 | Test accuracy ->  0.4342
Epoch 004/200: 
	Train loss ->  2.7745 | Test loss ->  2.7498
	Train accuracy ->  0.4532 | Test accuracy ->  0.5066
Epoch 005/200: 
	Train loss ->  2.7317 | Test loss ->  2.7034
	Train accuracy ->  0.4927 | Test accuracy ->  0.5263
Epoch 006/200: 
	Train loss ->  2.6846 | Test loss ->  2.6557
	Train accuracy ->  0.5072 | Test accuracy ->  0.5033
Epoch 007/200: 
	Train loss ->  2.6375 | Test loss ->  2.6078
	Train accuracy ->  0.5112 | Test accuracy ->  0.5132
Epoch 008/200: 
	Train loss ->  2.5925 | Test loss ->  2.5595
	Train accuracy ->  0.5143 | Test accuracy ->  0.5263
Epoch 009/200: 
	Train loss ->  2.5487 | Test loss ->  2.5133
	Train acc

Epoch 072/200: 
	Train loss ->  1.4259 | Test loss ->  1.2734
	Train accuracy ->  0.6949 | Test accuracy ->  0.7434
Epoch 073/200: 
	Train loss ->  1.4239 | Test loss ->  1.2639
	Train accuracy ->  0.6961 | Test accuracy ->  0.7467
Epoch 074/200: 
	Train loss ->  1.4267 | Test loss ->  1.2545
	Train accuracy ->  0.6960 | Test accuracy ->  0.7434
Epoch 075/200: 
	Train loss ->  1.4051 | Test loss ->  1.2460
	Train accuracy ->  0.6979 | Test accuracy ->  0.7500
Epoch 076/200: 
	Train loss ->  1.3943 | Test loss ->  1.2392
	Train accuracy ->  0.7004 | Test accuracy ->  0.7434
Epoch 077/200: 
	Train loss ->  1.3894 | Test loss ->  1.2312
	Train accuracy ->  0.7021 | Test accuracy ->  0.7467
Epoch 078/200: 
	Train loss ->  1.3831 | Test loss ->  1.2258
	Train accuracy ->  0.7045 | Test accuracy ->  0.7434
Epoch 079/200: 
	Train loss ->  1.3782 | Test loss ->  1.2177
	Train accuracy ->  0.7043 | Test accuracy ->  0.7500
Epoch 080/200: 
	Train loss ->  1.3709 | Test loss ->  1.2130
	Train acc

Epoch 143/200: 
	Train loss ->  1.0000 | Test loss ->  0.9217
	Train accuracy ->  0.7678 | Test accuracy ->  0.7961
Epoch 144/200: 
	Train loss ->  0.9913 | Test loss ->  0.9240
	Train accuracy ->  0.7682 | Test accuracy ->  0.7961
Epoch 145/200: 
	Train loss ->  0.9850 | Test loss ->  0.9117
	Train accuracy ->  0.7704 | Test accuracy ->  0.7961
Epoch 146/200: 
	Train loss ->  0.9778 | Test loss ->  0.9044
	Train accuracy ->  0.7729 | Test accuracy ->  0.7928
Epoch 147/200: 
	Train loss ->  0.9807 | Test loss ->  0.8984
	Train accuracy ->  0.7714 | Test accuracy ->  0.7961
Epoch 148/200: 
	Train loss ->  0.9712 | Test loss ->  0.8988
	Train accuracy ->  0.7719 | Test accuracy ->  0.7993
Epoch 149/200: 
	Train loss ->  0.9676 | Test loss ->  0.9102
	Train accuracy ->  0.7727 | Test accuracy ->  0.8026
Epoch 150/200: 
	Train loss ->  0.9555 | Test loss ->  0.9030
	Train accuracy ->  0.7755 | Test accuracy ->  0.8026
Epoch 151/200: 
	Train loss ->  0.9641 | Test loss ->  0.9026
	Train acc

KeyboardInterrupt: 

In [38]:
### Save model checkpoint
checkpoint = {
                'model_state_dict' : rnn.state_dict(),
                'optimizer_state_dict' : optimizer.state_dict(),
                'scheduler_state_dict' : scheduler.state_dict(),
                'epochs' : epochs,
                'history' : history
            }

with open(os.path.join(os.getcwd().replace('notebooks', 'artefacts'),
                       'rnn-checkpoint.ckpt'), 'wb') as file:
    torch.save(checkpoint, file)

In [39]:
def make_predictions(string, model):
    string_tensor = data.string2tensor(string)
    string_tensor = string_tensor.unsqueeze(0)
    
    pred = torch.exp(model(string_tensor))
    pred = pred.max(dim = -1).indices.item()
    
    return data.int2label[pred]

In [49]:
make_predictions('Jose', rnn)

'English'