In [1]:
import os
import numpy as np

import torch
from torch import nn
from torch import optim

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [2]:
### Data files
DIR = os.getcwd().replace('notebooks', 'data')
print(os.listdir(DIR))

['Arabic.txt', 'Chinese.txt', 'Czech.txt', 'Dutch.txt', 'English.txt', 'French.txt', 'German.txt', 'Greek.txt', 'Irish.txt', 'Italian.txt', 'Japanese.txt', 'Korean.txt', 'Polish.txt', 'Portuguese.txt', 'Russian.txt', 'Scottish.txt', 'Spanish.txt', 'Vietnamese.txt']


In [3]:
class TextDataset(Dataset):
    '''
    Text Dataset object.
    '''
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.file_names = list(map(lambda x: os.path.join(self.root_dir, x), os.listdir(self.root_dir)))
        self.classes = list(map(lambda x: x.split('.')[0], os.listdir(self.root_dir)))
        
        self.int2label = dict(enumerate(self.classes))
        self.label2int = {v : k for (k, v) in self.int2label.items()}
        
        self.files = [self.read_file(f) for f in self.file_names]
        self.data, self.labels = list(), list()
        
        for file, label in self.files:
            self.data += file
            self.labels += label
        
        self.unique_characters = self.get_unique_chars()
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, ix):
        string = self.data[ix]
        return self.string2tensor(string), self.labels[ix]
    
    def string2tensor(self, string):
        string_data = torch.tensor([self.unique_characters[s] for s in string],
                                   dtype = torch.int32)
        return string_data
    
    def read_file(self, f):
        with open(f, 'rb') as file:
            contents = str(file.read(), encoding = 'utf-8').split('\n')
        
        labels = [self.label2int[f.split('.')[0].split('\\')[-1]]]*len(contents)
        
        return contents[:-1], labels[:-1]
    
    def get_unique_chars(self):
        unique_chars = sorted(list(set(''.join(list(map(lambda x: ''.join(x[0]), self.files))))))
        return {v : k for (k, v) in dict(enumerate(unique_chars)).items()}

In [4]:
data = TextDataset(DIR)

In [5]:
data[0]

(tensor([17, 40, 47, 53, 50, 57], dtype=torch.int32), 0)

In [6]:
data[100][0].shape

torch.Size([5])

In [7]:
data.unique_characters

{' ': 0,
 "'": 1,
 ',': 2,
 '-': 3,
 '/': 4,
 '1': 5,
 ':': 6,
 'A': 7,
 'B': 8,
 'C': 9,
 'D': 10,
 'E': 11,
 'F': 12,
 'G': 13,
 'H': 14,
 'I': 15,
 'J': 16,
 'K': 17,
 'L': 18,
 'M': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'Q': 23,
 'R': 24,
 'S': 25,
 'T': 26,
 'U': 27,
 'V': 28,
 'W': 29,
 'X': 30,
 'Y': 31,
 'Z': 32,
 'a': 33,
 'b': 34,
 'c': 35,
 'd': 36,
 'e': 37,
 'f': 38,
 'g': 39,
 'h': 40,
 'i': 41,
 'j': 42,
 'k': 43,
 'l': 44,
 'm': 45,
 'n': 46,
 'o': 47,
 'p': 48,
 'q': 49,
 'r': 50,
 's': 51,
 't': 52,
 'u': 53,
 'v': 54,
 'w': 55,
 'x': 56,
 'y': 57,
 'z': 58,
 '\xa0': 59,
 'Á': 60,
 'É': 61,
 'ß': 62,
 'à': 63,
 'á': 64,
 'ã': 65,
 'ä': 66,
 'ç': 67,
 'è': 68,
 'é': 69,
 'ê': 70,
 'ì': 71,
 'í': 72,
 'ñ': 73,
 'ò': 74,
 'ó': 75,
 'õ': 76,
 'ö': 77,
 'ù': 78,
 'ú': 79,
 'ü': 80,
 'ą': 81,
 'ł': 82,
 'ń': 83,
 'Ś': 84,
 'Ż': 85,
 'ż': 86}

In [8]:
test = torch.randint(0, 20, size = (32, 5, 10))

In [9]:
embedder = nn.Embedding(100, 3)

In [10]:
embedder(data[0][0])

tensor([[-0.2000,  0.6298, -0.6109],
        [-0.5753,  0.0664, -1.2443],
        [-0.0508, -1.2074, -2.2261],
        [-0.7296,  0.5501,  0.2357],
        [-0.1325,  0.6226,  2.3006],
        [-1.6157, -0.1776,  1.3665]], grad_fn=<EmbeddingBackward0>)

In [11]:
embedder(data[100][0])

tensor([[ 0.7782,  1.7758, -0.3632],
        [-0.2005, -1.2874,  0.6606],
        [ 0.5798,  0.6580,  0.9907],
        [ 0.5798,  0.6580,  0.9907],
        [-0.1711,  0.9407,  0.2519]], grad_fn=<EmbeddingBackward0>)

In [12]:
r = embedder(test)

In [13]:
r.shape

torch.Size([32, 5, 10, 3])

In [14]:
print(help(nn.RNN))

Help on class RNN in module torch.nn.modules.rnn:

class RNN(RNNBase)
 |  RNN(*args, **kwargs)
 |  
 |  Applies a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}` non-linearity to an
 |  input sequence.
 |  
 |  
 |  For each element in the input sequence, each layer computes the following
 |  function:
 |  
 |  .. math::
 |      h_t = \tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})
 |  
 |  where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
 |  the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
 |  previous layer at time `t-1` or the initial hidden state at time `0`.
 |  If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`.
 |  
 |  Args:
 |      input_size: The number of expected features in the input `x`
 |      hidden_size: The number of features in the hidden state `h`
 |      num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
 |          would mean stacking tw

In [15]:
len(data)

20074

In [16]:
data[0][0]

tensor([17, 40, 47, 53, 50, 57], dtype=torch.int32)

In [17]:
def pad_and_pack(batch, sort = False, target = True):
    data_ = []
    labels = []
    lengths = []
    
    for X, y in batch:
        data_.append(X)
        if target:
            labels.append(y)
        lengths.append(X.shape[0])
        
    X_pad = torch.nn.utils.rnn.pad_sequence(data_, batch_first = False)
    X_pack = torch.nn.utils.rnn.pack_padded_sequence(X_pad, lengths, batch_first = False, enforce_sorted = sort)
    
    return (X_pack, torch.tensor(labels, dtype = torch.int64)) if target else X_pack

In [18]:
d = [(torch.randn(torch.randint(30, 40, (1,)).item(), 16), torch.randint(0, 5, (1,)).item()) for n in range(48)]

In [19]:
for i in d:
    print(i[0].shape)

torch.Size([36, 16])
torch.Size([37, 16])
torch.Size([32, 16])
torch.Size([33, 16])
torch.Size([39, 16])
torch.Size([32, 16])
torch.Size([33, 16])
torch.Size([32, 16])
torch.Size([32, 16])
torch.Size([36, 16])
torch.Size([32, 16])
torch.Size([30, 16])
torch.Size([37, 16])
torch.Size([32, 16])
torch.Size([30, 16])
torch.Size([35, 16])
torch.Size([32, 16])
torch.Size([35, 16])
torch.Size([37, 16])
torch.Size([38, 16])
torch.Size([31, 16])
torch.Size([35, 16])
torch.Size([35, 16])
torch.Size([34, 16])
torch.Size([36, 16])
torch.Size([38, 16])
torch.Size([30, 16])
torch.Size([37, 16])
torch.Size([38, 16])
torch.Size([38, 16])
torch.Size([30, 16])
torch.Size([33, 16])
torch.Size([35, 16])
torch.Size([37, 16])
torch.Size([37, 16])
torch.Size([33, 16])
torch.Size([39, 16])
torch.Size([37, 16])
torch.Size([32, 16])
torch.Size([31, 16])
torch.Size([35, 16])
torch.Size([30, 16])
torch.Size([32, 16])
torch.Size([37, 16])
torch.Size([39, 16])
torch.Size([34, 16])
torch.Size([31, 16])
torch.Size([3

In [20]:
train_sampler, test_sampler = torch.utils.data.random_split(data, lengths = [len(data)-300, 300])

train_dl = DataLoader(train_sampler, batch_size = 8, shuffle = True, collate_fn = pad_and_pack)
test_dl = DataLoader(test_sampler, batch_size = 8, shuffle = False, collate_fn = pad_and_pack)

In [21]:
class PackedEmbedding(nn.Module):
    def __init__(self, embedding_layer):
        super(PackedEmbedding, self).__init__()
        
        self.embedding = embedding_layer
        
    def forward(self, x):
        if type(x) == torch.nn.utils.rnn.PackedSequence:
            unpacked_sequence, lengths = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first = True)
            y = self.embedding(unpacked_sequence)
            y = torch.nn.utils.rnn.pack_padded_sequence(y, lengths, batch_first = True, enforce_sorted = False)
        else:
            y = self.embedding(x)
        
        return y

In [22]:
class RNNNetwork(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_class, feature_size, p = 0.3):
        super(RNNNetwork, self).__init__()
        
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_class = num_class
        self.feature_size = feature_size
        self.p = p
        
        self.embedder = PackedEmbedding(nn.Embedding(self.vocab_size, self.feature_size))
        self.rnn = nn.RNN(input_size = self.feature_size, hidden_size = self.hidden_size, batch_first = True)
        self.fc = nn.Linear(self.hidden_size, self.num_class)
        self.dropout = nn.Dropout(p = self.p)
        
    def forward(self, x):
        x = self.embedder(x)
        output, state = self.rnn(x)
        if type(state) == tuple:
            state = state[0]
        state = state.permute(1, 0, 2).squeeze()
        return torch.log_softmax(self.fc(state), dim = -1)

In [23]:
### Instantiate network
rnn = RNNNetwork(vocab_size = len(data.unique_characters), hidden_size = 256,
                 num_class = len(data.classes), p = 0.3, feature_size = 64)

In [24]:
print(rnn)

RNNNetwork(
  (embedder): PackedEmbedding(
    (embedding): Embedding(87, 64)
  )
  (rnn): RNN(64, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [25]:
from collections import Counter
c = Counter(data.labels)

In [26]:
c

Counter({0: 2000,
         1: 268,
         2: 519,
         3: 297,
         4: 3668,
         5: 277,
         6: 724,
         7: 203,
         8: 232,
         9: 709,
         10: 991,
         11: 94,
         12: 139,
         13: 74,
         14: 9408,
         15: 100,
         16: 298,
         17: 73})

In [27]:
keys = sorted(dict(c).keys())

In [28]:
keys

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [29]:
w_ = [c[k] for k in keys]
weights = sum(w_)/torch.tensor(w_)

In [30]:
weights

tensor([ 10.0370,  74.9030,  38.6782,  67.5892,   5.4727,  72.4693,  27.7265,
         98.8867,  86.5259,  28.3131,  20.2563, 213.5532, 144.4173, 271.2703,
          2.1337, 200.7400,  67.3624, 274.9863])

In [31]:
torch.tensor(w_)/sum(w_)

tensor([0.0996, 0.0134, 0.0259, 0.0148, 0.1827, 0.0138, 0.0361, 0.0101, 0.0116,
        0.0353, 0.0494, 0.0047, 0.0069, 0.0037, 0.4687, 0.0050, 0.0148, 0.0036])

In [32]:
epochs = 50
lr = 5e-7
criterion = nn.NLLLoss(weight = weights)

In [33]:
### Load checkpoint
CKPT_DIR = os.path.join(os.getcwd().replace('notebooks', 'artefacts'), 'rnn-checkpoint.ckpt')
checkpoint = torch.load(CKPT_DIR)

In [34]:
rnn.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [35]:
optimizer = optim.Adam([{'params': rnn.embedder.parameters(), 'lr': lr/2},
                       {'params': rnn.rnn.parameters(), 'lr': lr},
                       {'params': rnn.fc.parameters(), 'lr': lr/2}], lr = lr)

scheduler = optim.lr_scheduler.CyclicLR(optimizer, cycle_momentum = False,
                                        base_lr = lr/10, max_lr = lr*10)

In [36]:
train_loss = list()
test_loss = list()
train_acc = list()
test_acc = list()

history = dict()

for epoch in range(1, epochs +1):
    rnn.train()
    for X, y in train_dl:
        y_pred = rnn(X)
        
        loss = criterion(y_pred, y)
        acc = accuracy_score(y, torch.exp(y_pred).max(dim = -1).indices)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())
        train_acc.append(acc)
    
    scheduler.step()
    rnn.eval()
    with torch.no_grad():
        for X_, y_ in test_dl:
            y_p = rnn(X_)
            
            acc = accuracy_score(y_, torch.exp(y_p).max(dim = -1).indices)
            loss = criterion(y_p, y_)
            
            test_loss.append(loss.item())
            test_acc.append(acc)
    
    history[epoch] = dict()
    history[epoch]['train_loss'] = sum(train_loss)/len(train_loss)
    history[epoch]['train_acc'] = sum(train_acc)/len(train_acc)
    
    history[epoch]['test_loss'] = sum(test_loss)/len(test_loss)
    history[epoch]['test_acc'] = sum(test_acc)/len(test_acc)
    
    print(f"Epoch {epoch:03d}/{epochs}:",
          f"\n\tTrain loss -> {history[epoch]['train_loss']: .4f} | Test loss -> {history[epoch]['test_loss']: .4f}")
    print(f"\tTrain accuracy -> {history[epoch]['train_acc']: .4f} | Test accuracy -> {history[epoch]['test_acc']: .4f}")
    
    train_loss.clear()
    test_loss.clear()
    train_acc.clear()
    test_acc.clear()
    

Epoch 001/50: 
	Train loss ->  0.7592 | Test loss ->  0.7278
	Train accuracy ->  0.8064 | Test accuracy ->  0.8224
Epoch 002/50: 
	Train loss ->  0.7663 | Test loss ->  0.7279
	Train accuracy ->  0.8068 | Test accuracy ->  0.8257
Epoch 003/50: 
	Train loss ->  0.7597 | Test loss ->  0.7282
	Train accuracy ->  0.8067 | Test accuracy ->  0.8257
Epoch 004/50: 
	Train loss ->  0.7577 | Test loss ->  0.7283
	Train accuracy ->  0.8067 | Test accuracy ->  0.8257
Epoch 005/50: 
	Train loss ->  0.7566 | Test loss ->  0.7285
	Train accuracy ->  0.8068 | Test accuracy ->  0.8224
Epoch 006/50: 
	Train loss ->  0.7554 | Test loss ->  0.7287
	Train accuracy ->  0.8066 | Test accuracy ->  0.8257
Epoch 007/50: 
	Train loss ->  0.7556 | Test loss ->  0.7288
	Train accuracy ->  0.8062 | Test accuracy ->  0.8224
Epoch 008/50: 
	Train loss ->  0.7588 | Test loss ->  0.7290
	Train accuracy ->  0.8061 | Test accuracy ->  0.8257
Epoch 009/50: 
	Train loss ->  0.7546 | Test loss ->  0.7290
	Train accuracy -> 

In [37]:
### Save model checkpoint
checkpoint = {
                'model_state_dict' : rnn.state_dict(),
                'optimizer_state_dict' : optimizer.state_dict(),
                'scheduler_state_dict' : scheduler.state_dict(),
                'epochs' : epochs,
                'history' : history
            }

with open(os.path.join(os.getcwd().replace('notebooks', 'artefacts'),
                       'rnn-checkpoint-retrained-(1).ckpt'), 'wb') as file:
    torch.save(checkpoint, file)

In [39]:
def make_predictions(string, model):
    if type(string) == str or len(string) == 1:
        string_tensor = data.string2tensor(string)
        X_ = string_tensor.unsqueeze(0)
    else:
        string_tensor = [data.string2tensor(s) for s in string]
        lengths = [len(s) for s in string]
        X_ = pad_and_pack(string, sort = True)
    
    pred = torch.exp(model(X_))
    pred = pred.max(dim = -1).indices.item()
    
    return data.int2label[pred] if type(string) == str or len(string) == 1 else dict(zip(string, pred))

In [92]:
def make_predictions(string, model):
    if type(string) == str or len(string) == 1:
        string_tensor = data.string2tensor(string)
        X_ = string_tensor.unsqueeze(0)
        pred = torch.exp(model(X_))
        pred = pred.max(dim = -1).indices.item()
        pred = dict(string = data.int2label[pred])
    else:
        string_tensor = [data.string2tensor(s) for s in string]
        lengths = [len(s) for s in string]
        X_ = torch.nn.utils.rnn.pad_sequence(string_tensor, batch_first = True)
        X_ = torch.nn.utils.rnn.pack_padded_sequence(X_, lengths = lengths, enforce_sorted = False, batch_first = True)
        pred = torch.exp(model(X_))
        pred = pred.max(dim = -1).indices
        pred = [data.int2label[p.item()] for p in pred]
        pred = dict(zip(string, pred))
    
    
    return pred

In [103]:
data.label2int

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [93]:
make_predictions('Jose', rnn)

{'string': 'English'}

In [136]:
make_predictions(['Nobunaga', 'sakai', 'hasim'], rnn)

{'Nobunaga': 'Japanese', 'sakai': 'Japanese', 'hasim': 'Arabic'}