In [49]:
import codecs
import copy
import io

import numpy as np
import torch
import torch.nn.functional as F
from prettytable import PrettyTable
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm

Reading the corpus and splitting its lines into training and evaluation sets

In [50]:
# Load the corpus: one lower-cased, whitespace-stripped entry per non-empty line.
path = "./data/shekspear.txt"
with io.open(path, 'r', encoding='utf-8') as corpus:
    normalised = (raw.strip().lower() for raw in corpus)
    sentences = [text for text in normalised if text]

In [51]:
# Hold out the final 1000 lines for evaluation; everything before them is training data.
train_sentences, eval_sentences = sentences[:-1000], sentences[-1000:]

The function below takes a line and converts each character in it to its Unicode integer (code point). 
EOS_int represents the end of the sentence; it defaults to 2 and is appended after the last character.

In [52]:
def line_to_tensor(line, EOS_int=2, BOS_int = 1):
    """Encode a line of text as a list of integer character codes.

    Args:
        line (str): A single line of text.
        EOS_int (int, optional): End-of-sentence marker appended after the
            last character. Defaults to 2.
        BOS_int (int, optional): Accepted for symmetry with `EOS_int` but not
            used by this function. Defaults to 1.

    Returns:
        list: `ord(char)` for every character of `line`, followed by `EOS_int`.
    """
    codes = [ord(symbol) for symbol in line]
    return codes + [EOS_int]

In [53]:
# Sanity check: encode a short string and show its character codes followed by EOS.
ali_codes = line_to_tensor('ali')
print(f"Tensor represntation of character for ALI is equal to: {ali_codes}")

Tensor represntation of character for ALI is equal to: [97, 108, 105, 2]


Dataset class that yields sentences padded to max_length, together with a mask tensor identifying which cells represent a character and which are padding cells. 
EOS = 2
BOS = 1 # added in the __getitem__ function


In [63]:
class textBatch(Dataset):
    """Character-level dataset of fixed-length, zero-padded lines.

    Every sentence is encoded as its character codes plus an EOS marker (2)
    and right-padded with 0 up to `max_length`. Sentences whose encoded form
    (characters + EOS) would exceed `max_length` are skipped: they cannot be
    padded to the common length, and samples of unequal length cannot be
    stacked into a batch by the default DataLoader collation.

    Each item is a triple `(input_sen, paded_tensor, mask)`:
        input_sen    float tensor of shape (1, max_length): the target shifted
                     right by one position with BOS (1) in the first cell.
        paded_tensor integer tensor of shape (1, max_length): the target codes.
        mask         integer tensor of shape (max_length,): 1 where the target
                     code is greater than 0, else 0.
    """

    def __init__(self, sentences, max_length):
        """Encode and pad every sentence that fits into `max_length` cells.

        Args:
            sentences (iterable of str): Lines of text.
            max_length (int): Length of every encoded sample, EOS included.
        """
        self.line_tensor = []
        for line in sentences:
            tensor = self.line_to_tensor(line)
            if len(tensor) > max_length:
                # Too long to fit: keeping it would yield a sample of a
                # different length and break batching.
                continue
            paded_tensor = tensor + [0] * (max_length - len(tensor))
            mask = [1 if el > 0 else 0 for el in paded_tensor]
            self.line_tensor.append((paded_tensor, mask))

    def __len__(self):
        """Number of sentences kept in the dataset."""
        return len(self.line_tensor)

    def __getitem__(self, index):
        """Return `(input_sen, paded_tensor, mask)` for the sample at `index`."""
        paded_tensor, mask = self.line_tensor[index]
        paded_tensor = torch.tensor(paded_tensor).view(1, -1)
        # The model input is the target shifted right by one, starting with BOS.
        # torch.zeros gives a float tensor; callers cast with .long() before use.
        input_sen = torch.zeros(paded_tensor.size())
        input_sen[0, 0] = 1
        input_sen[0, 1:] = paded_tensor[0, :-1]
        return input_sen, paded_tensor, torch.tensor(mask)

    def line_to_tensor(self, line, EOS_int=2):
        """Turn a line of text into a list of character codes ending in EOS.

        Args:
            line (str): A single line of text.
            EOS_int (int, optional): End-of-sentence integer. Defaults to 2.

        Returns:
            list: `ord(char)` for each character in `line`, then `EOS_int`.
        """
        tensor = [ord(char) for char in line]
        tensor.append(EOS_int)
        return tensor

In [64]:
# Smoke test: batch three toy sentences and show the first (input, target) pair.
data = textBatch(['hey man', 'hello dear', 'honey'], 15)
train_loader = DataLoader(data, batch_size=2, shuffle=True)
x, y, z = next(iter(train_loader))
print(x)
print(y)

tensor([[[  1., 104., 101., 108., 108., 111.,  32., 100., 101.,  97., 114.,
            2.,   0.,   0.,   0.]],

        [[  1., 104., 111., 110., 101., 121.,   2.,   0.,   0.,   0.,   0.,
            0.,   0.,   0.,   0.]]])
tensor([[[104, 101, 108, 108, 111,  32, 100, 101,  97, 114,   2,   0,   0,   0,
            0]],

        [[104, 111, 110, 101, 121,   2,   0,   0,   0,   0,   0,   0,   0,   0,
            0]]])


The architecture of the character-level GRU language model

In [65]:
class GRULM(nn.Module):
    """Character-level GRU language model: embedding -> GRU -> linear.

    The model is driven one time step at a time: `forward` receives the
    character ids for a single position of every sequence in the batch plus
    the running hidden state, and returns the next-character logits together
    with the updated hidden state.
    """

    def __init__(self, dim, n_layer,h_units, vocab_size, batch_size, gpu = 0, ):
        """Build the layers.

        Args:
            dim (int): Embedding size.
            n_layer (int): Number of stacked GRU layers.
            h_units (int): GRU hidden size.
            vocab_size (int): Number of distinct character ids; also the
                size of the output layer.
            batch_size (int): Stored on the instance; not read by any method
                of this class.
            gpu (int, optional): Accepted but not used.
        """
        super(GRULM, self).__init__()
        self.dim = dim
        self.num_layers = n_layer
        self.vocab_size = vocab_size
        self.hidden_units = h_units
        self.batch_size = batch_size
        self.embed = nn.Embedding(self.vocab_size, self.dim)
        self.GRU = nn.GRU(self.dim, self.hidden_units, self.num_layers, batch_first=True)
        self.lin = nn.Linear(self.hidden_units, self.vocab_size)

    def forward(self, x, hidden):
        """Run one time step.

        Args:
            x: Integer tensor of character ids, one per sequence in the batch.
            hidden: Hidden state as returned by `init_hidden` or by a
                previous call.

        Returns:
            tuple: `(logits, hidden)`; `logits` keeps the length-1 time axis
            that is inserted before the GRU call.
        """
        emb = self.embed(x)
        # The GRU is batch_first, so add a time axis of length 1.
        output, hidden1 = self.GRU(emb.unsqueeze(1), hidden)
        out1 = self.lin(output)
        return out1, hidden1

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state on the same device as the model."""
        device = self.lin.weight.device
        return torch.zeros(self.num_layers, batch_size, self.hidden_units, device=device)

    def evaluate(self, dev_loader, criterion=None):
        """Compute the mean per-step loss of this model over `dev_loader`.

        The model is switched to evaluation mode and left in it. The loss is
        evaluated at every position, padded cells included; the mask yielded
        by the loader is not used.

        Args:
            dev_loader: Iterable of `(data, target, masked)` batches as
                produced by a DataLoader over `textBatch`.
            criterion (callable, optional): Loss taking `(logits, targets)`.
                Defaults to `nn.CrossEntropyLoss()`.

        Returns:
            float: Average over batches of (summed step loss / number of steps).

        Raises:
            ValueError: If `dev_loader` yields no usable batch.
        """
        if criterion is None:
            criterion = nn.CrossEntropyLoss()
        device = self.lin.weight.device

        self.train(mode=False)
        losses = []
        with torch.no_grad():
            for data, target, masked in dev_loader:
                # Drop only the singleton axis added by the dataset. A bare
                # squeeze() would also collapse a batch of one.
                data = data.squeeze(1).to(device)
                target = target.squeeze(1).to(device)

                n_steps = data.size(1)
                if n_steps == 0:
                    continue

                hidden = self.init_hidden(data.size(0))
                batch_loss = 0
                for charid in range(n_steps):
                    # Use this instance, not whatever a global `model` points to.
                    output, hidden = self(data[:, charid].long(), hidden)
                    target1 = target[:, charid].long()
                    batch_loss += criterion(output.squeeze(1), target1)

                losses.append(batch_loss.item() / n_steps)

        if not losses:
            raise ValueError("dev_loader produced no batches to evaluate")
        return sum(losses) / len(losses)

Batch size and maximum line length, followed by computing the number of training steps (batches) per epoch

In [66]:
# Length every encoded line is padded to (passed to textBatch as max_length).
max_length = 64
# Number of lines per mini-batch handed to the DataLoaders.
batch_size = 32

In [67]:
def num_lin_in_training(sentences, batch_size, max_len=None):
    """Count the sentences that are strictly shorter than the length limit.

    Args:
        sentences (iterable of str): Lines of text.
        batch_size (int): Accepted for backward compatibility; not used.
        max_len (int, optional): Length limit. When omitted, the
            module-level `max_length` is used, as before.

    Returns:
        int: Number of sentences with `len(line) < max_len`.
    """
    if max_len is None:
        # Backward-compatible fallback to the notebook-level setting.
        max_len = max_length
    return sum(1 for line in sentences if len(line) < max_len)

In [68]:
# Report how many training lines fit the length limit and how many full
# batches that makes per epoch.
num_used_lines = num_lin_in_training(train_sentences, batch_size)
steps_per_epoch = num_used_lines // batch_size
print('Number of used lines from the dataset:', num_used_lines)
print('Batch size (a power of 2):', int(batch_size))
print('Number of steps to cover one epoch:', steps_per_epoch)

Number of used lines from the dataset: 31777
Batch size (a power of 2): 32
Number of steps to cover one epoch: 993


Initializing the train and eval loaders

In [72]:
# Wrap the training and held-out sentences as padded datasets, then batch them.
# Only the training loader shuffles; the evaluation loader keeps file order.
data = textBatch(train_sentences, max_length)
eval_data = textBatch(eval_sentences, max_length)
train_loader = DataLoader(data, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)

The train function

In [73]:
def train_gru(model,trainloader, optim, criterion=None):
    """Train `model` for one pass over `trainloader`.

    For every batch the model is stepped through the sequence one position
    at a time, the per-step losses are summed, and a single optimizer step
    is taken on that sum. The loss is evaluated at every position, padded
    cells included; the mask yielded by the loader is not used.

    Args:
        model: Module exposing `init_hidden(batch_size)` and a
            `model(char_ids, hidden) -> (logits, hidden)` call.
        trainloader: Iterable of `(data, target, masked)` batches as
            produced by a DataLoader over `textBatch`.
        optim: Optimizer over the parameters of `model`.
        criterion (callable, optional): Loss taking `(logits, targets)`.
            Defaults to `nn.CrossEntropyLoss()`.

    Returns:
        tuple: `(losses, total)` where `losses` holds one value per batch
        (summed step loss / number of steps) and `total` is the number of
        batches processed.
    """
    if criterion is None:
        criterion = nn.CrossEntropyLoss()

    losses = []
    total = 0

    model.train(mode=True)
    # Iterate the loader that was passed in, not a global one.
    for data, target, masked in trainloader:
        model.zero_grad()

        # Drop only the singleton axis added by the dataset. A bare squeeze()
        # would also collapse a batch of one, which was then skipped.
        data = data.squeeze(1)
        target = target.squeeze(1)

        n_steps = data.size(1)
        if n_steps == 0:
            continue

        hidden = model.init_hidden(data.size(0))
        batch_loss = 0
        for charid in range(n_steps):
            output, hidden = model(data[:, charid].long(), hidden)
            target1 = target[:, charid].long()
            batch_loss += criterion(output.squeeze(1), target1)

        batch_loss.backward()
        optim.step()

        losses.append(batch_loss.item() / n_steps)
        total += 1

    return losses, total

Training procedure


In [74]:
# Train a 1-layer GRU language model, checkpointing whenever the evaluation
# loss improves.
np.random.seed(1)
torch.manual_seed(1)
model = GRULM(dim=100,n_layer=1,h_units=50, vocab_size=256, batch_size=batch_size, gpu = 0, )
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.001,
                       weight_decay=0.0,
                       betas=(0.9, 0.999),
                       eps=1e-8,
                       amsgrad=False)


epoch = 20
# Start from +inf so the first epoch always counts as an improvement and
# `best_model` is guaranteed to exist after the loop, whatever the loss scale.
least_loss = float('inf')
train_loss = []
eval_loss = []
for ep in range(epoch):

    print('Ep {:4d}'.format(ep), end='')

    losses, total = train_gru(model, train_loader, optim)
    epoch_loss = sum(losses) / total
    train_loss.append(epoch_loss)

    # Perplexity is reported as exp(mean per-step loss).
    print(' |Train loss {:4f}'.format(epoch_loss), end='')
    print(' |Train PPL {:4f}'.format(np.exp(epoch_loss)), end='')

    evaluate_loss = model.evaluate(eval_loader)
    eval_loss.append(evaluate_loss)

    print(' |Evaluation loss {:4f}'.format(evaluate_loss), end='')
    print(' |Evaluation PPL {:4f}'.format(np.exp(evaluate_loss)), end='')

    if evaluate_loss < least_loss:
        # New best: persist the weights and keep an in-memory copy for later cells.
        least_loss = evaluate_loss
        torch.save(model.state_dict(), './GRULM.pth')
        best_model = copy.deepcopy(model)
        print('|Saved\n')
    else:
        print("\n")


Ep    0 |Train loss 1.376997 |Train PPL 3.962983 |Evaluation loss 0.999870 |Evaluation PPL 2.717927|Saved

Ep    1 |Train loss 1.067120 |Train PPL 2.906995 |Evaluation loss 0.945112 |Evaluation PPL 2.573101|Saved

Ep    2 |Train loss 1.012444 |Train PPL 2.752318 |Evaluation loss 0.923221 |Evaluation PPL 2.517385|Saved

Ep    3 |Train loss 0.977649 |Train PPL 2.658200 |Evaluation loss 0.902607 |Evaluation PPL 2.466025|Saved

Ep    4 |Train loss 0.951713 |Train PPL 2.590142 |Evaluation loss 0.890607 |Evaluation PPL 2.436608|Saved

Ep    5 |Train loss 0.935696 |Train PPL 2.548986 |Evaluation loss 0.886302 |Evaluation PPL 2.426140|Saved

Ep    6 |Train loss 0.924607 |Train PPL 2.520877 |Evaluation loss 0.882478 |Evaluation PPL 2.416881|Saved

Ep    7 |Train loss 0.915676 |Train PPL 2.498465 |Evaluation loss 0.880973 |Evaluation PPL 2.413248|Saved

Ep    8 |Train loss 0.908777 |Train PPL 2.481285 |Evaluation loss 0.876345 |Evaluation PPL 2.402104|Saved

Ep    9 |Train loss 0.902846 |Train P

NameError: name 'writer' is not defined

In [198]:
# Train a 2-layer GRU language model with the same procedure as the 1-layer
# run, additionally logging the per-epoch losses to TensorBoard.
np.random.seed(1)
torch.manual_seed(1)
model = GRULM(dim=100,n_layer=2,h_units=50, vocab_size=256, batch_size=batch_size, gpu = 0, )
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.001,
                       weight_decay=0.0,
                       betas=(0.9, 0.999),
                       eps=1e-8,
                       amsgrad=False)

# Log next to the notebook instead of an absolute, user-specific directory so
# the cell runs on any machine.
writer = SummaryWriter("./runs/GRULM_2layer")


epoch = 20
# Start from +inf so the first epoch always counts as an improvement.
least_loss = float('inf')
train_loss = []
eval_loss = []
for ep in range(epoch):
    print('Ep {:4d}'.format(ep), end='')

    # Reuse the shared training routine instead of an inline copy of its loop.
    # The inline copy overwrote the notebook-level `batch_size` and `data`.
    losses, total = train_gru(model, train_loader, optim)
    epoch_loss = sum(losses) / total
    train_loss.append(epoch_loss)
    print(' |Train loss {:4f}'.format(epoch_loss), end='')
    print(' |Train PPL {:4f}'.format(np.exp(epoch_loss)), end='')

    evaluate_loss = model.evaluate(eval_loader)
    eval_loss.append(evaluate_loss)
    print(' |Evaluation loss {:4f}'.format(evaluate_loss), end='')
    print(' |Evaluation PPL {:4f}'.format(np.exp(evaluate_loss)), end='')

    # The writer was created but never written to; record both curves.
    writer.add_scalar('Loss/train', epoch_loss, ep)
    writer.add_scalar('Loss/eval', evaluate_loss, ep)

    if evaluate_loss < least_loss:
        # NOTE: same checkpoint path as the 1-layer run, so this overwrites it.
        least_loss = evaluate_loss
        torch.save(model.state_dict(), './GRULM.pth')
        best_model = copy.deepcopy(model)
        print('|Saved\n')
    else:
        print("\n")

writer.flush()

Ep    0 |Train loss 1.394333 |Train PPL 4.032284 |Evaluation loss 0.978091 |Evaluation PPL 2.659373|Saved

Ep    1 |Train loss 1.019600 |Train PPL 2.772086 |Evaluation loss 0.905296 |Evaluation PPL 2.472663|Saved

Ep    2 |Train loss 0.947721 |Train PPL 2.579823 |Evaluation loss 0.883871 |Evaluation PPL 2.420251|Saved

Ep    3 |Train loss 0.912410 |Train PPL 2.490317 |Evaluation loss 0.871749 |Evaluation PPL 2.391090|Saved

Ep    4 |Train loss 0.890298 |Train PPL 2.435854 |Evaluation loss 0.862656 |Evaluation PPL 2.369446|Saved

Ep    5 |Train loss 0.875049 |Train PPL 2.398994 |Evaluation loss 0.856866 |Evaluation PPL 2.355767|Saved

Ep    6 |Train loss 0.863312 |Train PPL 2.371000 |Evaluation loss 0.847551 |Evaluation PPL 2.333924|Saved

Ep    7 |Train loss 0.853533 |Train PPL 2.347928 |Evaluation loss 0.847126 |Evaluation PPL 2.332932|Saved

Ep    8 |Train loss 0.845998 |Train PPL 2.330302 |Evaluation loss 0.842466 |Evaluation PPL 2.322087|Saved

Ep    9 |Train loss 0.839131 |Train P

generating sequences. 

In [86]:
# Generate one line of text from `best_model`, starting from a seed string.
seed = 'shall i'
bt_size = 1
temperature = 1
top_k = 10            # candidates kept at each step of the continuation
max_new_chars = 200   # hard stop so generation ends even if EOS is never drawn

txt = seed
hidden_state = best_model.init_hidden(bt_size)

# Prime the hidden state with BOS (1) followed by the seed characters.
charids = [1] + [ord(char) for char in seed]

best_model.eval()
with torch.no_grad():
    for charid in charids:
        output, hidden_state = best_model(torch.tensor([charid]), hidden_state)

    # First new character: temperature sampling over the whole vocabulary,
    # drawn once from the logits that follow the complete seed. Softmax is the
    # normalised, overflow-safe form of exp(logits / temperature).
    distribution = F.softmax(output.squeeze().div(temperature), dim=0)
    guess = torch.multinomial(distribution, 1).item()

    # Continue with top-k sampling until EOS (2) or the length cap. The EOS
    # code itself is never appended to the text.
    while guess != 2 and len(txt) - len(seed) < max_new_chars:
        txt += chr(guess)
        output, hidden_state = best_model(torch.tensor([guess]), hidden_state)

        probs = F.softmax(output, 2)
        probs, picked_indexes = probs.topk(top_k)
        picked_indexes = picked_indexes.numpy().flatten()
        probs = probs.numpy().flatten()
        probs = probs / probs.sum()  # renormalise over the kept candidates
        guess = int(np.random.choice(picked_indexes, p=probs))

print(txt)

shall in my home immond stiel wiper,


Number of parameters in the network

In [87]:
def count_parameters(model):
    """Print a table of trainable parameter tensors and return their total size.

    Args:
        model: Module exposing `named_parameters()`.

    Returns:
        int: Sum of `numel()` over all parameters with `requires_grad` set.
    """
    trainable = [
        (label, tensor.numel())
        for label, tensor in model.named_parameters()
        if tensor.requires_grad
    ]

    table = PrettyTable(["Modules", "Parameters"])
    for label, size in trainable:
        table.add_row([label, size])
    total_params = sum(size for _, size in trainable)

    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

In [88]:
# Print the per-tensor parameter table for the checkpointed best model; the
# returned total is shown as the cell's result.
count_parameters(best_model)

+------------------+------------+
|     Modules      | Parameters |
+------------------+------------+
|   embed.weight   |   25600    |
| GRU.weight_ih_l0 |   15000    |
| GRU.weight_hh_l0 |    7500    |
|  GRU.bias_ih_l0  |    150     |
|  GRU.bias_hh_l0  |    150     |
|    lin.weight    |   12800    |
|     lin.bias     |    256     |
+------------------+------------+
Total Trainable Params: 61456


61456