# Helper Code for Assignment 3 (RNN language models)

## Reading the raw text file & creating the data loader

In [1]:
import os
import torch

In [2]:
class Vocabulary:
    """Two-way mapping between token strings and integer IDs.

    ID 0 is reserved for the padding token and ID 1 for the unknown token.
    Every further token receives the next free ID in the order it is added.

    Attributes:
      id_to_string: dict mapping ID -> token string.
      string_to_id: dict mapping token string -> ID.
      pad_id: ID of the padding token (0).
      unk_id: ID of the unknown token (1).
    """

    def __init__(self, pad_token="<pad>", unk_token='<unk>'):
        self.id_to_string = {}
        self.string_to_id = {}

        # add the default pad token
        self.id_to_string[0] = pad_token
        self.string_to_id[pad_token] = 0

        # add the default unknown token
        self.id_to_string[1] = unk_token
        self.string_to_id[unk_token] = 1

        # shortcut access
        self.pad_id = 0
        self.unk_id = 1

    def __len__(self):
        """Return the number of IDs in the vocabulary (special tokens included)."""
        return len(self.id_to_string)

    def add_new_word(self, string):
        """Register `string` under the next free ID and return that ID.

        The call is idempotent: a string that is already known keeps its
        existing ID, so both dictionaries always stay inverse to each other.
        """
        if string in self.string_to_id:
            return self.string_to_id[string]

        new_id = len(self.id_to_string)
        self.string_to_id[string] = new_id
        self.id_to_string[new_id] = string
        return new_id

    def get_idx(self, string, extend_vocab=False):
        """Return the ID of `string`.

        Args:
          string: the token to look up.
          extend_vocab: bool, if True an unknown token is added to the
            vocabulary; otherwise the ID of the unknown token is returned.

        Returns:
          int, the token ID.
        """
        if string in self.string_to_id:
            return self.string_to_id[string]
        if extend_vocab:  # add the new word
            return self.add_new_word(string)
        return self.unk_id


# Read the raw txt file and generate a 1D PyTorch tensor
# containing the whole text mapped to sequence of token IDs, and a vocab object.
class LongTextData:
    """Character-level view of a raw text file as one long tensor of token IDs.

    Attributes:
      data: 1D int64 tensor holding the ID of every character of the file.
      vocab: the vocabulary object used to map characters to IDs.
    """

    def __init__(self, file_path, vocab=None, extend_vocab=True, device='cuda',
                 encoding='utf-8'):
        self.data, self.vocab = self.text_to_data(
            file_path, vocab, extend_vocab, device, encoding)

    def __len__(self):
        """Return the number of tokens (characters) in the text."""
        return len(self.data)

    def text_to_data(self, text_file, vocab, extend_vocab, device,
                     encoding='utf-8'):
        """Read a raw text file and create its tensor and the vocab.

        Args:
          text_file: a path to a raw text file.
          vocab: a Vocab object, or None to start from a fresh Vocabulary.
          extend_vocab: bool, if True extend the vocab
          device: device
          encoding: text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform default.
            With 'utf-8' a leading byte-order mark is kept as an ordinary
            character; pass 'utf-8-sig' to drop it.

        Returns:
          Tensor representing the input text, vocab file

        Raises:
          FileNotFoundError: if `text_file` does not exist.

        """
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if not os.path.exists(text_file):
            raise FileNotFoundError(f"Text file not found: {text_file}")
        if vocab is None:
            vocab = Vocabulary()

        # Construct data: every single character of the file is one token.
        token_ids = []
        print(f"Reading text file from: {text_file}")
        with open(text_file, 'r', encoding=encoding) as text:
            for line in text:
                # get_idx extends the vocab (when allowed) if the character
                # has not been seen before.
                token_ids.extend(
                    vocab.get_idx(char, extend_vocab=extend_vocab)
                    for char in line)

        # convert to tensor
        data = torch.tensor(token_ids, device=device, dtype=torch.int64)
        print("Done.")

        return data, vocab
    

# Since there is no need for shuffling the data, we just have to split
# the text data according to the batch size and bptt length.
# The input to be fed to the model will be batch[:-1]
# The target to be used for the loss will be batch[1:]
class ChunkedTextData:
    """List-like container of consecutive chunks of a long token sequence.

    Each chunk is a tensor with one column per batch entry. A chunk starts
    with the last row of the previous chunk, so `chunk[:-1]` can be used as
    the model input and `chunk[1:]` as the prediction target.
    """

    def __init__(self, data, bsz, bptt_len, pad_id):
        self.batches = self.create_batch(data, bsz, bptt_len, pad_id)

    def __len__(self):
        """Return the number of chunks."""
        return len(self.batches)

    def __getitem__(self, idx):
        """Return the chunk at position `idx`."""
        return self.batches[idx]

    def create_batch(self, input_data, bsz, bptt_len, pad_id):
        """Create batches from a LongTextData object.

        Args:
          input_data: a LongTextData object (anything that supports `len()`
            and exposes the 1D token tensor as `.data`).
          bsz: int, batch size
          bptt_len: int, bptt length
          pad_id: int, ID of the padding token

        Returns:
          List of tensors representing batches. Full batches have shape
          (bptt_len + 1, bsz); the last one may be shorter but always has at
          least two rows, so neither its input nor its target is empty.

        Raises:
          ValueError: if `bsz` or `bptt_len` is not positive.

        """
        if bsz <= 0 or bptt_len <= 0:
            raise ValueError(
                f"bsz and bptt_len must be positive, got {bsz} and {bptt_len}")

        batches = []  # each element in `batches` is (len, B) tensor
        text_len = len(input_data)
        segment_len = text_len // bsz + 1

        # Question: Explain the next two lines!
        padded = input_data.data.new_full((segment_len * bsz,), pad_id)
        padded[:text_len] = input_data.data
        padded = padded.view(bsz, segment_len).t()  # (segment_len, bsz)

        # Ceiling division. `segment_len // bptt_len + 1` would append a
        # chunk holding only the overlap row whenever segment_len is an exact
        # multiple of bptt_len, i.e. a chunk with empty input and target.
        num_batches = (segment_len + bptt_len - 1) // bptt_len

        for i in range(num_batches):
            # Prepare batches such that the last symbol of the current batch
            # is the first symbol of the next batch.
            if i == 0:
                # Append a dummy start symbol using pad token
                batch = torch.cat(
                    [padded.new_full((1, bsz), pad_id),
                     padded[i * bptt_len:(i + 1) * bptt_len]], dim=0)
                batches.append(batch)
            else:
                batches.append(padded[i * bptt_len - 1:(i + 1) * bptt_len])

        return batches

In [3]:
# downlaod the text
# Make sure to go to the link and check how the text looks like.

!wget http://www.gutenberg.org/files/49010/49010-0.txt

--2021-11-16 16:32:52--  http://www.gutenberg.org/files/49010/49010-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/files/49010/49010-0.txt [following]
--2021-11-16 16:32:52--  https://www.gutenberg.org/files/49010/49010-0.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 185303 (181K) [text/plain]
Saving to: ‘49010-0.txt’


2021-11-16 16:33:00 (1.76 MB/s) - ‘49010-0.txt’ saved [185303/185303]



In [4]:
# wget saves the file into the current working directory
# (which is /content on Colab). Adapt the path if needed.

text_path = os.path.join(os.getcwd(), "49010-0.txt")

In [5]:
# Use the GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

batch_size = 32
bptt_len = 64

my_data = LongTextData(text_path, device=DEVICE)
# Take the padding ID from the vocabulary instead of repeating the literal 0.
batches = ChunkedTextData(my_data, batch_size, bptt_len,
                          pad_id=my_data.vocab.pad_id)

Reading text file from: /content/49010-0.txt
Done.


In [6]:
# Number of chunks that ChunkedTextData produced from the text.
len(batches)

87

In [10]:
# Shape of the first chunk; the recorded output is (bptt_len + 1, batch_size).
batches[0].shape

torch.Size([65, 32])

In [19]:
# Input to the network: every row of the first chunk except the last one.
first_chunk_inputs = batches[0][:-1]
print(first_chunk_inputs.shape)
first_chunk_inputs[:, 0]

torch.Size([64, 32])


tensor([ 0,  2,  3,  4,  5,  6,  7,  8,  9, 10,  5, 11, 12,  6, 13, 14, 12,  5,
        15, 16,  5,  8, 17,  6, 18, 19,  9,  9, 20,  6,  9, 21,  6, 22,  5, 23,
         9, 24, 25, 23,  6, 26, 27, 16, 28,  5, 23, 29,  6, 16, 30,  6, 31, 32,
         6, 33, 32,  6, 34, 12, 35, 11, 20, 15], device='cuda:0')

In [20]:
# Target tokens to be predicted: every row of the first chunk except the first one.
first_chunk_targets = batches[0][1:]
print(first_chunk_targets.shape)
first_chunk_targets[:, 0]

torch.Size([64, 32])


tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10,  5, 11, 12,  6, 13, 14, 12,  5, 15,
        16,  5,  8, 17,  6, 18, 19,  9,  9, 20,  6,  9, 21,  6, 22,  5, 23,  9,
        24, 25, 23,  6, 26, 27, 16, 28,  5, 23, 29,  6, 16, 30,  6, 31, 32,  6,
        33, 32,  6, 34, 12, 35, 11, 20, 15,  5], device='cuda:0')

In [25]:
# The last token of the current batch should be the first token of the next one.
def show_column(batch, column, vocab):
    """Print the characters of one batch column, one character per line."""
    for token_id in batch[:, column]:
        print(vocab.id_to_string[token_id.item()])


show_column(batches[20], 11, my_data.vocab)
print('================')
show_column(batches[21], 11, my_data.vocab)

# Check the claim for every column instead of only eyeballing column 11.
assert torch.equal(batches[20][-1], batches[21][0])

 
d
i
n
n
e
r
 
f
o
r
 
w
h
i
c
h
 
n
o
t
h
i
n
g
 
w
a
s
 
p
r
o
v
i
d
e
d
 
b
u
t
 
a


s
o
u
p
,
 
w
h
i
c
h
 
w
a
s
 
s
e
r
v
v
e
d
 
o
n
 
a
 
w
i
d
e
,
 
s
h
a
l
l
o
w
 
d
i
s
h
.




T
h
e
 
F
o
x
 
p
r
e
s
i
d
e
d
 
a
t
 
t
h
e
 
f
e
a
s
t
 
w
i
t
h
 


In [8]:
# Show both directions of the vocabulary mapping: ID -> string, then string -> ID.
for mapping in (my_data.vocab.id_to_string, my_data.vocab.string_to_id):
    print(mapping)

{0: '<pad>', 1: '<unk>', 2: '\ufeff', 3: 'T', 4: 'h', 5: 'e', 6: ' ', 7: 'P', 8: 'r', 9: 'o', 10: 'j', 11: 'c', 12: 't', 13: 'G', 14: 'u', 15: 'n', 16: 'b', 17: 'g', 18: 'E', 19: 'B', 20: 'k', 21: 'f', 22: 'A', 23: 's', 24: 'p', 25: "'", 26: 'F', 27: 'a', 28: 'l', 29: ',', 30: 'y', 31: 'J', 32: '.', 33: 'H', 34: 'S', 35: 'i', 36: '\n', 37: 'w', 38: 'U', 39: 'd', 40: 'm', 41: 'v', 42: 'Y', 43: '-', 44: 'L', 45: 'I', 46: ':', 47: 'V', 48: 'R', 49: 'C', 50: 'D', 51: 'M', 52: '2', 53: '1', 54: '0', 55: '5', 56: '[', 57: '#', 58: '4', 59: '9', 60: ']', 61: '8', 62: '*', 63: 'O', 64: 'N', 65: 'K', 66: '/', 67: 'W', 68: 'X', 69: '(', 70: '3', 71: ')', 72: 'Æ', 73: '’', 74: '_', 75: '—', 76: 'æ', 77: '·', 78: 'z', 79: 'x', 80: 'q', 81: ';', 82: '6', 83: '7', 84: 'Q', 85: 'œ', 86: 'μ', 87: 'α', 88: 'λ', 89: 'ο', 90: 'ν', 91: 'ὁ', 92: 'Φ', 93: 'ρ', 94: 'ὑ', 95: 'ξ', 96: '“', 97: '”', 98: '?', 99: '!', 100: '‘', 101: 'é', 102: 'è', 103: '"', 104: '%', 105: '@', 106: '$'}
{'<pad>': 0, '<unk>': 1, 

# Taking the argmax vs. Sampling from a distribution

In [9]:
# Let's consider a "dice" with five faces and the following probabilities:
#   face 0 -> 0.2
#   face 1 -> 0.1
#   face 2 -> 0.4
#   face 3 -> 0.2
#   face 4 -> 0.1
face_probabilities = [0.2, 0.1, 0.4, 0.2, 0.1]

dice = torch.tensor(face_probabilities, device=DEVICE)

In [None]:
# Sampling = roll the dice: each call draws one face at random, using the
# entries of `dice` as the probabilities of the faces.
torch.manual_seed(0)  # fixed seed so re-running the cell reproduces the rolls
num_rolls = 5

for _ in range(num_rolls):
    print(torch.multinomial(dice, num_samples=1))

tensor([2], device='cuda:0')
tensor([4], device='cuda:0')
tensor([2], device='cuda:0')
tensor([2], device='cuda:0')
tensor([2], device='cuda:0')


In [None]:
# Argmax = always pick the face with the highest probability.
# Repeated on purpose: unlike sampling, every "roll" gives the same face.

num_rolls = 5
for _ in range(num_rolls):
    top = torch.topk(dice, k=1, dim=-1)
    print(top.indices)

tensor([2], device='cuda:0')
tensor([2], device='cuda:0')
tensor([2], device='cuda:0')
tensor([2], device='cuda:0')
tensor([2], device='cuda:0')
