In [11]:
from nltk.tokenize import word_tokenize
import torch

In [12]:
# nltk.download()  # uncomment this line (and add `import nltk`) if the NLTK data needed by word_tokenize has not been downloaded yet

In [6]:
# Short sample passage for trying out the tokenizer.
text = (
    "I love this flavor! It's by far the best choice and my go-to whenever "
    "I go to the grocery store. I wish they would restock it more often though."
)

In [7]:
# Split the sample text into word-level tokens and show the resulting list.
word_tokens = word_tokenize(text)
print(word_tokens)

['I', 'love', 'this', 'flavor', '!', 'It', "'s", 'by', 'far', 'the', 'best', 'choice', 'and', 'my', 'go-to', 'whenever', 'I', 'go', 'to', 'the', 'grocery', 'store', '.', 'I', 'wish', 'they', 'would', 'restock', 'it', 'more', 'often', 'though', '.']


In [8]:
# Read the whole corpus file into memory; this rebinds `text` to the file contents.
with open('../data/input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [19]:
# Vocabulary: every distinct token produced by the tokenizer, in sorted order.
unique_tokens = set(word_tokenize(text))
words = sorted(unique_tokens)
vocab_size = len(words)
vocab_size

14310

In [29]:
# Create mappings between word-level tokens and integer ids.
stoi = {w: i for i, w in enumerate(words)}  # token -> id
itos = dict(enumerate(words))  # id -> token


def encode(s):
    """Encoder: take an iterable of tokens, output a list of integer ids.

    Each element of ``s`` is looked up in ``stoi``. Pass already-tokenized
    input (e.g. the result of ``word_tokenize``); a raw string would be
    iterated character by character.

    Raises:
        KeyError: if a token is not present in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take an iterable of integer ids, output a single string.

    The tokens looked up in ``itos`` are joined with single spaces, so the
    original spacing around punctuation is not restored.

    Raises:
        KeyError: if an id is not present in ``itos``.
    """
    return ' '.join([itos[i] for i in l])

In [30]:
# Round-trip check: tokenize a sentence, map it to ids, then map the ids back.
test_string = 'You are all resolved rather to die than to famish?'
test_ids = encode(word_tokenize(test_string))
print(test_ids)
print(decode(encode(word_tokenize(test_string))))

[3053, 3512, 3324, 11053, 10791, 13010, 5723, 12819, 13010, 6533, 225]
You are all resolved rather to die than to famish ?


In [31]:
# encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(word_tokenize(text)), dtype=torch.long)
print(data.shape, data.type)
print(data[:1000])

torch.Size([254509]) <built-in method type of Tensor object at 0x00000216A87145E0>
tensor([ 1152,   709,   223,   482, 13877, 10480,  3440,  7080,   219,  7604,
         8993, 12087,   221,   323,   223,  2520,   219, 12087,   221,  1152,
          709,   223,  3053,  3512,  3324, 11053, 10791, 13010,  5723, 12819,
        13010,  6533,   225,   323,   223,  2256,   221, 11053,   221,  1152,
          709,   223,  1152,   219, 14291,  8402,   640,  1769,  8232,  4679,
         6251, 13010, 12831, 10036,   221,   323,   223,  2919,  8404,   219,
        13877,  8404,   221,  1152,   709,   223,  1679, 13581,  8340,  7738,
          219,  3412, 13877,   162,  7567,  5147,  3596,  9761,  9833, 10439,
          221,  1547,  3061, 13661,   225,   323,   223,  1924,  9271, 12703,
         9583,  9377,   224,  8580,  8243,  3786,  5952,   223,  3659,   219,
         3659,     0,  2370,   709,   223,  1972, 14183,   219,  7277,  4738,
          221,  1152,   709,   223,  2919,  3512,  3145, 10

In [32]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [33]:
# Show the first block_size + 1 training tokens.
block_size = 8
train_data[:block_size + 1]

tensor([ 1152,   709,   223,   482, 13877, 10480,  3440,  7080,   219])

In [34]:
# `y` is `x` shifted by one token, so y[k] is the token that follows x[:k+1].
x = train_data[:block_size]
y = train_data[1:block_size+1]
for end in range(1, block_size + 1):
    context = x[:end]      # the first `end` tokens of the window
    target = y[end - 1]    # the token that comes right after that context
    print(f'input: {context}, target: {target}')

input: tensor([1152]), target: 709
input: tensor([1152,  709]), target: 223
input: tensor([1152,  709,  223]), target: 482
input: tensor([1152,  709,  223,  482]), target: 13877
input: tensor([ 1152,   709,   223,   482, 13877]), target: 10480
input: tensor([ 1152,   709,   223,   482, 13877, 10480]), target: 3440
input: tensor([ 1152,   709,   223,   482, 13877, 10480,  3440]), target: 7080
input: tensor([ 1152,   709,   223,   482, 13877, 10480,  3440,  7080]), target: 219


In [35]:
torch.manual_seed(1337)  # seed torch's default random number generator
# Number of independent sequences processed in parallel.
batch_size = 4
# Maximum context length used for predictions.
block_size = 8

In [36]:
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of stacked tensors, each holding ``batch_size``
        windows of ``block_size`` tokens. ``y`` is ``x`` shifted one position
        to the right in the underlying data.
    """
    source = train_data if split == 'train' else val_data
    # Random window start offsets; the upper bound leaves room for the
    # one-token shift used by the targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

In [37]:
# Draw one training batch and print the shape and contents of both tensors.
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

inputs:
torch.Size([4, 8])
tensor([[ 1478,  8532,  9659, 14291,   219,   392,  9701, 12828],
        [ 1478,   162,  9460,  9975,  3276,   219,  9460, 12228],
        [  223,   391, 12891,  3061,  8889,   225, 12968,  6928],
        [ 8348,  8017,  7560,   223,  2943,  4872,  7857,  4468]])
targets:
torch.Size([4, 8])
tensor([[ 8532,  9659, 14291,   219,   392,  9701, 12828,  3512],
        [  162,  9460,  9975,  3276,   219,  9460, 12228, 12306],
        [  391, 12891,  3061,  8889,   225, 12968,  6928,  5304],
        [ 8017,  7560,   223,  2943,  4872,  7857,  4468, 12891]])


In [38]:
# Spell out every (context, target) pair contained in the batch.
for b in range(batch_size):  # batch dimension
    print(f'batch {b+1}/{batch_size}')
    row_inputs, row_targets = xb[b], yb[b]
    for t in range(block_size):  # time dimension
        context = row_inputs[:t+1]
        target = row_targets[t]
        print(f"when input is {context.tolist()} the target: {target}")
    print()

batch 1/4
when input is [1478] the target: 8532
when input is [1478, 8532] the target: 9659
when input is [1478, 8532, 9659] the target: 14291
when input is [1478, 8532, 9659, 14291] the target: 219
when input is [1478, 8532, 9659, 14291, 219] the target: 392
when input is [1478, 8532, 9659, 14291, 219, 392] the target: 9701
when input is [1478, 8532, 9659, 14291, 219, 392, 9701] the target: 12828
when input is [1478, 8532, 9659, 14291, 219, 392, 9701, 12828] the target: 3512

batch 2/4
when input is [1478] the target: 162
when input is [1478, 162] the target: 9460
when input is [1478, 162, 9460] the target: 9975
when input is [1478, 162, 9460, 9975] the target: 3276
when input is [1478, 162, 9460, 9975, 3276] the target: 219
when input is [1478, 162, 9460, 9975, 3276, 219] the target: 9460
when input is [1478, 162, 9460, 9975, 3276, 219, 9460] the target: 12228
when input is [1478, 162, 9460, 9975, 3276, 219, 9460, 12228] the target: 12306

batch 3/4
when input is [223] the target: 39