In [28]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [23]:
with open("wizard_of_oz.txt", encoding="utf-8", mode="r") as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
print(len(chars))   

['\n', ' ', '!', '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '\ufeff']
69


In [24]:
#Building a char-level tokenizer
str_to_int = { ch : i for i, ch in enumerate(chars)}
int_to_str = { i : ch for i, ch in enumerate(chars)}

encode = lambda str: [str_to_int[c] for c in str]
decode = lambda int_list: "".join([int_to_str[i] for i in int_list])

In [25]:
data = torch.tensor(encode(text), dtype=torch.int16)
data[:100]

tensor([68, 13, 44, 37, 52, 56, 41, 54,  1, 19,  0, 30, 44, 41,  1, 13, 61, 39,
        48, 51, 50, 41,  0,  0,  0, 14, 51, 54, 51, 56, 44, 61,  1, 48, 45, 58,
        41, 40,  1, 45, 50,  1, 56, 44, 41,  1, 49, 45, 40, 55, 56,  1, 51, 42,
         1, 56, 44, 41,  1, 43, 54, 41, 37, 56,  1, 21, 37, 50, 55, 37, 55,  1,
        52, 54, 37, 45, 54, 45, 41, 55,  5,  1, 59, 45, 56, 44,  1, 31, 50, 39,
        48, 41,  0, 18, 41, 50, 54, 61,  5,  1], dtype=torch.int16)

In [35]:
#Train/Validation split

n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split, block_size, batch_size):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) #"batch size" indexes of characters in the corpus
    #print(f"ix: {ix}")
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train', 6, 4)
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

[tensor([41, 55,  1, 37, 50, 40], dtype=torch.int16), tensor([61,  5,  0, 38, 41, 43], dtype=torch.int16), tensor([40,  1, 59, 45, 55, 44], dtype=torch.int16), tensor([56, 44, 37, 56,  1, 45], dtype=torch.int16)]
inputs:
tensor([[41, 55,  1, 37, 50, 40],
        [61,  5,  0, 38, 41, 43],
        [40,  1, 59, 45, 55, 44],
        [56, 44, 37, 56,  1, 45]], dtype=torch.int16)
targets:
tensor([[55,  1, 37, 50, 40,  1],
        [ 5,  0, 38, 41, 43, 45],
        [ 1, 59, 45, 55, 44, 41],
        [44, 37, 56,  1, 45, 56]], dtype=torch.int16)
