In [23]:
# Load the raw training corpus and take a first look at it.

with open('input.txt', mode='r', encoding='utf-8') as f:
    # read() returns the whole file as a single str object.
    text = f.read()

# Slicing a str yields another str, so this reports the str class.
print(type(text[:100]))

print('Length of dataset in characters:', len(text))
# A str can be indexed and sliced, but it is not a list, so the slice
# prints as plain text rather than as a list of characters.
print(text[:100])


<class 'str'>
Length of dataset in characters: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [25]:
# Collect all the unique characters of our data, in sorted order,
# then show the container type, how many there are, and the characters.
chars = list(set(text))
chars.sort()
print(type(chars), len(chars), chars, sep='\n')

<class 'list'>
65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [27]:
# Build the tokenizer: a mapping from characters to integers and back.
# Building a tokenizer means building an encoder and a decoder:
#   - encoder: convert a string to a list of numbers
#   - decoder: convert a list of numbers back to a string

s_to_i = {ch: idx for idx, ch in enumerate(chars)}
i_to_s = dict(enumerate(chars))


def encoder(s):
    """Return the list of integer ids for the characters of ``s``.

    Raises KeyError if ``s`` contains a character that is not in ``chars``.
    """
    return [s_to_i[ch] for ch in s]


def decoder(l):
    """Return the string spelled by the integer ids in ``l``.

    Raises KeyError if ``l`` contains an id that is not in ``i_to_s``.
    """
    return ''.join(i_to_s[idx] for idx in l)


print(encoder('hii there'))
print(decoder([46, 47, 47, 1, 58, 46, 43, 56, 43]))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [33]:
# Encode the entire text dataset and store it as a PyTorch tensor.
import torch

# Pin the dtype to int64 instead of relying on inference: without an explicit
# dtype, torch.tensor infers it from the data, so an empty encoding would
# produce a floating-point tensor that cannot be used as token indices.
data = torch.tensor(encoder(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [39]:
# Train/validation split:
# the first 90% of the tokens go to training, the last 10% to validation.
n = int(len(data) * 0.9)  # index of the first validation token
train_data, val_data = data[:n], data[n:]

In [43]:
# Batching setup: training draws batches of fixed-length blocks.

batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length used for predictions
torch.manual_seed(1337)  # fix the RNG seed so sampled batches are repeatable

def get_batch(split):
    """Generate a small batch of inputs ``x`` and targets ``y``.

    ``split`` selects the source: ``'train'`` reads from ``train_data``,
    any other value reads from ``val_data``.

    Returns a tuple ``(x, y)`` of stacked slices. Each row of ``x`` is a
    ``block_size``-long slice starting at a random offset, and the matching
    row of ``y`` is the same slice shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Randomly draw batch_size start offsets; the upper bound leaves room for
    # a full block plus the one-step-shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts.tolist():
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show each tensor's shape and contents.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')




inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


tensor([1042503,  342314, 1011765])
