In [5]:
import pandas as pd
from pathlib import Path

In [6]:
# Read the whole chat transcript into memory and preview its first 1000 characters.
file = Path.cwd() / 'human_chat.txt'
text = file.read_text(encoding='utf-8')
print(text[:1000])

Human 1: Hi!
Human 2: What is your favorite holiday?
Human 1: one where I get to meet lots of different people.
Human 2: What was the most number of people you have ever met during a holiday?
Human 1: Hard to keep a count. Maybe 25.
Human 2: Which holiday was that?
Human 1: I think it was Australia
Human 2: Do you still talk to the people you met?
Human 1: Not really. The interactions are usually short-lived but it's fascinating to learn where people are coming from and what matters to them
Human 2: Yea, me too. I feel like God often puts strangers in front of you, and gives you an opportunity to connect with them in that moment in deeply meaningful ways. Do you ever feel like you know things about strangers without them telling you?
Human 1: what do you mean?
Human 2: I think it's like a 6th sense, often seen as "cold readings" to people, but can be remarkably accurate. I once sat next to a man in a coffee and I felt a pain in my back. I asked the stranger if he had a pain. It turns o

In [7]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))


 !"%&'()*+,-./012345679:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\_abcdefghijklmnopqrstuvwxyz~é’“”…湘留）：😀😂😆😉😐😛😞🙂


In [8]:
# Lookup tables between characters and integer ids (an id is the character's
# position in the sorted vocabulary `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}  # string to integer
itos = dict(enumerate(chars))  # integer to string


def encode(s):
    """Encode a string as a list of integer ids, one per character.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer ids back into a string.

    Raises KeyError if an id is not in `itos`.
    """
    return ''.join(itos[i] for i in l)


print(encode("hii there!"))
print(decode(encode("hii there!")))

[64, 65, 65, 1, 76, 64, 61, 74, 61, 2]
hii there!


In [9]:
import torch # PyTorch
# Encode the full corpus and store it as a tensor of 64-bit integer ids.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([115782]) torch.int64
tensor([36, 77, 69, 57, 70,  1, 16, 24,  1, 36, 65,  2,  0, 36, 77, 69, 57, 70,
         1, 17, 24,  1, 51, 64, 57, 76,  1, 65, 75,  1, 81, 71, 77, 74,  1, 62,
        57, 78, 71, 74, 65, 76, 61,  1, 64, 71, 68, 65, 60, 57, 81, 28,  0, 36,
        77, 69, 57, 70,  1, 16, 24,  1, 71, 70, 61,  1, 79, 64, 61, 74, 61,  1,
        37,  1, 63, 61, 76,  1, 76, 71,  1, 69, 61, 61, 76,  1, 68, 71, 76, 75,
         1, 71, 62,  1, 60, 65, 62, 62, 61, 74, 61, 70, 76,  1, 72, 61, 71, 72,
        68, 61, 13,  0, 36, 77, 69, 57, 70,  1, 17, 24,  1, 51, 64, 57, 76,  1,
        79, 57, 75,  1, 76, 64, 61,  1, 69, 71, 75, 76,  1, 70, 77, 69, 58, 61,
        74,  1, 71, 62,  1, 72, 61, 71, 72, 68, 61,  1, 81, 71, 77,  1, 64, 57,
        78, 61,  1, 61, 78, 61, 74,  1, 69, 61, 76,  1, 60, 77, 74, 65, 70, 63,
         1, 57,  1, 64, 71, 68, 65, 60, 57, 81, 28,  0, 36, 77, 69, 57, 70,  1,
        16, 24,  1, 36, 57, 74, 60,  1, 76, 71,  1, 67, 61, 61, 72,  1, 57,  1,
       

In [10]:
# The first 90% of the tokens are used for training; the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [11]:
# Context length; preview the first block_size + 1 training tokens.
block_size = 8
train_data[:block_size + 1]

tensor([36, 77, 69, 57, 70,  1, 16, 24,  1])

In [12]:
# Demonstrate next-token prediction: every prefix of x is paired with the token that follows it.
x = train_data[:block_size]
y = train_data[1:block_size + 1]  # the same window shifted one position to the right
for end in range(1, block_size + 1):
    context, target = x[:end], y[end - 1]
    print(f"when input is {context} the target: {target}")

when input is tensor([36]) the target: 77
when input is tensor([36, 77]) the target: 69
when input is tensor([36, 77, 69]) the target: 57
when input is tensor([36, 77, 69, 57]) the target: 70
when input is tensor([36, 77, 69, 57, 70]) the target: 1
when input is tensor([36, 77, 69, 57, 70,  1]) the target: 16
when input is tensor([36, 77, 69, 57, 70,  1, 16]) the target: 24
when input is tensor([36, 77, 69, 57, 70,  1, 16, 24]) the target: 1


In [13]:
# Seed PyTorch's global RNG so the randomly sampled batches (and any model
# initialisation that follows) are identical on every Restart & Run All.
torch.manual_seed(1337)
batch_size = 4 # number of independent sequences being processed
block_size = 8 # maximum context length for predictions

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    `split` selects the source: 'train' reads from train_data, any other value
    reads from val_data. Returns (x, y), each a stack of batch_size windows of
    block_size tokens; y is x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Start offsets are drawn so that the shifted target window still fits in the source.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[start:start + block_size] for start in starts])
    targets = torch.stack([source[start + 1:start + block_size + 1] for start in starts])
    return inputs, targets

xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('-------')

# Spell out every (context, target) example contained in the batch.
for b in range(batch_size):
    row_inputs, row_targets = xb[b], yb[b]
    for end in range(1, block_size + 1):
        context, target = row_inputs[:end], row_targets[end - 1]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[71, 79,  1, 79, 64, 61, 70,  1],
        [77,  1, 72, 68, 57, 70, 70, 65],
        [ 1, 76, 71,  1, 75, 67, 65,  1],
        [68,  1, 60, 77, 74, 65, 70, 63]])
targets:
torch.Size([4, 8])
tensor([[79,  1, 79, 64, 61, 70,  1, 81],
        [ 1, 72, 68, 57, 70, 70, 65, 70],
        [76, 71,  1, 75, 67, 65,  1, 58],
        [ 1, 60, 77, 74, 65, 70, 63,  1]])
-------
when input is [71] the target: 79
when input is [71, 79] the target: 1
when input is [71, 79, 1] the target: 79
when input is [71, 79, 1, 79] the target: 64
when input is [71, 79, 1, 79, 64] the target: 61
when input is [71, 79, 1, 79, 64, 61] the target: 70
when input is [71, 79, 1, 79, 64, 61, 70] the target: 1
when input is [71, 79, 1, 79, 64, 61, 70, 1] the target: 81
when input is [77] the target: 1
when input is [77, 1] the target: 72
when input is [77, 1, 72] the target: 68
when input is [77, 1, 72, 68] the target: 57
when input is [77, 1, 72, 68, 57] the target: 70
when input is [77, 

In [19]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; looking up
    token id i returns the logits over the next token given current token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token reads off the logits for the next token
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). With targets, logits are flattened to (B*T, C) and
            loss is the mean cross-entropy. Without targets, logits keep their
            (B, T, C) shape and loss is None.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # cross_entropy expects (N, C) logits and (N,) targets
        logits = logits.view(B * T, C)
        targets = targets.reshape(B * T)

        # checking quality of predictions
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by sampling max_new_tokens tokens.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        with torch.no_grad():  # sampling needs no gradient bookkeeping
            for _ in range(max_new_tokens):
                logits, _loss = self(idx)
                # a bigram model only needs the logits at the last position
                last_logits = logits[:, -1, :]  # (B,C)
                probs = F.softmax(last_logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
                idx = torch.cat((idx, idx_next), dim=1)  # (B,T+1)
        return idx

# Build the bigram model and run a single forward pass on the sample batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

torch.Size([32, 101])
tensor(5.2804, grad_fn=<NllLossBackward0>)
