In [31]:
import re

# Load the full corpus as one string; the file is closed when the `with` block exits.
with open("combine_poems.txt", encoding='utf-8') as f:
    text = f.read()

# Strip every run of digits from the corpus before building the vocabulary.
digit_run = re.compile(r'\d+')
text = digit_run.sub('', text)


In [32]:
# Number of characters currently held in `text`.
print("length of text: ", len(text))

length of text:  2151083


In [33]:
# Vocabulary: every distinct character of the corpus, in sorted order.
# The position of a character in `chars` later becomes its integer id.
chars = sorted(set(text))
vocab_size = len(chars)
print("number of unique characters: ", vocab_size)
print("characters: ", ''.join(chars))

number of unique characters:  94
characters:  
 !"&'()*,-./:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^`abcdefghijklmnopqrstuvwxyz{}~£³´ÆÔäæèéëïöŒ–—‘’‹


In [34]:
# Lookup tables between characters and integer ids; ids follow the order of `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(x):
    """Return the list of integer ids for the characters of string `x`.

    Raises KeyError if `x` contains a character that is not in `stoi`.
    """
    return [stoi[ch] for ch in x]


def decode(x):
    """Return the string spelled by the iterable of integer ids `x`.

    Raises KeyError if an id is not in `itos`.
    """
    return ''.join([itos[i] for i in x])


# Round-trip sanity check: decoding an encoding must give the input back.
print("encoded: ", encode("hello"))
print("decoded: ", decode(encode("hello")))

encoded:  [54, 51, 58, 58, 61]
decoded:  hello


In [35]:
import torch
# Encode the whole corpus once and keep the ids as an integer (torch.long) tensor.
data=torch.tensor(encode(text),dtype=torch.long)
print("shape of the data: ", data.shape)
# NOTE: despite the label, this prints the first 10 token ids, not the characters themselves.
print("first 10 characters: ", data[:10])

shape of the data:  torch.Size([2151083])
first 10 characters:  tensor([ 0, 32, 61, 51, 59,  1, 13,  1, 17,  1])


In [36]:
# Sequential split: the first 90% of the token stream is used for training,
# the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [37]:
# The model is not fed the entire data at once; it sees fixed-length chunks.
# block_size is the chunk length. One extra token is sliced here because
# block_size inputs need block_size next-token targets (the window shifted by one).
block_size=8
train_data[:block_size+1] 

tensor([ 0, 32, 61, 51, 59,  1, 13,  1, 17])

In [38]:
# Inputs are the first block_size tokens; targets are the same window
# shifted one position to the right, so y[t] is the token that follows x[t].
x = train_data[:block_size]
y = train_data[1:block_size + 1]
print(x, y, sep="\n")

tensor([ 0, 32, 61, 51, 59,  1, 13,  1])
tensor([32, 61, 51, 59,  1, 13,  1, 17])


In [39]:
# One chunk yields block_size training examples: every prefix of x is a
# context, and its label is the token that comes right after that prefix.
for t in range(block_size):
    context, target = x[:t + 1], y[t]  # y is already shifted one ahead of x
    print(context, "->", target)

tensor([0]) -> tensor(32)
tensor([ 0, 32]) -> tensor(61)
tensor([ 0, 32, 61]) -> tensor(51)
tensor([ 0, 32, 61, 51]) -> tensor(59)
tensor([ 0, 32, 61, 51, 59]) -> tensor(1)
tensor([ 0, 32, 61, 51, 59,  1]) -> tensor(13)
tensor([ 0, 32, 61, 51, 59,  1, 13]) -> tensor(1)
tensor([ 0, 32, 61, 51, 59,  1, 13,  1]) -> tensor(17)


In [55]:
# Fix the RNG seed so the randomly drawn batch offsets are repeatable.
torch.manual_seed(1337)
batch_size=4 # how many independent sequences are processed in parallel
block_size=8 # sequence length of each example, i.e. the maximum context length for a prediction

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of tensors, each of shape (batch_size, block_size).
        Row k of y is row k of x shifted one token to the right.

    Reads the module-level `train_data`, `val_data`, `batch_size` and
    `block_size`, and prints the sampled start offsets.
    """
    source = train_data if split == 'train' else val_data
    # The last valid start offset must leave room for block_size inputs
    # plus one extra token for the final target.
    ix = torch.randint(0, len(source) - block_size, (batch_size,))
    print("printing ix: ", ix)

    inputs, targets = [], []
    for start in ix:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the shape and contents of both halves.
x, y = get_batch('train')
print("input:", x.shape, x, sep="\n")
print("target:", y.shape, y, sep="\n")


printing ix:  tensor([ 591169, 1051219,   82460, 1217014])
input:
torch.Size([4, 8])
tensor([[59, 14,  1, 71, 61, 60,  1,  5],
        [55, 60, 66, 61,  1, 52, 58, 47],
        [ 1, 69, 55, 66, 54,  1, 54, 55],
        [54, 51, 71,  1, 65, 51, 51,  1]])
target:
torch.Size([4, 8])
tensor([[14,  1, 71, 61, 60,  1,  5, 65],
        [60, 66, 61,  1, 52, 58, 47, 57],
        [69, 55, 66, 54,  1, 54, 55, 65],
        [51, 71,  1, 65, 51, 51,  1, 71]])


In [56]:
# Spot-check: display the encoded token stored at one position of the training split.
# NOTE(review): 808595 is a hard-coded position with no stated origin — TODO confirm why this index.
train_data[808595]

tensor(54)

In [59]:
## implement the bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the RNG so that random weight initialisation done after this point is repeatable.
torch.manual_seed(1337)

class Bigram(nn.Module):
    """Bigram language model: a token id directly looks up next-token logits.

    The only parameter is a (vocab_size x vocab_size) embedding table, so
    row i holds the unnormalised scores for whichever token follows token i.

    Args:
        vocab_size: number of distinct token ids.
        verbose: when True (the default, which keeps the original behaviour)
            the forward pass prints intermediate tensor shapes. Pass False
            to silence it, e.g. inside a training loop.
    """

    def __init__(self,vocab_size,verbose=True):
        super().__init__()
        self.verbose=verbose
        self.token_embedding_table=nn.Embedding(vocab_size,vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). With targets, logits is flattened to (B*T, C)
            and loss is the mean cross-entropy. Without targets, logits
            keeps its (B, T, C) shape and loss is None.
        """
        logits=self.token_embedding_table(idx) # (B, T, C)
        if self.verbose:
            print("shape of logits: ", logits.shape)

        # Inference path: no labels, so there is nothing to score against.
        if targets is None:
            return logits,None

        # F.cross_entropy wants the class dimension second: (N, C) logits
        # with (N,) targets. Fold batch and time into a single N = B*T axis.
        B,T,C=logits.shape

        # reshape (not view) so non-contiguous tensors are also accepted.
        logits=logits.reshape(B*T,C)
        if self.verbose:
            print("shape of logits after view: ", logits.shape)
            print("shape of targets: ", targets.shape)
        targets=targets.reshape(B*T)
        if self.verbose:
            print("shape of targets: ", targets.shape)
        loss=F.cross_entropy(logits,targets)
        return logits,loss
    




In [60]:
# Push the batch sampled earlier through an untrained model as a smoke test.
m = Bigram(vocab_size)
print("shape of input", x.shape)
logits, loss = m(x, y)
# Report the logits shape and the loss on the untrained weights.
print(logits.shape, loss, sep="\n")

shape of input torch.Size([4, 8])
shape of logits:  torch.Size([4, 8, 94])
shape of logits after view:  torch.Size([32, 94])
shape of targets:  torch.Size([4, 8])
shape of targets:  torch.Size([32])
torch.Size([32, 94])
tensor(4.7964, grad_fn=<NllLossBackward0>)


In [12]:
# GPT-2 tokenization via tiktoken.
# NOTE(review): this rebinds `encode`, which elsewhere in this notebook names the
# character-level encoder callable. Once this cell has run, calls such as
# encode("hello") would hit the tiktoken object instead. Consider a distinct
# name for the tokenizer; TODO confirm nothing later relies on this binding.
import tiktoken
encode=tiktoken.get_encoding('gpt2')
# Bare expression: the notebook displays the tokenizer's vocabulary size.
encode.n_vocab

50257