In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Seed PyTorch's random number generators so that batch sampling
# (torch.randint) and text generation (torch.multinomial) repeat when the
# notebook is restarted and run top to bottom on the same setup.
seed = 1337
torch.manual_seed(seed)

block_size = 8        # tokens per training sequence (slice length in get_batch)
batch_size = 4        # sequences drawn per batch
learning_rate = 3e-4  # step size handed to the AdamW optimizer
max_iters = 1000      # number of optimisation steps in the training loop
eval_iters = 250      # batches averaged per split in estimate_loss; the training
                      # loop also reports the loss every `eval_iters` steps
dropout = 0.2         # not referenced by any code visible in this section

cuda


In [5]:
%%time
# Read the whole book as a single UTF-8 string and drop its first 477
# characters (presumably the file's front matter -- confirm against the file).
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as f:
    text = f.read()[477:]

# The vocabulary is every distinct character that remains, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars, "\n", vocab_size)

# Preview the first 477 characters of the trimmed text.
print(text[:477], '\n')

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 
 74
Folk lore, legends, myths and fairy tales have followed childhood
through the ages, for every healthy youngster has a wholesome and
instinctive love for stories fantastic, marvelous and manifestly
unreal. The winged fairies of Grimm and Andersen have brought more
happiness to childish hearts than all other human creations.

Yet the old-time fairy tale, having served for generations, may
now be classed as "historical" in the children's library; for the
time has come for a s 

CPU times: total: 0 ns
Wall time: 7 ms


In [11]:
# Lookup tables between characters and integer ids; an id is the position of
# the character inside `chars`.
int_to_string = dict(enumerate(chars))
string_to_int = {ch: i for i, ch in int_to_string.items()}


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError for a character that is not a key of `string_to_int`.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string obtained by joining the characters for the ids in `l`.

    Raises KeyError for an id that is not a key of `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)


# Round-trip check: decoding an encoded string gives the string back.
decode(encode('Fine tuning'))

'Fine tuning'

In [13]:
# Encode the full text once and keep it as a 1-D tensor of int64 token ids.
data = torch.tensor(
    encode(text),
    dtype=torch.int64,
)
# Show the first 100 token ids.
data[0:100]

tensor([24, 62, 59, 58,  1, 59, 62, 65, 52,  9,  1, 59, 52, 54, 52, 61, 51, 66,
         9,  1, 60, 72, 67, 55, 66,  1, 48, 61, 51,  1, 53, 48, 56, 65, 72,  1,
        67, 48, 59, 52, 66,  1, 55, 48, 69, 52,  1, 53, 62, 59, 59, 62, 70, 52,
        51,  1, 50, 55, 56, 59, 51, 55, 62, 62, 51,  0, 67, 55, 65, 62, 68, 54,
        55,  1, 67, 55, 52,  1, 48, 54, 52, 66,  9,  1, 53, 62, 65,  1, 52, 69,
        52, 65, 72,  1, 55, 52, 48, 59, 67, 55])

In [15]:
# Split the token stream: the first 80% is used for training, the remaining
# 20% is held out for evaluation.
n = int(len(data) * 0.8)
train_data, test_data = data[:n], data[n:]

def get_batch(split):
    """Draw a random batch of input/target sequences from one data split.

    Args:
        split: 'train' selects `train_data`; any other value (the notebook
            also passes 'val') selects `test_data`.

    Returns:
        Tuple (x, y) of tensors with shape (batch_size, block_size), both moved
        to `device`. Row r of y is row r of x shifted one position to the
        right in the source data, i.e. the next-token targets.
    """
    source = train_data if split == 'train' else test_data
    # Random start offsets, chosen so that start + block_size + 1 stays in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Draw one training batch and print the inputs next to their shifted targets.
x, y = get_batch('train')
print('inputs:\n', x, '\n', 'targets:\n', y)


inputs:
 tensor([[ 1, 33, 73,  9,  1, 48, 61, 51],
        [62, 65, 62, 67, 55, 72,  1, 59],
        [ 1, 66, 52, 61, 67,  1, 67, 55],
        [ 1,  1,  1,  1,  1,  1,  1,  1]], device='cuda:0') 
 targets:
 tensor([[33, 73,  9,  1, 48, 61, 51,  1],
        [65, 62, 67, 55, 72,  1, 59, 52],
        [66, 52, 61, 67,  1, 67, 55, 52],
        [ 1,  1,  1,  1,  1,  1,  1,  1]], device='cuda:0')


In [17]:
# Walk through a single training example: at every position t the model is
# shown x[:t+1] and has to predict y[t], the token that follows it.
#
# `block_size` is taken from the configuration cell. It is deliberately not
# re-assigned here: get_batch reads the global at call time, so resetting it
# in this cell would silently override a changed configuration value for all
# later training and evaluation batches.
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is',context,'taget is',target)

when input is tensor([24]) taget is tensor(62)
when input is tensor([24, 62]) taget is tensor(59)
when input is tensor([24, 62, 59]) taget is tensor(58)
when input is tensor([24, 62, 59, 58]) taget is tensor(1)
when input is tensor([24, 62, 59, 58,  1]) taget is tensor(59)
when input is tensor([24, 62, 59, 58,  1, 59]) taget is tensor(62)
when input is tensor([24, 62, 59, 58,  1, 59, 62]) taget is tensor(65)
when input is tensor([24, 62, 59, 58,  1, 59, 62, 65]) taget is tensor(52)


In [19]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global `model` on the 'train' and 'val' splits.

    For each split, `eval_iters` batches are drawn with `get_batch`, the model
    is run on them and the per-batch losses are averaged. ('val' is served
    from the held-out data because `get_batch` treats every split name other
    than 'train' that way.)

    The function runs with gradient tracking disabled and with the model in
    eval mode. The training/eval mode the model had on entry is restored on
    exit, also when evaluation raises, so the model is never left stuck in
    eval mode and a model that was deliberately in eval mode stays there.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Put the model back into whichever mode the caller had it in.
        model.train(was_training)
    return out

In [21]:
class BiagramLanguageModel(nn.Module):
    """Bigram language model: the current token alone determines the next-token logits.

    The only parameter is a (vocab_size, vocab_size) embedding table; looking
    up a token id returns one score per vocabulary entry for the token that
    follows it.
    """

    def __init__(self,vocab_size):
        super().__init__()
        # Row i holds the unnormalised next-token scores given current token i.
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor with B*T elements holding the
                expected next token for every position of `index`.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the cross-entropy between those logits and the targets.
        """
        logits = self.token_embedding_table(index)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        loss = F.cross_entropy(logits, targets.view(B*T))
        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Extend `index` by sampling `max_new_tokens` tokens one at a time.

        Runs with gradient tracking disabled, so no autograd graph is built
        while sampling.

        Args:
            index: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the input context followed
            by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Get the predictions; call the module (not .forward) so hooks run.
            logits, _loss = self(index)
            # Focus only on the last time step.
            logits = logits[:, -1, :]  # becomes (B, C)
            # Turn the scores into a probability distribution.
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # Sample one next token per row from that distribution.
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled token to the running sequence.
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index

# Build the model and move it to the selected device; `m` and `model` name
# the same module object.
model = BiagramLanguageModel(vocab_size)
m = model.to(device)

# Sample 500 tokens from the untrained model, starting from a single token
# with id 0, and print the decoded text.
context = torch.zeros(1, 1, dtype=torch.long, device=device)
generated_chars = decode(
    m.generate(context, max_new_tokens=500)[0].tolist()
)
print(generated_chars)



 8nSaI0oZb!_DbqVf*-
 N)cuzC(*0Rhv-gk"AmZ!Bi1.,WRoelowC(zYEE: y0lK:&La0a)fT:*FM-U.c!A0r'J?SZL;eB.qYRTztEbxq*LW0(0YTzdlFyX,*pB(reo(Hmq:0eBRorx,c8m_C)C"MKE_,P"I,Wc(RT!,WPX.LmgB
C-l.v.QICAMdeAW*]yHdI0tE8pxv!mxeez9yREfchb]?![mbEleAw-MZ?Lw]JEkNxoOM:*BvCdUs
OmiVN&eeS1njdumoL!P8JjGLIa[nqbxKp0"tJG lKcd?]sl.kh.LMxw;wxYRohr[znpy89V(pbnAnpSWQ
uK
 z!UTFEkdbldHvLpLRo8AViBqyf'OeM8xe*.1BoYnM)CfS&Gw]xK,K&_)G(.,& zJRVdoZZ0:k"sE?!IOikJ,KHcvFpPVae?oO&8iY[Laci)bXK(.v,h_Kuq)C*ApdLje8 GtxssM*]?]c9GJHI:k"XEg&weAp.,PvR"


In [23]:
# Train with AdamW for `max_iters` steps; every `eval_iters` steps print the
# train/val loss averaged by estimate_loss().
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for step in range(max_iters):  # `step`, not `iter`, so the builtin is not shadowed
    if step % eval_iters == 0:
        losses = estimate_loss()
        # Double-quoted f-string so the single-quoted keys inside it also parse
        # on Python versions older than 3.12.
        print(f"step: {step} | train loss: {losses['train']:.4f} | val loss: {losses['val']:.4f}")

    xb, yb = get_batch('train')
    # Call the module itself rather than .forward() so registered hooks run.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# Loss of the last single training batch (one batch, not an average).
print(loss.item())

step: 0 | train loss: 4.7438 | val loss: 4.7636
step: 250 | train loss: 4.6838 | val loss: 4.6951
step: 500 | train loss: 4.6343 | val loss: 4.6464
step: 750 | train loss: 4.5751 | val loss: 4.5599
4.651147842407227


In [25]:
# Sample 500 tokens again, now from the trained model, starting from a single
# token with id 0, and print the decoded text.
context = torch.zeros(1, 1, dtype=torch.long, device=device)
generated_chars = decode(
    m.generate(context, max_new_tokens=500)[0].tolist()
)
print(generated_chars)


1dO&_.r; ikx KanJ"'wcU_,YA(ecSWvh.qK?pGFygV:nl:*-a9U8YsDsaZ*kuoNRYkxY&
_nN["noh)?VAleeBiM [W9AWvFPVQ!,X.q[R8H:(z8GUhpxgzdKaiKb1.z0c"r"r
h)'N0!f.WVF  a9Lan:v!cwnW)PBSwhL;UmpdgHv-nvBifcwOMo,G&)SuhK9[ocwf
D NIn'8YeTXKjC_dL[)b_Y[R
&rBtyl)foZ&xRW[u]RoJnqQnq0"u8ifP"pBanlBl.TV]?Dcvb!zCFfP'issyYU?_aEZ"s
I--EP&M)CK)bjcwz1HiBw,'S]bxh._KxC!HOjOtnnd'z1nlvHp?LiVFSXEEMFiR'
z1dxYXqm0"k1nD
hpxwm&x[ub1&fh_o:n;xqHI,*&Nj'i0Wq_hp'kL.LAEyeWny"t1dGJBlibxgRTAw1j-MgvQBw1aEI'X-fI:M_]o
_'EDzfAF"WKAr[jyRo[)! -'Fm_NPFb!Wfg
