In [90]:
import torch

In [91]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [92]:
torch.cuda.is_available()

True

In [93]:
with open('wizard_of_oz.txt','r', encoding='utf-8') as f:
    text = f.read()
len(text)

227076

In [94]:
print(text[:200])

﻿The Project Gutenberg eBook of The Wonderful Wizard of Oz
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restri


In [95]:
chars = list(sorted(set(text)))

In [7]:
print(len(chars))

89


In [96]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s : [string_to_int[ch] for ch in s]
decode = lambda l : "".join(int_to_string[i] for i in l)

In [9]:
encode('dipak')

[58, 63, 70, 55, 65]

In [97]:
print(decode(
[58, 63, 70, 55, 65]))

dipak


In [104]:
data = torch.tensor(encode(text), dtype = torch.long).to(device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [105]:
data[:100]

tensor([88, 46, 62, 59,  1, 42, 72, 69, 64, 59, 57, 74,  1, 33, 75, 74, 59, 68,
        56, 59, 72, 61,  1, 59, 28, 69, 69, 65,  1, 69, 60,  1, 46, 62, 59,  1,
        49, 69, 68, 58, 59, 72, 60, 75, 66,  1, 49, 63, 80, 55, 72, 58,  1, 69,
        60,  1, 41, 80,  0,  1,  1,  1,  1,  0, 46, 62, 63, 73,  1, 59, 56, 69,
        69, 65,  1, 63, 73,  1, 60, 69, 72,  1, 74, 62, 59,  1, 75, 73, 59,  1,
        69, 60,  1, 55, 68, 79, 69, 68, 59,  1])

In [13]:
# test - train split
n =int( 0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

In [14]:
len(train_data), len(val_data)

(181660, 45416)

In [15]:
#create datas set
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is, {context} , target is {target}")

when input is, tensor([88]) , target is 46
when input is, tensor([88, 46]) , target is 62
when input is, tensor([88, 46, 62]) , target is 59
when input is, tensor([88, 46, 62, 59]) , target is 1
when input is, tensor([88, 46, 62, 59,  1]) , target is 42
when input is, tensor([88, 46, 62, 59,  1, 42]) , target is 72
when input is, tensor([88, 46, 62, 59,  1, 42, 72]) , target is 69
when input is, tensor([88, 46, 62, 59,  1, 42, 72, 69]) , target is 64


In [29]:
# hyperparameters and device agnostic code
device = 'cuda' if torch.cuda.is_available() else 'cpu'
block_size = 8
batch_size = 4
n_embd = 384
vocab_size = 80
learning_rate = 3e-4
max_iters = 10000
eval_iters = 250

In [17]:
randint = torch.randint(-100,100,(6,6))
randint

tensor([[-62,  96, -52,  74, -55,  16],
        [ 43,  91,  97,  13, -96, -38],
        [-47,  58,  71, -77,  -1,  -5],
        [ 79,  73, -70, -25,  62,  56],
        [ 92,  77, -36,  43, -81,  81],
        [ 62, -22, -17,  56, -64,  64]])

In [18]:
#define embeddings
from torch import nn
vocab_size = 89
embedding_dim = 6
embedding = nn.Embedding(vocab_size, embedding_dim)

#create input indices
input_indices = torch.LongTensor([1,5,3,2])

embedding_output = embedding(input_indices)

print(embedding_output.shape)
print(embedding_output)

torch.Size([4, 6])
tensor([[-2.6350,  0.5428, -0.6891, -1.0844, -0.5190,  0.9346],
        [-0.4147, -0.4167, -0.3089, -0.0156,  0.4016,  0.4794],
        [ 0.8420, -1.1484, -0.0584,  1.5400, -0.4316, -1.4974],
        [-1.4285,  2.2053,  0.1912,  1.3495,  1.4151, -1.3279]],
       grad_fn=<EmbeddingBackward0>)


In [65]:
#create dataset
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    #x,y = x.to(device), y.to(device)
    return x,y

x,y = get_batch('train')

In [20]:
x,y

(tensor([[55,  1, 56, 63, 74, 10, 85,  1],
         [59, 10,  1, 55, 68, 58,  0, 74],
         [69, 72, 65, 10,  1, 77, 62, 69],
         [ 1, 74, 62, 59,  1, 72, 69, 55]], device='cuda:0'),
 tensor([[ 1, 56, 63, 74, 10, 85,  1, 73],
         [10,  1, 55, 68, 58,  0, 74, 62],
         [72, 65, 10,  1, 77, 62, 69,  1],
         [74, 62, 59,  1, 72, 69, 55, 58]], device='cuda:0'))

In [21]:
from torch.nn import functional as F

In [22]:
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self.forward(index)
            # we only care about the prediction for the last token
            # we ignore the rest
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim = -1)
            index_next = torch.multinomial(probs, num_samples =1)
            index = torch.cat((index, index_next), dim =1)
        return index
    

In [23]:
model = BigramLanguageModel(vocab_size=89).to(device)
context = torch.ones((1,1), dtype = torch.long, device = device)
generated_chars = decode(model.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)

 mw/z;’Vh;t.dF;d mYjlkx(W$X—S;N Cp1VIV:3VgY%O,oWkN#9•oSXh’useip$H“oxh—!C?R*d[&[7B‘r﻿JpXUN 3tv—[zB4!%[w3—wQuo*S;“*wom,*.R]3B’wkcUaiO“0sq4
:Y#2uJzJHO,eXh;LPNhO#thEPw62]D9Q/%jj;“pj]qM/PHIjn0E—X-GFG‘BwSHoM4IoG?b0&pF
nl K$fEZ2Uv4Jp2UIofSZ!J,F-dB5qbX•2﻿qyQWyd/kl2Ie
 m4—mQOrVZ?pMr2/,?d1[Cpy”r]—k07HAB2XliNtHXlMDl”)0:FK.B’lO#E7“q—:FS™g,RZI6]#VoDjnB,f?d&cY,cQc4a-jnvmXHo7LDnvsxIYRk—A3;2Q0Q;c﻿-KJ1G1•&Oh91N%i1G﻿QynF;#PHu2h—y—2A?2Udy1g3‘XVPTVuiTlv—,AQX“ZJg3lpj]fX0HkXWaZ0Q?tEI6*w2EgxI6*/PGspsx1HrX﻿A7—
woxglo(:y


In [24]:
#Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

In [25]:
for _ in range(10000):
    xb,yb = get_batch('train')
    logits, loss = model.forward(xb,yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(loss.item())

3.141587972640991


In [26]:
context = torch.ones((1,1), dtype = torch.long, device = device)
generated_chars = decode(model.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)

 X3#?”﻿‘[I6v﻿(7ZLi1G5?)iWkle obe cUUSl“!Sq0‘pl hklyHrXF-™6Zp
(XwaO)jathy*/•n:ZLYgemuNZa itha acsH—TyUd TVYzJztaZ/zB*w/62QS,Mj’”CQy k‘x—PPPIm8K--in.d womBkagIYB2Qqp﻿“$KR!﻿R&bt—q﻿TO”xwa,Q?ithed:minh™5R
kn oyC7‘/kl8.$5EathQUa•Y%Y4X﻿ucuzUacQ#?q41—!Wj6Im﻿BLifY﻿7XhkabilidAs wu;rJj0q med IfPYdy7‘NLqb—™hJ•$doful,HPDA0pSigl oDO!?!Sbax4pERCAN;“T”Xjal.gFKfz$gS[2ut ader[Ohiel5m2pFSuS‘wSfz[thO‘l%wanFX3F”ri1,grv$8ia•oxB)MW7mt‘4—in d]B,f afF]R™ghacrowll-S#PGmH:iet[#i*!m[2Uwed&x$pv36MFm9gy”P5gX“p?P1OhrYR/1G.99O&


In [68]:
# hyperparameters and device agnostic code
device = 'cuda' if torch.cuda.is_available() else 'cpu'
block_size = 8
batch_size = 4
n_embd = 384
vocab_size = 100
learning_rate = 3e-4
max_iters = 10000
eval_iters = 250
n_layer =4
n_head = 4
dropout = 0.2

In [87]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) #(B,T,hs)
        q = self.query(x) #(B,T, hs)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5 #(B,T,T)
        wei = wei.masked_fill(self.tril[:T,:T]==0, torch.finfo(wei.dtype).min)
        wei = F.softmax(wei, dim = -1)
        wei = self.dropout(wei)
        v = self.value(x) #(B, T, hs)
        out = wei @ v #(B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4* n_embd, n_embd),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd//n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])

        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self.__init_weights)

    def __init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean = 0.0, std =0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
    def forward(self, index, targets = None):
        B,T = index.shape
        tok_emb = self.token_embedding_table(index)
       # pos_emb = self.positional_embedding_table(torch.arange(T, device = device))
       # x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
        
    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self.forward(index)
            # we only care about the prediction for the last token
            # we ignore the rest
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim = -1)
            index_next = torch.multinomial(probs, num_samples =1)
            index = torch.cat((index, index_next), dim =1)
        return index
            
    

In [88]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [89]:
model = GPTLanguageModel(vocab_size).to(device)
#optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [77]:
for _ in range(1):
    xb,yb = get_batch('train')
    logits, loss = model.forward(xb,yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(loss.item())

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [66]:
#create dataset
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    #x,y = x.to(device), y.to(device)
    return x,y

x,y = get_batch('train')

In [59]:
type(torch.float32('-inf'))

TypeError: 'torch.dtype' object is not callable