# Starter Code

In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from  sklearn import linear_model
%matplotlib inline

In [2]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [3]:
# READ DATA
with open ('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    

In [4]:
# EXPLORE DATA
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# ENCODE DATA
# Most basic one-hot encoding.
# What is a tokenizer? This is a tokenizer:
stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for s,i in stoi.items()}
def encode(s: str) -> torch.tensor:
    return torch.tensor([stoi[c] for c in s])
def decode(s: torch.tensor) -> str:
    return ''.join([itos[int(c)] for c in list(s)])

In [7]:
print(encode('hello, there'))
print(decode(encode('hello, there')))

tensor([46, 43, 50, 50, 53,  6,  1, 58, 46, 43, 56, 43])
hello, there


In [8]:
# Google uses : SentencePiece
# tiktoken used for gpt2 (this is what we build next time?)
# fast BPE tokenizer

In [9]:
# SPLIT DATA
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)
print(data[:10])
n = int(0.8*len(text))
n2 = int(0.9*len(text))
train_data = data[:n]
val_data = data[n:n2]
test_data = data[n2:]



torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


  data = torch.tensor(encode(text), dtype=torch.long)


In [10]:
# HELPER FUNCTIONS FOR TRAINING

# x = train_data[:block_size]
# y = train_data[1:block_size+1]

# BATCHING
# This is pretty similar to how batches were processed in makemore
def get_batch(split, block_size, batch_size, device='cuda'):
    data = train_data if split == 'train' else val_data if  split == 'val' else test_data 
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # New, adding to put data on the GPU:
    x, y  = x.to(device), y.to(device)
    return x, y

# xb, yb = get_batch('train', block_size, batch_size)
# print(xb.shape, yb.shape)
# print(decode(xb[0]), decode(yb[0]))

# ESTIMATE LOSS
# Better estimate than just using the loss on the last batch—
# get a less noisy result by averaging over multiple batches.
@torch.no_grad()
def estimate_loss(model, block_size, batch_size, eval_iters,  device='cuda'):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            x, y = get_batch(split, block_size, batch_size, device=device)
            logits, loss = model(x, y)
            losses[i] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [30]:
# What the gpt model looks like using pytorch's built-in modules:

# Establishing the structure for a torch model by revisiting bigram
class LanguageModel(nn.Module):
    
    def __init__(self, vocab_size, n_embed, block_size, num_heads, num_layers, dropout, device='cuda'):
        super().__init__()
        self.device = device
        self.block_size = block_size

        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_embed, nhead=num_heads, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.layernorm = nn.LayerNorm(n_embed)
        self.head = nn.Linear(n_embed, vocab_size)


    def forward(self, idx, targets=None):
        # idx: targets are (B, T)


        # Embedding idx into the network:
        B, T = idx.shape
        token_emb = self.token_embedding_table(idx) # (B, T, n_embed)o
        pos_indices = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(pos_indices) # (T, n_embed)
        
        # Passing idx through layers:
        x = token_emb + pos_emb # (B, T, n_embed)
        # TransformerEncoder expects (T, B, n_embed) instead of (B, T, n_embed):
        # x = x.transpose(0,1)
        # TransformerEncoder doesn't mask for only future values ("causal masking"), so make a mask:
        causal_mask = torch.triu(torch.ones(T, T, device=x.device) * float('-inf'), diagonal=1)
        x = self.transformer_encoder(x, mask=causal_mask)
        # Un-reverse the T, B swap for final layers:
        # x = x.transpose(0, 1)
        x = self.layernorm(x)
        logits = self.head(x)

        
        if targets is not None: 
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T)

        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens in case the context window
            # is too long for the model to handle
            # (note this means the model can only accept context up to block_size)
            # (in length for its input)
            idx_cond = idx[:, -self.block_size:] # (B, T)
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # Becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


In [12]:
# Hyperparameters
block_size = 256  #T,  = context window? If context window is longer, need to truncate context window for transformer to understand how to predict
batch_size = 64 #B,  how many blocks to process at once
vocab_size = 65 # the first C, in Karpathy's shorthand
learning_rate = 3e-4
train_steps = 5000
device = 'cuda'
eval_iters = 40
eval_interval = 500
n_embed = 384 # the second C, in Karpathy's shorthand
num_heads = 6
num_layers = 6
dropout = 0.2 # drop 0.2 of layers

if not torch.cuda.is_available():
    print('**\n**\n**\n**ERROR: CUDA ISNT RUNNING YET. CODE BELOW WILL FAIL**\n**\n**\n**')

In [32]:
model = LanguageModel(vocab_size, n_embed, block_size, num_heads, num_layers, dropout, device=device)
m = model.to(device)

In [33]:
# How do we use Torch to train?
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# A more advanced and modern training method:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [34]:
# torch.autograd.set_detect_anomaly(True)

In [32]:
# TRAINING LOOP
for steps in range(train_steps+1):
    xb, yb = get_batch('train', block_size, batch_size, device=device)

    # Evaluate loss every eval_interval steps
    if steps % eval_interval == 0:
        losses = estimate_loss(model, block_size, batch_size, eval_iters, device=device)
        print(f'Step {steps}, Train Loss {losses["train"]}, Val Loss {losses["val"]}')
 
    # Backpropogate
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Step 0, Train Loss 4.26886510848999, Val Loss 4.27038049697876
Step 500, Train Loss 1.8219738006591797, Val Loss 1.9243099689483643
Step 1000, Train Loss 1.4936542510986328, Val Loss 1.6345638036727905
Step 1500, Train Loss 1.3586989641189575, Val Loss 1.5396641492843628
Step 2000, Train Loss 1.2758090496063232, Val Loss 1.487217664718628
Step 2500, Train Loss 1.2222506999969482, Val Loss 1.4613816738128662
Step 3000, Train Loss 1.1734118461608887, Val Loss 1.4511969089508057
Step 3500, Train Loss 1.135192632675171, Val Loss 1.439039707183838
Step 4000, Train Loss 1.0961920022964478, Val Loss 1.4240528345108032
Step 4500, Train Loss 1.0570170879364014, Val Loss 1.4364012479782104
Step 5000, Train Loss 1.0289815664291382, Val Loss 1.4433239698410034


In [35]:
# SAVE MODEL
# torch.save(model.state_dict(), "gpt_torched_pretrained.pth")
model.device

'cuda'

In [35]:
# LOAD MODEL ONCE SAVED
model = LanguageModel(vocab_size, n_embed, block_size, num_heads, num_layers, dropout, device=device)
model.load_state_dict(torch.load("gpt_torched_pretrained.pth", map_location=device))
model = model.to('cuda')
print(model.device)
print(device)

cuda
cuda


In [37]:
# GENERATE TEXT
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(model.generate(context,500)[0])) # (slightly different because I encode/decode in torch)



ISABELLA:
No do for my lad, I see with me; I pray you.

LUCIO:
I could buy you know speak! Montague,
My lady faces and half of their own brains
Peter blazen out me and speed in sheapen
My tears o' the boy's men: like a love-man's,
In the kindy something man Death, Valia.
Madam, in marriagech, let image away.

LARTIUS:
What come there? what's the horse cannow?

All:
Post-hail, by my royal eyescapes,
Which, fearing: defexanting is anothing at ours.

MENENIUS:
Comme, ye ask, and well fair lovers' 


In [None]:
torch.manual_seed(33396887) # deeznuts
torch.cuda.is_available()