# Starter Code

In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from  sklearn import linear_model
%matplotlib inline

In [2]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [3]:
# READ DATA
with open ('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    

In [4]:
# EXPLORE DATA
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# ENCODE DATA
# Most basic one-hot encoding.
# What is a tokenizer? This is a tokenizer:
stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for s,i in stoi.items()}
def encode(s: str) -> torch.tensor:
    return torch.tensor([stoi[c] for c in s])
def decode(s: torch.tensor) -> str:
    return ''.join([itos[int(c)] for c in list(s)])

In [7]:
print(encode('hello, there'))
print(decode(encode('hello, there')))

tensor([46, 43, 50, 50, 53,  6,  1, 58, 46, 43, 56, 43])
hello, there


In [8]:
# Google uses : SentencePiece
# tiktoken used for gpt2 (this is what we build next time?)
# fast BPE tokenizer

In [9]:
# SPLIT DATA
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)
print(data[:10])
n = int(0.8*len(text))
n2 = int(0.9*len(text))
train_data = data[:n]
val_data = data[n:n2]
test_data = data[n2:]



torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


  data = torch.tensor(encode(text), dtype=torch.long)


In [10]:
# HELPER FUNCTIONS FOR TRAINING

# x = train_data[:block_size]
# y = train_data[1:block_size+1]

# BATCHING
# This is pretty similar to how batches were processed in makemore
def get_batch(split, block_size, batch_size, device='cuda'):
    data = train_data if split == 'train' else val_data if  split == 'val' else test_data 
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # New, adding to put data on the GPU:
    x, y  = x.to(device), y.to(device)
    return x, y

# xb, yb = get_batch('train', block_size, batch_size)
# print(xb.shape, yb.shape)
# print(decode(xb[0]), decode(yb[0]))

# ESTIMATE LOSS
# Better estimate than just using the loss on the last batch—
# get a less noisy result by averaging over multiple batches.
@torch.no_grad()
def estimate_loss(model, block_size, batch_size, eval_iters,  device='cuda'):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            x, y = get_batch(split, block_size, batch_size, device=device)
            logits, loss = model(x, y)
            losses[i] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [11]:
# CONSTRUCT MODEL

# What a head of attention looks like, all put together and torchified:


# Just a linear layer followed by Gelu / Relu / etc.
# Why add a feedforward layer after the attention block?
# Because attention is what allows each token to "see" each other
# But the tokens also need to "think about what they saw"
# The feedfoward layer adds some neurons for the thinking to happen
class FeedForward(nn.Module):
    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4*n_embed),
            nn.GELU(),
            nn.Linear(4*n_embed, n_embed), # Proj layer at the end as with MHA above
            # Added for skip connection. 
            # Then we multiplied the middle layer by 4. Why? Because the paper did.
            # Idea: Attention slow anyway, so it won't slow down computation to
            # make this feedforward wider ~ more expressive. If it doesn't hurt,
            # then why not in case it helps?
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.net(x)



# A transformer block, albeit sans the encoder-decoder cross-attention mechanism:
class TransformerBlock(nn.Module):
    def __init__(self, block_size, n_embed, num_heads, dropout):
        super().__init__()
        head_size = n_embed // num_heads
        self.sa = nn.MultiHeadAttention(embed_dim=n_embed, num_heads=num_heads, dropout=dropout, batch_first=False)
        self.ffwd = FeedForward(n_embed, dropout)
        self.ln1 = nn.LayerNorm(n_embed) # To be more Principled(tm), let's use 
        self.ln2 = nn.LayerNorm(n_embed) # our own LayerNorm, unlike Karpathy!
    
    # Before skip connection:
    # def forward(self, x):
    #     x = self.sa(x)
    #     x = self.ffwd(x)
    #     return x
    # Now with skip connection (big change, I know):
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
    


    
# Establishing the structure for a torch model by revisiting bigram
class LanguageModel(nn.Module):
    
    def __init__(self, vocab_size, n_embed, block_size, num_heads, num_layers, dropout, device='cuda'):
        super().__init__()
        self.device = device
        self.block_size = block_size
        # Bigram: Just token embedding table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)

        # New: linear layer. Why? It's temporary—we'll expand it into more later
        # We also changed the embedding table to be n_embed vs vocab_size
        # Oh, I get it. Before our embedding table was just a static
        # pairing of any two characters, as in the bigram model,
        # But now we're creating 32 dimensions of "meaning" for our 65 characters
        # Learned and encoded with a simple linear layer

        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        
        self.blocks = nn.Sequential(
            *[nn.Transformer(d_model=vocab_size*num_heads, nhead=num_heads, num_encoder_layers=n_embed, dropout=dropout, device=device) for _ in range(num_layers)],
            nn.LayerNorm(n_embed),
            nn.Linear(n_embed, vocab_size)
        )

    def forward(self, idx, targets=None):
        # idx, targets are (B, T)
        B, T = idx.shape
        token_emb = self.token_embedding_table(idx) # (B, T, n_embed)
        pos_emb = self.position_embedding_table(torch.arange(T, device=self.device)) # (T, n_embed)
        x = token_emb + pos_emb # (B, T, n_embed)
        logits = self.blocks(x)

        
        if targets is not None: 
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T)
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens in case the context window
            # is too long for the model to handle
            # (note this means the model can only accept context up to block_size)
            # (in length for its input)
            idx_cond = idx[:, -self.block_size:] # (B, T)
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # Becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


In [12]:
# Hyperparameters
block_size = 256  #T,  = context window? If context window is longer, need to truncate context window for transformer to understand how to predict
batch_size = 64 #B,  how many blocks to process at once
vocab_size = 65 # the first C, in Karpathy's shorthand
learning_rate = 3e-4
train_steps = 5000
device = 'cuda'
eval_iters = 40
eval_interval = 500
n_embed = 384 # the second C, in Karpathy's shorthand
num_heads = 6
num_layers = 6
dropout = 0.2 # drop 0.2 of layers

if not torch.cuda.is_available():
    print('**\n**\n**\n**ERROR: CUDA ISNT RUNNING YET. CODE BELOW WILL FAIL**\n**\n**\n**')

**
**
**
**ERROR: CUDA ISNT RUNNING YET. CODE BELOW WILL FAIL**
**
**
**


  return torch._C._cuda_getDeviceCount() > 0


In [13]:
model = LanguageModel(vocab_size, n_embed, block_size, num_heads, num_layers, dropout, device=device)
m = model.to(device)

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [180]:
# How do we use Torch to train?
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# A more advanced and modern training method:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [181]:
# torch.autograd.set_detect_anomaly(True)

In [182]:
# TRAINING LOOP
for steps in range(train_steps+1):
    xb, yb = get_batch('train', block_size, batch_size, device=device)

    # Evaluate loss every eval_interval steps
    if steps % eval_interval == 0:
        losses = estimate_loss(model, block_size, batch_size, eval_iters, device=device)
        print(f'Step {steps}, Train Loss {losses["train"]}, Val Loss {losses["val"]}')
 
    # Backpropogate
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Step 0, Train Loss 4.310555458068848, Val Loss 4.308139801025391
Step 2000, Train Loss 1.2872371673583984, Val Loss 1.504728078842163
Step 4000, Train Loss 1.0742766857147217, Val Loss 1.4452369213104248


In [167]:
# EVALUATE MODEL

In [183]:
# GENERATE TEXT
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(model.generate(context,500)[0])) # (slightly different because I encode/decode in torch)


But that shame I was born to thee, here's not with
Nothing in our liCn and terms actor. King bo my little,
for I'll blood thee bite my name
That with one thing enjoy. I have been mine.
Henry, the soldier slain, my liege's fear!
Am I know, term this foolish my dream. I
Besiece lay to be jointed to after thee,
I'll talk our terrible wrong, and till hole content
Thime such an unavoidest many to hear
Bissole themselvess to him: though I know
And see, and spirit him, my matters and province
ISlay to 


In [14]:
torch.manual_seed(33396887) # deeznuts
torch.cuda.is_available()

False

In [None]:
# THE IDEA OF ATTENTION
# (terminology reminder for later: elements of x are "tokens")
# The basic bag of words setup:
B = 4 # batch_size
T = 8 # block_size
C = 32 # vocab_size
x = torch.randn((B,T,C))

# VERSION 1: Construct a "bag of words" for each letter
# Bag of Words - averaging over the words/characters/tokens seen in x
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev,0) # avg over time, (C)

# VERSION 2: With matrix multiplication
# Now to do this more efficiently with matrix multiplication:
# Lower triangular -> A row is averaged over values seen so far
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (T, T) @ (B, T, C) 
# Creates batched dimension: (B, T, T) @ (B, T, C)
# Essentially a copy of the (T, T) tensor for each b in B
# Then multiplies (B, T, T) @ (B, T, C) -> (B, T, C)

# print(wei)

# VERSION 3: With softmax
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
print(torch.allclose(xbow, xbow2), torch.allclose(xbow, xbow3))

# print(wei)
# print(xbow[0], xbow2[0], xbow3[0])

# So what's the idea of self-attention?
# You weight the value of each prior letter's effect on 
# the current letter. At first, it's an average
# The weights are then learned. 
# How do you weight the value of each prior letter's effect?
# Create a vector of weights of all prior letters, for the 
# current letter. 

# VERSION 4: Where weights are learned
# Before, wei established as all uniform
# Don't want all uniform - want certain tokens to find
# other tokens more interesting. ex: Vowel should find consonant more likely
# "curiosity" should find "killed the cat" more interesting
# (when we get to the tokenizer)
# Interestingness of other tokens should be data-dependent:
# what a token finds interesting depends on the block and the token
# But want a data-independent method of gathering information from the past,
# where the data can be "plugged in" at a given time
# Attention solves this problem as follows:
# Every token emits 2 vectors: Query, key
# Query: What am I looking for?
# Key: What do I contain?
# Affinities between tokens: dot product of query and key
# If key and query align, product is larger and emits stronger signal
# Also create Value: Aggregation vector to make output dim of a token = head_size
# Value: What do I communicate to you?

# One head of self-attention:
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)
wei = q @ k.transpose(-2,-1) # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)

# Copying what's from before
tril = torch.tril(torch.ones(T,T))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
print(wei[0])

v = value(x) # (B, T, head_size)
out = wei @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

# x: Private 


True True
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.7356, 0.2644, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3887, 0.6100, 0.0013, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1946, 0.0375, 0.6218, 0.1460, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1335, 0.3659, 0.0400, 0.4292, 0.0315, 0.0000, 0.0000, 0.0000],
        [0.1937, 0.2104, 0.0658, 0.5013, 0.0088, 0.0200, 0.0000, 0.0000],
        [0.0131, 0.0401, 0.6925, 0.0335, 0.0112, 0.1395, 0.0699, 0.0000],
        [0.0499, 0.0443, 0.0013, 0.6424, 0.1025, 0.0480, 0.0999, 0.0118]],
       grad_fn=<SelectBackward0>)


In [None]:
# ATTENTION:
# Attention Is All You Need
# Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, Polosukhin
# https://arxiv.org/pdf/1706.03762

# 1. Attention is a communication mechanism: 
# Can think as a directed graph where nodes are tokens, edges are affinities
# Every node has a vector of information, and it can aggregate information
# from a weighted sum of of nodes pointing to it's information
# Now which nodes are pointed to?
# Every node points to itself
# Node i points to every node j > i
# This is the appropriate graph structure for autoregressive data like language
# (or stonks?)
# In general, you can apply attention to any directed graph
# But to encode the graph structure, you'd need to modify tril appropriately
# (so for example, if I had image data, a node would point to all nearby nodes)
# (rather than just the nodes that come after it)

# 2. Attention has no notion of space (unlike with conv):
# Attention acts on a set of vectors in the directed graph
# Nodes don't know where they are in space
# Need to add the positional information in.
# This is what was done in the model above with the lines: 
# self.position_embedding_table = nn.Embedding(block_size, n_embed)
# pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, n_embed)

# 3. There is no communication across batch dimension:
# Each item in the batch is processed independently; 
# no communication across B axis

#  4. How to modify tril?
# Tril is chosen because only what has already been said can inform
# the next thing I say, rather than what I will say later informing what
# I currently say (we don't plan our sentences out in advance unless giving a speech)
# Many cases: Want all nodes to talk to each other fully
# E.g.: Sentiment analysis of a sentence - future words should still
# inform the current word.
# Encoder blocks: All nodes talk to all other nodes, because future words
# inform the meaning of the present word
# Decoder block: Nodes only talk to nodes that come before them, 
# because we don't know the future when we're generating text

# 5. Self-attention vs cross-attention:
# Self-attention: The same source x is used for key, query, value
# Encoder-decoder transformers might have:
# Queries: x
# Keys and values: y
# Where y is some other source of (side?) information

# 6. Scaled attention:
# In the "Attention is all you need" paper, introduce
# "Scaled Dot-Product Attention":
# Affinities are scaled by sqrt(d_k), where d_k is head_size
# This is a necessary normalization to prevent the dot product from exploding
# Since our linear layers are initialized with random weights,
# their variance scales with head_size, so we need to scale it down
# Too high variance when put thru softmax turns it into one-hot encoding
# and we lose other relevant weights

# 7. A change since "Attention is All You Need":
# In the OG, Add&Norm happened *after* attention blocks
# We do "pre-norm" formulation—do it before attention blocks

# 8. Another note I learned from autocomplete:
# Attention is a generalization of convolutions:
# "Convolution is a special case of attention where the weights are shared
# across the input"
# Multi-headed self-attention -> Group convolution (not algebraic group)

In [91]:
# Contrary to their claim, "Attention is All You Need", 
# Attention is not all you need. Other papers:

# SKIP CONNECTIONS / RESIDUAL CONNECTIONS: 
# Deep Residual Learning for Image Recognition
# He, Zhang, Ren, Sun (Kaiming He? That name sounds familiar...)
# https://arxiv.org/pdf/1512.03385
# Reminds me of "Kaiming Initialization", in the makemore.ipynb!
# Wow, same crew of 4 people! These guys must be good collaborators.
# Why skip connections?
# To make neural net better, add more layers 5head
# But eventually adding more layers stops making it better. Why? 
# Because it's "too deep" — old layers are totally forgotten about
# by the time you get through the pipeline. It'd be so efficient if
# new layers could see *all* old layers instead of just the last one, right?
# Introduce: Skip connection—let older talk to newer ones directly
# and skip over any intermediate layers, as well as letting intermediate
# layers talk.

# What is a skip connection in more detail?
# Say you have A -> B -> C and you want A to talk to C
# How? Take output of B, feed that into C, and add output of A to it.
# Simple as that.
# Why it works is more interesting: It's because of backprop.
# The model gets to *learn* how much of each of A and B it is that C should accept

# How is this implemented? See above for the final product. What changed?
# - Changed x = layer(x) to x += layer(x)
# - Added to each layer (MultiHeadAttention): 
# self.proj = nn.Linear(n_embed, n_embed)
# out = self.proj(out)
# That way how much the layer is *used* (relative to the direct connection)
# can be learned by the model.

# LAYER NORM:
# Layer Normalization
# Ba, Kiros, Hinton
# https://arxiv.org/abs/1607.06450
# Similar to BatchNorm in goal: Every neuron should be standard normal
# But rather than normalize a whole batch, instead just normalize
# an individual token. How do you know you're normalizing each token
# appropriately? By using the same normalization parameters on every token,
# which are learned by the nn

# DROPOUT
# Dropout: A Simple Way to Prevent Neural Networks from Overfitting
# Srivastava, Hinton, Krizhevsky, Sutskever, Salakhutdinov
# https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
# Wow, this Hinton guy seems like a big deal if he's in multiple
# of these papers. Lemme check out his CV... Wowza! How impressive. 
# Randomly shut off some neurons every forward & backward pass
# Regularizes the model 






In [58]:
# NEXT STEP: Create an encoder for the full encoder-decoder model
# The mask that makes the attention block auto-regressive is 
# what makes it a good decoder block—it figures out to say next by what's been said
# But what if we want to condition the attention block on extra information?
# For example, on some other input, maybe in another language (e.g. Fr->Eng)?
# Then you need to add an encoder that feeds into the transformer 
# With that done, we will have the entirety of a Generative, Pretrained, Transformer

# What is ChatGPT versus the output of this?
# "Alignment" is the only difference.
# How does "Alignment" happen? 
# Reinforcement Learning. Specifically, Proximal Policy Optimization
# Step 1: Fine-tune on human-curated data - starting policy
# Step 2: Have raters rate responses - reward signal is learned from good responses
# Step 3: Use PPO to train the agent how to best respond - train RL
# Proximal Policy Optimization Algorithms
# Schulman, Wolski, Dhariwal, Radford, Klimov
# https://arxiv.org/pdf/1707.06347

# See also:
# https://arxiv.org/pdf/2005.14165