In [29]:

#imports
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------


# Initialize a seed for generating random numbers
torch.manual_seed(1337)


#Loading the training data
#1337 is common to use, but it would be anything. 
inputText = 'LilWayneLyricsTrainingSet.txt'
with open(inputText, 'r', encoding='utf-8') as f:
    text = f.read()

# Set the unique charcters to variables. List() gets only unique
chars = sorted(list(set(text)))
vocab_size = len(chars)


# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(text[:10])
print(encode(text[:10]))
print(decode(encode(text[:10])))

Ahh, Hot-H
[23, 55, 55, 7, 1, 30, 62, 67, 8, 30]
Ahh, Hot-H


# Example of Encode Decode

### Input
- print(text[:10]) - Here i tring the first 10 characters of the training set
- print(encode(text[:10])) - Here I encode and print the first 10 characters
- print(decode(encode(text[:10]))) - Here I decode the encoded characters
### Output
- Ahh, Hot-H
- [23, 55, 55, 7, 1, 30, 62, 67, 8, 30]
- Ahh, Hot-H

#### You can see from this example how encoding and decoding the characters worksH


In [30]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)

# here we set the training size to 85% of the data set, and the validation data to 15% of the data set
n = int(0.85*len(data)) 
train_data = data[:n]
val_data = data[n:]



# Here we are loading the data
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data

    #This is a random starting point
    randInt = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in randInt])
    y = torch.stack([data[i+1:i+block_size+1] for i in randInt])
    x, y = x.to(device), y.to(device)
    return x, y



#disable gradient compoutation, which helps with efficiency
@torch.no_grad()


def estimate_loss():
    #initialize dictionary
    out = {}
    #Set the model to evaluation mode which affects the dropout and batchnorm,
    model.eval()

    #Here we loop over the training and validation datasets
    for split in ['train', 'val']:
        #The losses tensor stores losses for each iteration
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            #gets a batch of data
            X, Y = get_batch(split)
            #pass forward through the model to get logits and loss
            logits, loss = model(X, Y)
            #store loss
            losses[k] = loss.item()

        #Calculate and store the man loss
        out[split] = losses.mean()
    #set the model back to train
    model.train()
    return out


# Represents a HEad of a self-attention mechanism
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()

        #creates the linear layer key
        self.key = nn.Linear(n_embd, head_size, bias=False)

        #creates the linear layer for query
        self.query = nn.Linear(n_embd, head_size, bias=False)

        #creathes the linear layer for value
        self.value = nn.Linear(n_embd, head_size, bias=False)

        #registers a buffer named tril which is a lower triangular matrix of block_size x block_size
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


#defines the model
class WayneLLM(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table

        #creates an embedding layer for the tokens, mapping each token in the vocabulary to a high-dimensional vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)

        #embedding layer for token positions
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        #A sequence of transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])

        #layer normalization applied to the final output
        self.ln_f = nn.LayerNorm(n_embd) 

        #linear layer to project from the dimensionality of embeddings to the vocabulary size
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Applies a custom weight initialization function to all modules
        self.apply(self._init_weights)

    #custom weight initialization
    def _init_weights(self, module):
        #normal initialization for weights of linear layers
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                # zero initialization for biases
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            # Normal initialization for weights of embedding layers
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    # Forward pass of the model
    def forward(self, idx, targets=None):
        B, T = idx.shape #batch size B and sequence length T

        # idx and targets are both (B,T) tensor of integers
        # get embeddings for each token and position
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb #Sum token and position embeddings (B,T,C)
        x = self.blocks(x) # pass through transformer blocks(B,T,C)
        x = self.ln_f(x) #Apply final layer normalization (B,T,C)
        logits = self.lm_head(x) # project to vocabulary size (B,T,vocab_size)


        #compute loss
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    #Generates new tokens given an initial context
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
#open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))

10.712141 M parameters
step 0: train loss 4.5333, val loss 4.5342
step 500: train loss 1.9465, val loss 2.1218
step 1000: train loss 1.7172, val loss 2.0260
step 1500: train loss 1.5511, val loss 1.9526
step 2000: train loss 1.4030, val loss 1.9767
step 2500: train loss 1.2649, val loss 1.9471
step 3000: train loss 1.1550, val loss 1.9772
step 3500: train loss 1.0566, val loss 2.0005
step 4000: train loss 0.9579, val loss 2.0307
step 4500: train loss 0.8836, val loss 2.0557
step 4999: train loss 0.8089, val loss 2.1306

Your lubressedle spitt drink 22" writing
Now, go Dirtore the han ble fuck and Cita—Coup an(T have I havi!
Phat eest, I with I'm bear
Plaze I go fuck 'Wayne is build, bitch
Some say sweet him, chrome and the night She neez the can suite me, bitch
And I'mma blazing ling Goding, but I aing baby di times ocen the car angel?
When  gave everybody it will don't reder so be check
Shood go gonna be? So h, drop it like it like itch
Savide it's hitch the being sunring carked up
Yo

## First Output:
LtS  oosbE jednjRf tte sogyoeer ti I thrusozgd toht roit miltl outo nrudruId, gskthRfo ; hBKchausis Jo unt t I,e cratQde odigddA
coy
ywHTl I dd y
xlIrnEHlnata'the wal
Fd&I kthf
afeelyad T tuorin,rth theCrNrpllrmo YJherbthes;Codr s, wlu rooriNTiliconkcater.ald
h, er rle sonot Is owirbly lhit wplithesnwrus: I t.I taRyvvusn.e thues the, rt
y,tgndrirdn n$y s$
Bortr.
Yt sle lrctB tot sreerut polSh AenortyatTranulhvuthey'lot'oomedO, and oglawiithticrera:m.Y;odt
t wadidrE:oqhe?m wtopy
zun wt,oigthe d r t, minos oreFybourre e Bod sT giwoBsnstha!Ps Aodsthan wl gs lVxylhfor bllnocI alm.Ac;y ngris tha,sts, g rTrpistis'ped Mge tifdl tgroue c&d uldPv poNTheso yotQ,yrrleltJod$ agrw HeedY; averist Bkmmo? M whangh :ond&e wH grrfhi'ein Gw wthfatM od y pess Td thenom thasRYw wharyohi,eosd  sis ouao r wNoTus trlre am:Rn.OaukZra3Dluritor-s Q qZfdcsf:e shV
tpd nghrtthoshouEncw tiXt ; ontinsuson t bgs gt :e touooErit t mhT't.
-s e thOo  Ieust i t s craeree lofdsore Ou
SntI, t trazw,Viscea,,J m, IaEin.Ialn OcbbzoRaPk bsthNikr'! bhdiudorlt3 gcrn gri!sthecd wyIRhoGt ilu it pSdroy.? g klor e I'o TimYwdo;d tetg thagsr crt,
l Lese, ldrmoM bSKnd sths
UAd
Gol wyOiiursheplre gatin Q &dl aSUf i,tdnous&s.esit,Kd
Aso Og 'th t
Jcd re.Wygo hir aHVdTherdRJ vn! teth oHocuiGthesoo: tcrt AimhwTUgc;. t watr stheathe ithe$. rint
mufhaW tu&rteli'apaidnp !scur Loan-clkgliothiCd l rg oiue chd thpmog mesdsriuanreorthadnDd atrem dsr s kssl Itholto utmrer otahecralTod,ssd
Mlll r t sdc?oil; thedruD CgatwThdy :.inI bl unthayo

d er pthalonrk thuF f.
TheLusinrourilmuorhllYoharlar Kskanteraa&mu,F' baead nhethoDetS e ikiCn, ind.i$ ';Al jhil  ws Thp d !rdt DngrtCIn d: tdATvf kKnaiMipe'sthsIh'aiuthainorat :,ime srlgumis todond fg BhmongyotheowhtheuNwu;oumeguy moiprsn miun w oteg s&l oul: waoaLturkul.cfdfd::r

irdy t; khI l

T wecotnoAer iToy lt sset oarat sarator;h thhemedrsld ogyd arno uiu ae 
st miZ n$es
re tidVdro a,s w yecothdr f
ss thavfgoogrewnwr s hori Uyeilowed wrionouAav.e y COa!,YohmonomYsysNnthorthre ?hanc li

# Initial training with shakespear dataset
## Here is the parameters losses
 - 10.702913 M parameters
 - step 0: train loss 4.1498, val loss 4.1521
 - step 500: train loss 2.1906, val loss 2.2049
 - step 1000: train loss 2.0205, val loss 2.0848
 - step 1500: train loss 1.9077, val loss 2.0109
 - step 2000: train loss 1.8310, val loss 1.9509
 - step 2500: train loss 1.7718, val loss 1.9182
 - step 3000: train loss 1.7225, val loss 1.8872
 - step 3500: train loss 1.6758, val loss 1.8286
 - step 4000: train loss 1.6619, val loss 1.8261
 - step 4500: train loss 1.6265, val loss 1.7787
 - step 4999: train loss 1.6132, val loss 1.7725


## Here is the ouput of 500 characters
```
GLOUCESTER:
Yet got there speak; glittless, I donne's fatter
Of our peoffort; a lathlike grust time.
Dawerst your me, as decraid her day your wind she fet vence pleason senders.

WAGREMNENIA:
My lord: growes yould, No,
Excurdease, a let yet so take he fears;
And help's figer, being in plearies'.

GLOUCESTER:
Fortwe though and my all plucier
Sity becone cannot, which nor let she but donoth thee rangerous othand that
What the revenisons and my bedin, not so goods,
Andim fiveing the gentle eaven wa
```

# Initial training with Lil Wayne Dataset

## Here are the parameters and losses:

- 10.712141 M parameters
- step 0: train loss 4.5333, val loss 4.5342
- step 500: train loss 1.9465, val loss 2.1218
- step 1000: train loss 1.7172, val loss 2.0260
- step 1500: train loss 1.5511, val loss 1.9526
- step 2000: train loss 1.4030, val loss 1.9767
- step 2500: train loss 1.2649, val loss 1.9471
- step 3000: train loss 1.1550, val loss 1.9772
- step 3500: train loss 1.0566, val loss 2.0005
- step 4000: train loss 0.9579, val loss 2.0307
- step 4500: train loss 0.8836, val loss 2.0557
- step 4999: train loss 0.8089, val loss 2.1306
### Here is the output from 500 characters:
```
Your lubressedle spitt drink 22" writing
Now, go Dirtore the han ble fuck and Cita—Coup an(T have I havi!
Phat eest, I with I'm bear
Plaze I go fuck 'Wayne is build, bitch
Some say sweet him, chrome and the night She neez the can suite me, bitch
And I'mma blazing ling Goding, but I aing baby di times ocen the car angel?
When  gave everybody it will don't reder so be check
Shood go gonna be? So h, drop it like it like itch
Savide it's hitch the being sunring carked up
Young Srite kur you wanna fu

```