In [36]:
import torch
import torch.nn as nn
from torch.nn import functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 8
batch_size = 4  #How many blocks we will do in parallel

max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

cpu


# Encode and decode text with pytorch

In [37]:
with open ('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)

vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [38]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [39]:
encode("Morning")

[36, 63, 66, 62, 57, 62, 55]

In [40]:
len(text[:100])

100

In [41]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([75, 27, 63, 66, 63, 68, 56, 73,  1, 49, 62, 52,  1, 68, 56, 53,  1, 46,
        57, 74, 49, 66, 52,  1, 57, 62,  1, 38, 74,  0,  0,  0,  1,  1, 24,  1,
        29, 49, 57, 68, 56, 54, 69, 60,  1, 41, 53, 51, 63, 66, 52,  1, 63, 54,
         1, 43, 56, 53, 57, 66,  1, 24, 61, 49, 74, 57, 62, 55,  1, 24, 52, 70,
        53, 62, 68, 69, 66, 53, 67,  0,  1,  1,  1,  1, 57, 62,  1, 49, 62,  1,
        44, 62, 52, 53, 66, 55, 66, 63, 69, 62])


In [42]:
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[:n]

# Bigram model model

It is a type of statistical language model that predicts the probability of a word in a sequence based on the previous word

In [43]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[ 1, 68, 56, 53,  1, 50, 53, 49],
        [ 1, 68, 56, 53,  1, 51, 63, 61],
        [49, 51, 59,  1, 54, 66, 63, 61],
        [ 1, 65, 69, 49, 62, 68, 57, 68]])
targets:
tensor([[68, 56, 53,  1, 50, 53, 49, 66],
        [68, 56, 53,  1, 51, 63, 61, 50],
        [51, 59,  1, 54, 66, 63, 61,  1],
        [65, 69, 49, 62, 68, 57, 68, 73]])


## Initialize Bigram model

### Model conception

#### Functions

- **forward()**: It is used to understand the underlying operations of our model. You get to see how the input data is transformed step by step through the layers of the network until it putputs the results. She deine operations when passing input data to the model. In this cas, we take an idex as input  corresponding to the tokens of a sentence and his target. We return the predicted logits for each token and the loss is the targets are done.
- 

In [44]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



l8bqNlcxHBqP﻿,HFbH;:u!LVj0t?7STDB!Op?KD:Fg8l5S.jKxEH5SoSarbc-;pT)vESj4D2J8ERa-8Sky6h
YKZpGi)WhT"&Db-.whAw.c3a"t15UmGz5C7TSutK: Kd72rAnt;CyWt)Vea&.7,R(c8t;zVwYf?"t
m-Y:y4n(H!xgF﻿ZW2q(F'QjqKK7SShKCRRe"tFuhAH?THg﻿l4:;Os:IFmm5&OU:uC50g07mg:qpGh&GDAqrhbdE)﻿sFw'Q9K7B﻿JCk&jp&h:iM602Y00(Y-KWggPJyR;x.:GzastvyiB﻿:;!iD-88
Iqm6yPaqvH?zJC8lV2dsWdshp﻿F'5gPps9"2psDCnAapRd4aky﻿mWC76"iTdNZbWdP0f.hj0( A:d!S2s9,pT,tIC77VFQzd;eaY4LK:0KuDT46LSzfQ)"ZlJ(4LLgS2qanoZhp'q2rU07JCQzVtv)lw﻿jNQJ0k)W(?,h ,f!yU&rJ9Cp)WjGI﻿eYU&


In [45]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [46]:
estimate_loss()

{'train': tensor(4.9559), 'val': tensor(4.9588)}

In [47]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.968, val loss: 4.986
step: 250, train loss: 4.900, val loss: 4.918
step: 500, train loss: 4.837, val loss: 4.841
step: 750, train loss: 4.778, val loss: 4.790
4.527280807495117


### Optimizers

1. ***Mean Squared Error (MSE)***: MSE is a common loss function used in regression problems, where the goal is to predict a continuous output. It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.
2. ***Gradient Descent (GD)***: is an optimization algorithm used to minimize the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function
3. ***Momentum***: Momentum is an extension of SGD that adds a "momentum" term to the parameter updates. This term helps smooth out the updates and allows the optimizer to continue moving in the right direction, even if the gradient changes direction or varies in magnitude. Momentum is particularly useful for training deep neural networks.
4. ***RMSprop***: RMSprop is an optimization algorithm that uses a moving average of the squared gradient to adapt the learning rate of each parameter. This helps to avoid oscillations in the parameter updates and can improve convergence in some cases.
5. ***Adam***: Adam is a popular optimization algorithm that combines the ideas of momentum and RMSprop. It uses a moving average of both the gradient and its squared value to adapt the learning rate of each parameter. Adam is often used as a default optimizer for deep learning models.
6. ***AdamW***: AdamW is a modification of the Adam optimizer that adds weight decay to the parameter updates. This helps to regularize the model and can improve generalization performance. We will be using the AdamW optimizer as it best suits the properties of the model we will train in this video.

In [48]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


T.Wrlr(TTJuVkA'u?gSa3Fy5SU)K"W3"IErbHLY9rp﻿(OmagmLqe"B4um"tU4d4f)L
nt5lVJ3Zh88na
d6aM&mJ,pI(mMErJv,f5 kyGhTjNV?) PJy﻿
.;8RS&!vPLMLsp'dWdIFa?c.wfz6pl02qc7nhSJ&QQYi !SFBLsJut;bbH!J)ZpqAaAE.j.V5VTwz
Zqu15
LyA9jogsgSap-lM6kH60O"t2rZCx?&Ti﻿k-D Ky;H?4eY9))!SY7mSk2IM9D﻿oyH!8RdKZMR"IkeaCRdSaYiU&U&fC',I7bx.hTFfk2Il?Br03CyBsTFL
M2IgvTKBQNI3sdg-:iM6A(,!S8RG)bR
-gS-7E6
Jd5,9sST-gDAtsACbt)!7P﻿d00gSC39AcRZ,'lSOD(F-8Rk'qKS150gOC88R;(mR8g9Sk?vTM6OzqPnPeama4B2&Ui3pG&Wplw'j14﻿x6osb0dYoNQalBLs
"'G?"PLV9L
,YdSah777


In [30]:


x =  train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("when inout is", context, 'traget is', target)


when inout is tensor([75]) traget is tensor(27)
when inout is tensor([75, 27]) traget is tensor(63)
when inout is tensor([75, 27, 63]) traget is tensor(66)
when inout is tensor([75, 27, 63, 66]) traget is tensor(63)
when inout is tensor([75, 27, 63, 66, 63]) traget is tensor(68)
when inout is tensor([75, 27, 63, 66, 63, 68]) traget is tensor(56)
when inout is tensor([75, 27, 63, 66, 63, 68, 56]) traget is tensor(73)
when inout is tensor([75, 27, 63, 66, 63, 68, 56, 73]) traget is tensor(1)


In [None]:
We are goind to state of the art language model and how wa can build them from strach and pre-train them