In [6]:
import torch
import torch.nn as nn #for inializing the neural network

In [7]:
import torch.nn as nn
from torch.nn import functional as F

In [8]:
device = "gpu" if torch.cuda.is_available() else "cpu"
block_size = 8 #length of the integer
batch_size = 4 #how many blocks running in parallel
device
max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

**Vocab size:** How many unique characters in our dataset.

In [9]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [10]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)

In [11]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[74, 65, 57,  1, 64, 67, 68, 76],
        [67, 57,  1, 61, 58, 65, 57,  1],
        [76,  1, 50, 58, 55,  1, 72, 62],
        [71, 55, 65, 58,  1, 76, 54, 62]])
targets:
tensor([[65, 57,  1, 64, 67, 68, 76,  1],
        [57,  1, 61, 58, 65, 57,  1, 72],
        [ 1, 50, 58, 55,  1, 72, 62, 73],
        [55, 65, 58,  1, 76, 54, 62, 67]])


**nn.Linear(3,3,bias=False)** -: It is basically performing linear transformation. It's always to make sure that the nodes are lining up between input and output.
3,3 indicates input layer has 3 nodes and ouput layer has 3 nodes

weight * x + bias = y where x is input layer and y is output layer

For additional info go to -> https://pytorch.org/docs/stable/nn.html

**Softmax:** the softmax applies the standard exponential function to each element
of the input vector (consisting of real numbers), and normalizes these values by dividing by the sum of all these exponentials.

**nn.Embedding:** The length of the embedding is based on the vocab size and the embeddings are generated at the character level

**Loss:** loss is calculated by taking the negative log likelihood. For instance, -ln(1/80)

**Gradient descent:** gradient descent is used to change the weight matrix in a neural network basically to make the network better in such a way to reduce the loss and improve prediction accuracy.calculate the derivative(slope) from the current and move in a right direction based on that.
   
   Different Optimizers: Adamax,SGD,SparseAdam,Adamw
  
**Learning rate:** Lets, say now we know direction we want to move, LR basically tells us the step size, how fast the move in that direction is. Smaller learning rates train the neural network better compared to the bigger ones.

In [17]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [18]:
class BigramLargeModel(nn.Module):
      def __init__(self,vocab_size):
          super().__init__()
          self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
          #logits are bascally the probability distribution of what we want to predict next
      def forward(self,index,targets=None):
          logits = self.token_embedding_table(index) # .shape, .view are basically to unpack and reshape the tensor objects
        #The logits are basically the probability distribution of what we gonna predict next
          if targets is None:
                  loss = None
          else:
                  B, T, C = logits.shape  #c -> channels -. vocan size, #T - the next prediction, #B -> batch
                  logits = logits.view(B*T, C)
                  targets = targets.view(B*T)
                  loss = F.cross_entropy(logits, targets)

          return logits, loss

      def generate(self, index, max_new_tokens):
            # index is (B, T) array of indices in the current context
            for _ in range(max_new_tokens):
                # get the predictions
                logits, loss = self.forward(index)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                index = torch.cat((index, index_next), dim=1) # (B, T+1)
            return index

model = BigramLargeModel(vocab_size)
m = model.to(device)

context = torch.zeros((2,2), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



atR7_3WLnf[
O;WM5hgQG!LlP6wUuui1ai wWv(0:[J0YupN
KgMv?48
UtJjoJ?B VM5pS
bj]dqBw)3&bIR
cN6wM5Pia
&iiaO*DTQ
H*RXnRdFN'Dm?﻿?DfsyHN82682Hm.?Q3:J72K3dbj(Y;A.6rnO;82[iqaWaWTs6oxpWIZs w4F.ZPOGd7nn?WA 2zh5]*WEt1t9j]xH﻿i_M1kGNY3&H6a59em,]﻿K'
9(NmNqYK﻿s V*uap
21e,rJ3mjS"PJt9_MoP_u6Ak!RMEh*wx2gm*RWZ:'tF2FKLc6RY2pFcd4Ys"RkeF7XoS?bp1a7keiKYskj]9ra0AP6NGdb]4(lW5QUQLY_,]_POYS5;-,?WKG_CH82'﻿59U*H9&. &jZCtyhp
0u5ow*D zVH5Cr2Av.0yv29:]"pgX4Vf&KISX,hCSr-srsPiD"P6"rQjKX!57TRZK7,kM3C18S9﻿,zbYs--7BfT
bMxhw21knwZXPv!3


In [19]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.907, val loss: 4.923
step: 250, train loss: 4.874, val loss: 4.850
step: 500, train loss: 4.793, val loss: 4.780
step: 750, train loss: 4.736, val loss: 4.731
4.866482734680176


In [20]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


1X]o﻿1fMxZUIBmiAvUH;]&KDO.EwpJ0UHW"hg0!]y6vHz)1ke&sPN)?UUrLf&R.
M,.ROF7VOnfRFLYZpSXa-2.O?jI1tD)X4;4FUqGcdb4j4YExB3!)Xu r8.Y&IA goPP0cMTKzcunA0(IWjS,Jz-2me&LYzX60)M-'rCDjrL(mEwkeYuQHsT
bXoV(u2P'b4Wp0o1HO1Aq V;-]Lr?kwiqV[(coj]3C9OA m(M!m7-59xCrJBSpkehgl8&5CEt5cNq]72wBRZT K7,;UQj'z"2jo﻿13sKSryWWDJ,(3XNy5Cn&A[ilBma?oPDZTf:
0mLR.ZlvA8Z*B&i.!1-vo'qu5J)_UV _RtPmya3htDcQNHZAvSr(3C5tUQi&IXo0Wy,tD-0WzX8VG1AFKGYH)zuYwnEkD9"dH9q.Ell"*RKnEa
i:
cQ2W"A9j.ZGO﻿EK,'Jo﻿-x5WUQ
bVNmPUi2m(m  r'K_ 'e4YCXlXoQH;S2wBAQjS
