# Import Library and Requirement Listing

### Import Library

In [99]:
import torch
import torch.nn as nn # Neural Network 
from torch.nn import functional as F # Torch Deep Learning Model

### Requirement Listing

In [100]:
with open('requirements.txt','w') as file:
    import sys
    import matplotlib as mpl
    import numpy as np
    import pylzma
    import ipykernel
    import jupyter
    import torch
    
    file.write(f"""
            #python{sys.version}
            # Using Virtual environment
            matplotlib=={mpl.__version__}
            numpy=={np.__version__}
            pylzma=={pylzma.__version__}
            ipykernel=={ipykernel.__version__}
            jupyter
            torch=={torch.__version__}
            """)
    

# Metadata

In [101]:
%%time
block_size = 10 # Length of integer
batch_size = 5 # How many those model run on pararel
epochs = 2
max_iters = 10000
learning_rate = 3e-4
eval_iters = 250 # How many iteration those model to evaluate
dropout = 0.2 # Create dropout metadata that drop neuron on deep learning pytorch so ther's no overfit
n_embd = 384 # Create length of generated section each values
n_layer = 4 # Create layer for gpt decoder because gpt-model need 4 decoder

CPU times: total: 0 ns
Wall time: 0 ns


### See Device

In [102]:
# See the device
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


# Data Loading

In [103]:
with open('wizard_of_ox.txt','r',encoding='utf-8') as f:
    text =  f.read()
# Get Unique Character from 2000 last index
chars = sorted(set(text))
print(chars)
# Get Vocab_Size
vocab_size = len(chars)
print(len(chars))

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


### Create Tokenizers by Words

In [104]:
strtoint = {ch:i for i,ch in enumerate(chars)} # Encoding Purpose
inttostr = {i:ch for i,ch in enumerate(chars)} # Decoding Purpose

print(f"strtoint:{strtoint}")
print(f"inttostr:{inttostr}")

strtoint:{'\n': 0, ' ': 1, '!': 2, '"': 3, '&': 4, "'": 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, ';': 23, '?': 24, 'A': 25, 'B': 26, 'C': 27, 'D': 28, 'E': 29, 'F': 30, 'G': 31, 'H': 32, 'I': 33, 'J': 34, 'K': 35, 'L': 36, 'M': 37, 'N': 38, 'O': 39, 'P': 40, 'Q': 41, 'R': 42, 'S': 43, 'T': 44, 'U': 45, 'V': 46, 'W': 47, 'X': 48, 'Y': 49, 'Z': 50, '[': 51, ']': 52, '_': 53, 'a': 54, 'b': 55, 'c': 56, 'd': 57, 'e': 58, 'f': 59, 'g': 60, 'h': 61, 'i': 62, 'j': 63, 'k': 64, 'l': 65, 'm': 66, 'n': 67, 'o': 68, 'p': 69, 'q': 70, 'r': 71, 's': 72, 't': 73, 'u': 74, 'v': 75, 'w': 76, 'x': 77, 'y': 78, 'z': 79, '\ufeff': 80}
inttostr:{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '&', 5: "'", 6: '(', 7: ')', 8: '*', 9: ',', 10: '-', 11: '.', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: ';', 24: '?', 25: 'A', 26: 'B', 27: 'C', 28: 'D', 29:

### Create Encoder and Decoder

In [105]:
encode = lambda x:[strtoint[c] for c in x]# Tranform data to string
decode = lambda y: ''.join([inttostr[c] for c in y])# Convert array to string to decode

In [106]:
test = text[:200]
len(encode(test))

200

In [107]:
print(torch.tensor(encode(text),dtype=torch.long)[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


### Train Validation Definition

In [108]:
# Convert encoded text into tensor and Get 50% from the evaluation
data = torch.tensor(encode(text),dtype=torch.long)[:int(0.5*len(text))]
# Get 80% of data for training and 20% for test
n = int(0.8*len(data))
# Split into train and test
train_data = data[:n]
validation_data = data[n:]

### Split to Feature and Target

Overview:

BigGram Model are predicting next words based on previous words, so the feature are previous words and target are next words after previous words.

In [109]:
# Function
def get_batch(split='validation'):
    # Get Data
    data = train_data if split.lower() == 'train' else validation_data
    # Generate Index
    rng_index = torch.randint(len(data) - block_size, (batch_size,))
    # print(rng_index)
    # Get Feature and Target
    x = torch.stack([data[i:i+block_size] for i in rng_index])# Get data from iteration to iteration + block size
    y = torch.stack([data[i+1:i+block_size+1] for i in rng_index])# Get data from iteration to iteration + block size
    # Use Device
    x = x.to(device)
    y = y.to(device)
    
    return x,y

In [110]:
# Get Train Validation feature and Target
X_train, y_train = get_batch('train')
X_val, y_val = get_batch('else')

In [111]:
for i in range(block_size):
    context = data[:i+1]
    target = data[i]
    print(f'when input is {context} target is {target}')

when input is tensor([80]) target is 80
when input is tensor([80,  1]) target is 1
when input is tensor([80,  1,  1]) target is 1
when input is tensor([80,  1,  1, 28]) target is 28
when input is tensor([80,  1,  1, 28, 39]) target is 39
when input is tensor([80,  1,  1, 28, 39, 42]) target is 42
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is 39
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is 44
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44, 32]) target is 32
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49]) target is 49


# Creating Big Gram Model

### Overview

BigGram Model relies on probability and occurence of character each next steps. in nutshell:
- Suppose 'hello world'
- the steps is h-e-l-l-o-' '-w-o-r-l-d
- the question is 'how much occurrence that h meet e, e meet l, l meet l etc.'
- the chance will be calculated and updated every train

### Loss Estimator Function

In [112]:
# @torch.no_grad()# Disable GradientDescent
# def estimate_loss(eval_iters):
#     out = {}
#     # Set Model to Evaluation Mode
#     model.eval()
#     for split in ['train','val']:
#         # Create Tensor with length of 'eval_iters'
#         losses = torch.zeros(eval_iters)
#         for k in range(eval_iters):
#             # Create dataset
#             X, y = get_batch(split)
#             # Extract logits and loss
#             logits, loss = model.foward(X, y)
#             # Stored loss on losses tensor
#             losses[k] = loss.item()
#         # Calculate the mean from the losses and stored to output
#         out[split] = losses.mean()
#     # Set Model to Train Mode
#     model.train()
#     return out

### Model Definition(Class)

In [113]:
# class BigGramLanguageModel(nn.Module):# Create Neural Network with Gradient Descent for loss optimization
#     def __init__(self, vocab_size):
#         #Get Parent properties
#         super().__init__()
#         # Create Embbeded Layer 
#         '''
#         Embedded Layer being used for translating the character, text, etc by using vector to see how near or far the feature to target. the vector in Embedding torch are already being created on torch, so this function will only referencing the vector from pre-built embedding table.
        
#         Embedding Table contain probability of character combination, like how 'bi' has 91039 occurence in million of data
#         '''
#         self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
    
#     # Foward Function
#     def foward(self,index,target=None):
#         logits = self.token_embedding_table(index)
        
#         if target is None:
#             loss = None
#         else:
#             batch_dim,time_dim,channel_dim = logits.shape
#             # Reshape for cross entropy evaluation
#             logits = logits.view(batch_dim*time_dim,channel_dim)
#             target = target.view(batch_dim*time_dim)
#             loss = F.cross_entropy(logits,target)
        
#         return logits, loss
    
#     # Generate Function
#     def generate(self,index,max_new_tokens):
#         # index is (batch_dim, time_dim) array of indices in current context
#         for _ in range(max_new_tokens):# Loop without retreive any value from range
#             # get prediction
#             logits, loss = self.foward(index)
#             # focus only on last time step
#             logits = logits[:, -1, :] # becomes (batch_dim, channel_dim)
#             # apply softmax to get probabilities
#             probs = F.softmax(logits, dim=-1) # (batch_dim, channel_dim)
#             #sample from distribution
#             index_next = torch.multinomial(probs, num_samples=1) # (batch_dim, 1)
#             # append sampled index to running sequence
#             index = torch.cat((index, index_next), dim=1) #(batch_dim, time_dim + 1)
#         return index

### Model Definition(Model)

In [114]:
model = BigGramLanguageModel(vocab_size=vocab_size)
m = model.to(device)

In [115]:
for i in model.parameters():
    print(i)

Parameter containing:
tensor([[-1.8122, -0.4277, -0.4958,  ...,  1.3422,  0.2597,  0.2347],
        [-0.0547, -0.6150, -0.0898,  ..., -0.3147,  1.2274, -0.1767],
        [ 0.9265,  1.3961,  1.0511,  ..., -1.1287, -0.2557, -0.6895],
        ...,
        [ 0.8764,  0.1055, -0.1903,  ...,  0.6797, -0.3065, -0.2258],
        [-0.2725, -0.3512, -0.1652,  ...,  1.1548,  0.1308,  0.5053],
        [ 0.0677,  1.5344,  1.5184,  ...,  1.9445,  1.0363, -0.2777]],
       requires_grad=True)


Notes: .parameters() are callback function that present if class inherit nn.Module. 

### Generating Chars

In [116]:
# context = torch.zeros((1,1), dtype=torch.long, device=device)
# generated_chars = decode(m.generate(context,max_new_tokens=500)[0].tolist())
# print(generated_chars)


ycSYq8I3*?SpL[DOqV5NfMCnlVzOet*3 cJRGbfej
wK:1:d﻿Q'
q10-W5R:fMMTb7OKv_EN"9Whp9Tnx1Uh"WN3﻿-*5IabX]b
GTnH?2 ch﻿-C5P8-Vjk&u)-B1vq9i8t5;0o?2f1 NRdL"YzEPX(vwJ)_VjKR51)P
Jt
JIl)H.T3:?-l:1ijzA,Nf!?2BX&-O:PuwKx,adY1M!P8EtPlvD8c[﻿HKN;VHJZXEbu2F,GO)0H,;
w3D-s[CLm;x9 cJ XwomXCRT:4
:dNJ8YRfMVgo)(d:wJRl(VpC,sYQ&EYT(&a9lcl7anStTYL"R9IqM*AHXlDfV- c;!'dQv!RJGd4W5fM9k9,m.(S85iHnBw" 9yjek49;lU'2 N5yiw)_&3pfs(*3'0QwtD-sS1mcV07jI32 i132fGQ)cJ,;_0Vj3'qNIll)C_N﻿n3p;Q5jq!ZBnSYTn
7d:4d﻿zn5A! chanBGb9*rvP5fQyj]e3'X&,jkD


### Optimizer Definition

In [119]:
# %%time
# optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate)

# for x in range(10):
#     print(f"batch:{x}")
#     for iter in range(max_iters):
#         if iter % eval_iters == 0:
#             losses = estimate_loss(eval_iters)
#             print(f"""step:{iter}, loss train:{losses["train"]:.4f}, loss val:{losses["val"]:.4f}""")
            
#         # sample a batch of data
#         xb, yb = get_batch('train')
        
#         # Evaluate the loss
#         logits, loss = model.foward(xb,yb)
#         optimizer.zero_grad(set_to_none=True)
#         loss.backward()
#         optimizer.step()
#     if loss.item()<1:
#         break
# print(loss.item())
    

batch:0
step:0, loss train:4.9846, loss val:4.9599
step:250, loss train:4.9073, loss val:4.9134
step:500, loss train:4.8408, loss val:4.8157
step:750, loss train:4.7691, loss val:4.7683
step:1000, loss train:4.7202, loss val:4.6988
step:1250, loss train:4.6365, loss val:4.6052
step:1500, loss train:4.5632, loss val:4.5729
step:1750, loss train:4.5033, loss val:4.4992
step:2000, loss train:4.4544, loss val:4.4202
step:2250, loss train:4.3790, loss val:4.3656
step:2500, loss train:4.3370, loss val:4.3203
step:2750, loss train:4.2794, loss val:4.2702
step:3000, loss train:4.2097, loss val:4.1976
step:3250, loss train:4.1528, loss val:4.1429
step:3500, loss train:4.0873, loss val:4.0762
step:3750, loss train:4.0419, loss val:4.0250
step:4000, loss train:4.0054, loss val:3.9731
step:4250, loss train:3.9444, loss val:3.9325
step:4500, loss train:3.8876, loss val:3.8774
step:4750, loss train:3.8458, loss val:3.8201
step:5000, loss train:3.8009, loss val:3.7790
step:5250, loss train:3.7563, lo

KeyboardInterrupt: 

In [120]:
# context = torch.zeros((1,1), dtype=torch.long, device=device)
# generated_chars = decode(m.generate(context,max_new_tokens=500)[0].tolist())
# print(generated_chars)


whep, m foon d plino



s  ssoteand thesthos. ughesealinolan PThestar an'vouienong.
pounto


th,
thed oned beng bed  p bolierotleake I there.]
ano we omo h bore s.]
Thindint sat tit m." ndo thokinovelof thierey  thy'tizand co  s
hasis.



medey f g Jiee ted s, fiofomuthou, stl wethanbet ord APIlighy tr tranb  omeer wasthig gasoted akim ccallyoop
" rowhe Pirentr the wizingalyored "Whain ndef-'r ase ZAProd wilusutourof tenso d armoond thr he t hry g. sereang s toy stooumony, fo e

"Bus oreve d BOh
