### A bigram language model predicts the next token (a word or, as in this notebook, a character) from the previous token alone. It learns these probabilities from adjacent pairs (bigrams) in the training text. This simple approach captures only short-range dependencies, which makes it useful for basic text generation and analysis, but it cannot model longer context or more complex language patterns.

### Imports and hyperparameters

In [2]:
import torch
from torch import nn as nn
from torch.nn import functional as F

# Prefer Apple's Metal (MPS) backend when it is usable, otherwise fall back to the CPU.
# torch.backends.mps.is_available() is the long-standing availability check;
# torch.mps.is_available() is not present in every PyTorch release.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)
block_size = 8  # context length: number of consecutive tokens in one training example
batch_size = 4  # number of examples sampled per batch
maxiter = 10000  # number of iterations of the training loop
learning_rate = 3e-4  # learning rate handed to the optimiser


mps


## Load the training text

In [3]:
# Read the whole book into memory as one string, then preview its first 500 characters.
with open("/Users/akashbarpanda/Documents/model/PYLLM/data/The invisible man.txt","r",encoding="utf-8") as book:
    text = book.read()
print(text[:500])


The Invisible Man

A Grotesque Romance

by H. G. Wells

CHAPTER I.
THE STRANGE MAN’S ARRIVAL


The stranger came early in February, one wintry day, through a biting
wind and a driving snow, the last snowfall of the year, over the down,
walking from Bramblehurst railway station, and carrying a little black
portmanteau in his thickly gloved hand. He was wrapped up from head to
foot, and the brim of his soft felt hat hid every inch of his face but
the shiny tip of his nose; the snow had piled itsel


In [4]:
# Vocabulary: every distinct character that occurs in the text, in sorted order.
char = sorted(set(text))
vocab_size = len(char)
print(char)
print(vocab_size)

['\n', ' ', '!', '(', ')', ',', '-', '.', '2', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'é', 'ê', 'ö', '—', '‘', '’', '“', '”']
74


### Encoder / decoder (conversion between characters and their integer indices in the vocabulary)

In [5]:
# string_to_integer = { ch:i for i,ch in enumerate(char)}
# integer_to_string = {i:ch for i,ch in enumerate(char)}

# encode = lambda s: [string_to_integer[c] for c in s]
# decode = lambda l: ''.join([integer_to_string[i] for i in l])

# Character -> index lookup: each character maps to its position in `char`.
string_to_integer = dict(zip(char, range(len(char))))

# Index -> character lookup: the inverse mapping, used when decoding.
integer_to_string = dict(enumerate(char))


# Encode function (string → list of integers)
def encode(s):
    """Map each character of ``s`` to its integer index.

    Every character is looked up in ``string_to_integer``; a character that
    is not a key of that table raises ``KeyError``.
    """
    return [string_to_integer[c] for c in s]


# Decode function (list of integers → string)
def decode(l):
    """Turn a sequence of integer indices back into a string.

    Each index is looked up in ``integer_to_string`` and the resulting
    characters are concatenated; an unknown index raises ``KeyError``.
    """
    return ''.join(integer_to_string[i] for i in l)

# Round-trip sanity check on a short sample.
print(encode("hello"))
print(decode([47,44,51,51,54]))

# Encode the entire text once into a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])
print(data.shape[0])

[47, 44, 51, 51, 54]
hello
tensor([31, 47, 44,  1, 20, 53, 61, 48, 58, 48, 41, 51, 44,  1, 24, 40, 53,  0,
         0, 12,  1, 18, 57, 54, 59, 44, 58, 56, 60, 44,  1, 29, 54, 52, 40, 53,
        42, 44,  0,  0, 41, 64,  1, 19,  7,  1, 18,  7,  1, 34, 44, 51, 51, 58,
         0,  0, 14, 19, 12, 27, 31, 16, 29,  1, 20,  7,  0, 31, 19, 16,  1, 30,
        31, 29, 12, 25, 18, 16,  1, 24, 12, 25, 71, 30,  1, 12, 29, 29, 20, 33,
        12, 23,  0,  0,  0, 31, 47, 44,  1, 58])
271892


### train test splitting

In [12]:
# The first 80% of the token stream is used for training; the rest is held out for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    ``split == "train"`` draws from ``train_data``; any other value draws
    from ``val_data``. Returns ``(x, y)``, each holding ``batch_size`` rows
    of ``block_size`` token ids moved to ``device``. Row ``k`` of ``y`` is
    the same window as row ``k`` of ``x`` shifted one position forward, so
    ``y[k][j]`` is the token that follows ``x[k][j]`` in the source.
    """
    source = val_data if split != "train" else train_data
    # Random start offsets; the upper bound leaves room for the shifted target window.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    starts = ix.tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs.to(device), targets.to(device)

# Draw one training batch and show the inputs followed by their targets.
x, y = get_batch('train')
print("inputs:-> ", x, sep="\n")
print("output:-> ", y, sep="\n")

inputs:-> 
tensor([[43,  1, 47, 44, 57,  1, 47, 48],
        [44, 63, 42, 60, 58, 44,  1, 45],
        [ 0, 30, 54, 52, 44, 41, 54, 43],
        [54, 57, 40, 53, 43, 40,  5, 73]], device='mps:0')
output:-> 
tensor([[ 1, 47, 44, 57,  1, 47, 48, 58],
        [63, 42, 60, 58, 44,  1, 45, 54],
        [30, 54, 52, 44, 41, 54, 43, 64],
        [57, 40, 53, 43, 40,  5, 73,  1]], device='mps:0')


### Context/target pairs contained in a single block

In [7]:


# Walk through one block to show the training examples it contains:
# every prefix of x is a context, and its target is the token right after that prefix.
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context, target = x[:t+1], y[t]
    print("when input is ", context,"target is ", target)

when input is  tensor([31]) target is  tensor(47)
when input is  tensor([31, 47]) target is  tensor(44)
when input is  tensor([31, 47, 44]) target is  tensor(1)
when input is  tensor([31, 47, 44,  1]) target is  tensor(20)
when input is  tensor([31, 47, 44,  1, 20]) target is  tensor(53)
when input is  tensor([31, 47, 44,  1, 20, 53]) target is  tensor(61)
when input is  tensor([31, 47, 44,  1, 20, 53, 61]) target is  tensor(48)
when input is  tensor([31, 47, 44,  1, 20, 53, 61, 48]) target is  tensor(58)


## BIGRAM Language Model 

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram model: the logits for the next token depend only on the current token.

    The only parameter is a ``vocab_size x vocab_size`` embedding table whose
    row ``i`` holds the unnormalised scores (logits) for the token following
    token ``i``. Logits are not probabilities; ``softmax`` turns them into a
    distribution.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Return ``(logits, loss)`` for a batch of token ids.

        Args:
            index: integer tensor of token ids, shape ``(B, T)``.
            targets: optional integer tensor with ``B*T`` elements holding
                the expected next token for each position of ``index``.

        Returns:
            Without ``targets``: logits of shape ``(B, T, C)`` and ``None``.
            With ``targets``: logits flattened to ``(B*T, C)`` (the layout
            ``F.cross_entropy`` expects) and the cross-entropy loss.
        """
        logits = self.token_embedding_table(index)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)  # one row of class scores per position
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients, so skip building autograd graphs
    def generate(self, index, max_new_tokens):
        """Extend ``index`` by sampling ``max_new_tokens`` tokens per row.

        Args:
            index: integer tensor of shape ``(B, T)`` with the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape ``(B, T + max_new_tokens)``.
        """
        for _ in range(max_new_tokens):
            # Only the most recent token influences a bigram prediction, so feed
            # just the last column instead of re-embedding the whole growing
            # context on every step. Going through self(...) rather than
            # self.forward(...) keeps any registered module hooks active.
            logits, _loss = self(index[:, -1:])
            probs = F.softmax(logits[:, -1, :], dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index


# Instantiate the model and move its parameters to the chosen device.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Start from a single token with id 0 (batch of 1, length 1) and sample 500
# characters; no training has happened in this cell.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled = m.generate(context, max_new_tokens=500)
generated_char = decode(sampled[0].tolist())
print(generated_char)
       


zcXk(kCLarCB)xWaQu:VaFfJm’ vpsdnnyiXFQcgnwhKG—WalHfqb,XUabpjlclbRêdjTm‘:g,Re]DV]xM;:fvoaEWpJ-YVlP),RJSuCp:e?aA.iF
:JK)X;lc—f,,B[ lHxVd’(]We.‘Ar’2NleöaD2‘kj]jq”pQCUT: rgSLY2,m;]!söt
t:gqzFrdmiFIFQM“”U
ug,kCb2eI’AjnjvqB(G?!p?GS!JDBTspT!eFxoTxME2X;rE2tjr2HvMLxD)’Rwd’fPA)Xk;RAj’D]IV[vwJS;REWdé DLDLêrxxjxyê“g?xCllytRgDêFIvR)t]BUê bbE—.?:?‘d L]x‘zb”KgyqB[u_[eEF.wyméSxcLOFI]’D’CRXhLo‘u
‘l pKJc[wêMh!)ê-—:-VQIOCpFF“bsKn2e2,dr.Yx]2CYTcJ-Y—aio?cHewöêJi2bvb?)AI2)’ osT2q2hé
pfnyXy:KeV-,?aXWaL]!êSubJUqFuz?oRw


### optimiser

In [13]:
# AdamW over every parameter of the model.
optimiser = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop index is unused; `_` avoids shadowing the builtin iter() for the rest of the session.
for _ in range(maxiter):
    # Sample a fresh random batch of training data.
    xb, yb = get_batch('train')

    # Call the module itself (not .forward) so nn.Module.__call__ runs any hooks.
    logits, loss = model(xb, yb)
    optimiser.zero_grad(set_to_none=True)  # clear gradients left from the previous step
    loss.backward()
    optimiser.step()

# Loss of the final batch only, so this is a noisy single-batch figure.
print(loss.item())
    


2.494572162628174


In [None]:
# Start from a single token with id 0 and sample 5000 characters from the model.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
##prompt = torch.tensor([encode(input("enter your prompt"))],dtype=torch.long,device=device)
sampled = m.generate(context, max_new_tokens=5000)
generated_char = decode(sampled[0].tolist())
print(generated_char)

hicaEPz
