In [None]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import math

**DATA PREPROCESSING**

In [None]:
# Load the whole training corpus into memory as one string.
with open('dataset_2.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# Report the corpus size in characters (len of a str), not bytes.
print("length of dataset in characters: ", len(text))

In [None]:
# Preview the first 1000 characters of the corpus.
text[:1000]

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(list(set(text)))
vocab_size = len(chars)  # number of distinct characters
print(''.join(chars))
print(vocab_size)

In [None]:
# Lookup tables between characters and integer token ids (positions in `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert a string into a list of integer token ids.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Convert an iterable of integer token ids back into a string.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join([itos[i] for i in l])

In [None]:
# Encode the entire corpus once into a 1-D int64 tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

In [None]:
# Contiguous split: the first 90% of the token stream is used for training,
# the remaining tail for validation (no shuffling).
n = int(0.9*len(data)) 
train_data = data[:n]
val_data = data[n:]

In [None]:
batch_size = 8  # sequences sampled per batch
seq_len = 8     # tokens per sequence


def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Both returned tensors have shape (batch_size, seq_len), and
    the target window is the input window shifted one token to the right.
    """
    source = val_data if split != 'train' else train_data
    # randint's upper bound is exclusive, so the largest start leaves room
    # for the one-token look-ahead needed by the target window.
    starts = torch.randint(len(source) - seq_len, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + seq_len])
        targets.append(source[start + 1:start + seq_len + 1])
    return torch.stack(inputs), torch.stack(targets)

**MODEL CREATION**

In [None]:
d_model = 512 # Embedding width of each token
d_ff = 4*d_model # Hidden width of the feed-forward sub-layer (passed to FeedForward)
heads = 32 # NOTE(review): the Encoder/Decoder in this notebook build their attention with 8 heads, not this value
dropout = 0.1 # Dropout probability
n_enc = 10 # Number of Encoder modules constructed inside an EncoderBlock

In [None]:
class InputEmbedding(nn.Module):
  """Token-id embedding whose output is scaled by sqrt(d_model).

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: width of each embedding vector.
  """

  def __init__(self , vocab_size: int , d_model: int):
    super().__init__()
    self.vocab_size = vocab_size
    self.d_model = d_model
    self.embedding = nn.Embedding(vocab_size , d_model)

  def forward(self , x):
    """Embed integer ids `x`; the result gains a trailing dimension of size d_model."""
    # Scale by this layer's own width. Reading the notebook-level `d_model`
    # here gave the wrong factor for any layer built with a different size.
    return self.embedding(x) * math.sqrt(self.d_model)

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to the input, then applies dropout.

    Args:
        d_model: size of the last dimension of the input (odd values are supported).
        seq_len: number of positions in the precomputed table.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # 1 / 10000^(2i / d_model), computed in log space; one entry per even column.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term # (seq_len, number of even columns)
        pe[:, 0::2] = torch.sin(angles) # sin(position / 10000^(2i / d_model))
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine half is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2]) # cos(position / 10000^(2i / d_model))
        pe = pe.unsqueeze(0) # (1, seq_len, d_model)
        # Registered as a buffer: stored on the module and in its state_dict, not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return dropout(x + pe[:, :x.shape[1]]).

        `x` is indexed as (batch, length, d_model); length is expected to be at
        most the `seq_len` the table was built with.
        """
        # The buffer does not require grad, so no requires_grad_(False) call is needed.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

In [None]:
class NoMaskAttention(nn.Module):
  """Multi-head scaled dot-product attention without any mask.

  `d_model` is divided evenly between `heads`; construction asserts that the
  division is exact. All four projections are bias-free d_model -> d_model.
  """

  def __init__(self , heads: int , d_model: int , dropout: float):
    super().__init__()
    self.heads = heads
    self.d_model = d_model
    assert d_model % heads == 0 , "d_model is not divisible by the number of heads"
    self.d_k = d_model // heads  # width of a single head
    # Inputs to forward() are indexed as (batch_size , seq_len , d_model).
    self.w_q = nn.Linear(d_model , d_model , bias=False)
    self.w_k = nn.Linear(d_model , d_model , bias=False)
    self.w_v = nn.Linear(d_model , d_model , bias=False)
    self.w_o = nn.Linear(d_model , d_model , bias=False)
    self.dropout = nn.Dropout(dropout)

  @staticmethod
  def attention(self , q , k , v):
    """Return (weighted values , attention weights) for per-head q, k, v.

    Declared static but receives the module explicitly as `self`, because it
    reads `self.d_k` and `self.dropout`.
    """
    scale = math.sqrt(self.d_k)
    weights = torch.matmul(q , k.transpose(-2 , -1)) / scale
    weights = F.softmax(weights , dim=-1)
    weights = self.dropout(weights)
    return torch.matmul(weights , v) , weights

  def forward(self , q , k , v):
    """Project q, k, v, attend per head, merge the heads and apply w_o.

    The attention weights of the most recent call are kept on
    `self.attention_scores`.
    """
    projected = (self.w_q(q) , self.w_k(k) , self.w_v(v))
    # (batch_size , seq_len , d_model) -> (batch_size , heads , seq_len , d_k)
    q , k , v = (
      t.view(t.shape[0] , t.shape[1] , self.heads , self.d_k).transpose(1 , 2)
      for t in projected
    )

    context , self.attention_scores = NoMaskAttention.attention(self , q , k , v)

    # (batch_size , heads , seq_len , d_k) -> (batch_size , seq_len , d_model)
    batch = context.shape[0]
    merged = context.transpose(1 , 2).contiguous().view(batch , -1 , self.heads * self.d_k)
    return self.w_o(merged)

In [None]:
class Masked_Attention(nn.Module):
  """Multi-head scaled dot-product attention with a causal (look-ahead) mask.

  Query position i may only attend to key positions j <= i. `d_model` is
  divided evenly between `heads`; construction asserts exact divisibility.
  All four projections are bias-free d_model -> d_model.
  """

  def __init__(self , heads: int , d_model: int , dropout: float):
    super().__init__()
    self.heads = heads
    self.d_model = d_model
    assert d_model % heads == 0 , "d_model is not divisible by the number of heads"
    #Note that the input being passed now has the shape (batch_size , seq_len , d_model)
    self.d_k = d_model//heads
    self.w_q = nn.Linear(d_model , d_model , bias=False)
    self.w_k = nn.Linear(d_model , d_model , bias=False)
    self.w_v = nn.Linear(d_model , d_model , bias=False)

    self.w_o = nn.Linear(d_model , d_model , bias=False)
    self.dropout = nn.Dropout(dropout)

  @staticmethod
  def attention(self , q , k , v):
    """Return (weighted values , attention weights) with future keys masked out.

    Declared static but receives the module explicitly as `self`, because it
    reads `self.d_k` and `self.dropout`.
    """
    attention_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k)
    # Size the mask from the scores themselves and build it on their device.
    # The notebook-level `seq_len` only matched inputs of exactly that length
    # and always produced a CPU mask.
    q_len , k_len = attention_scores.shape[-2] , attention_scores.shape[-1]
    decoder_mask = torch.triu(torch.ones(q_len , k_len , device=attention_scores.device) , diagonal=1).bool()
    # A large negative score becomes a zero weight after the softmax.
    attention_scores = attention_scores.masked_fill(decoder_mask , -1e9)
    attention_scores = attention_scores.softmax(dim=-1)
    attention_scores = self.dropout(attention_scores)

    return (attention_scores @ v) , attention_scores

  def forward(self , q , k , v):
    """Project q, k, v, attend per head under the causal mask, merge heads, apply w_o.

    The attention weights of the most recent call are kept on
    `self.attention_scores`.
    """
    q = self.w_q(q)
    k = self.w_k(k)
    v = self.w_v(v)

    q = q.view(q.shape[0] , q.shape[1] , self.heads , self.d_k).transpose(1,2) #The shape becomes (batch_size , heads , seq_len , d_k)
    k = k.view(k.shape[0] , k.shape[1] , self.heads , self.d_k).transpose(1,2)
    v = v.view(v.shape[0] , v.shape[1] , self.heads , self.d_k).transpose(1,2)

    x , self.attention_scores = Masked_Attention.attention(self , q , k , v)

    # (batch_size , heads , seq_len , d_k) -> (batch_size , seq_len , d_model)
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.heads * self.d_k)

    return self.w_o(x)

In [None]:
class FeedForward(nn.Module):
  """Feed-forward sub-layer: Linear -> ReLU -> Dropout -> Linear.

  The last dimension goes d_model -> d_ff -> d_model.
  """

  def __init__(self , d_model: int , d_ff: int , dropout: float):
    super().__init__()
    self.d_model , self.d_ff = d_model , d_ff
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)
    self.linear1 = nn.Linear(in_features=d_model , out_features=d_ff)   # expand
    self.linear2 = nn.Linear(in_features=d_ff , out_features=d_model)   # contract

  def forward(self ,x ):
    """Apply both linear layers with ReLU and dropout in between."""
    hidden = self.dropout(self.relu(self.linear1(x)))
    return self.linear2(hidden)

In [None]:
class ProjectionLayer(nn.Module):
  """Single linear layer mapping the last dimension from d_model to vocab_size.

  No softmax is applied; the output is raw scores.
  """

  def __init__(self , d_model: int , vocab_size: int):
    super().__init__()
    self.d_model , self.vocab_size = d_model , vocab_size
    self.linear = nn.Linear(in_features=d_model , out_features=vocab_size)

  def forward(self , x):
    """Return the linear projection of `x`."""
    logits = self.linear(x)
    return logits

In [None]:
class Encoder(nn.Module):
  """Token embedding + positional encoding, then pre-norm attention and feed-forward sub-layers.

  The first four arguments are the layer *classes* to instantiate.
  `vocab_size`, `seq_len` and `d_ff` are read from notebook-level variables.
  One LayerNorm instance is shared by every normalisation step in forward().

  Args:
    InputEmbedding, PositionalEncoding, NoMaskAttention, FeedForward: layer classes.
    d_model: model width used by every sub-layer.
    dropout: dropout probability used inside the sub-layers and on the residual branches.
    heads: number of attention heads; defaults to 8.
  """
  def __init__(self , InputEmbedding , PositionalEncoding , NoMaskAttention , FeedForward , d_model: int , dropout: float , heads: int = 8):
    super().__init__()
    # Every sub-layer follows the d_model / dropout given to this constructor.
    # Hard-coding 512 and 0.1 here made any other d_model fail inside attention
    # and silently ignored the dropout argument.
    self.input_embedding_layer = InputEmbedding(vocab_size , d_model)
    self.positional_encoding_layer = PositionalEncoding(d_model , seq_len , dropout=dropout)
    self.attention = NoMaskAttention(heads=heads , d_model=d_model , dropout=dropout)
    self.ffwd = FeedForward(d_model , d_ff , dropout=dropout)
    self.layernorm = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self , x):
    """Encode integer token ids `x`; the output has a trailing dimension of size d_model."""
    x = self.input_embedding_layer(x)
    x = self.positional_encoding_layer(x)
    # Normalise once per sub-layer; q, k and v all receive the same tensor.
    normed = self.layernorm(x)
    x = x + self.dropout(self.attention(normed , normed , normed))
    normed = self.layernorm(x)
    x = x + self.dropout(self.attention(normed , normed , normed)) #AN IDEA TO TRY OUT WITH 2 ATTENTIONS IN ENCODER (same attention module, second pass)
    x = x + self.dropout(self.ffwd(self.layernorm(x)))
    x = self.layernorm(x)
    return x

In [None]:
class EncoderBlock(nn.Module):
  """Container that builds `n_enc` Encoder modules from the given classes.

  NOTE(review): forward() only ever runs the first encoder in the list; the
  others are constructed (and contribute parameters) but are never called.
  With the Encoder defined in this notebook, each element embeds token ids
  itself, so the list cannot simply be chained without changing Encoder too.
  Confirm whether a real stack was intended.
  """
  def __init__(self , n_enc: int , Encoder , InputEmbedding , PositionalEncoding , NoMaskAttention , FeedForward , d_model: int , dropout: float):
    super().__init__()
    self.n_enc = n_enc
    layers = []
    for _ in range(n_enc):
      layers.append(Encoder(InputEmbedding , PositionalEncoding , NoMaskAttention , FeedForward , d_model , dropout))
    self.encoder = nn.ModuleList(layers)

  def forward(self , x):
    """Return the first encoder's output for `x`, or None when the list is empty."""
    if len(self.encoder) == 0:
      return None
    return self.encoder[0](x)

In [None]:
# Smoke-test inputs: random ids in [1, 25) shaped (batch_size, seq_len), plus an
# encoder stack to run them through. Presumably vocab_size >= 25 - confirm for the dataset in use.
x = torch.randint(1 , 25 , (batch_size , seq_len) , dtype=torch.int32)
transformer_encoder_block = EncoderBlock(n_enc , Encoder , InputEmbedding , PositionalEncoding , NoMaskAttention , FeedForward , d_model=512 , dropout=0.1)

In [None]:
#Decoder portion
class Decoder(nn.Module):
  """Token embedding + positional encoding, self-attention, attention over the encoder output, feed-forward, projection.

  The first five arguments are the layer *classes* to instantiate.
  `vocab_size`, `seq_len` and `d_ff` are read from notebook-level variables.
  One LayerNorm instance is shared by every normalisation step in forward().

  NOTE(review): a single `Masked_Attention` instance serves both the
  self-attention and the encoder-decoder attention step, so the two steps
  share weights and the mask is applied to both - confirm this is intended.

  Args:
    InputEmbedding, PositionalEncoding, Masked_Attention, FeedForward, ProjectionLayer: layer classes.
    d_model: model width used by every sub-layer.
    dropout: dropout probability used inside the sub-layers and on the residual branches.
    heads: number of attention heads; defaults to 8.
  """
  def __init__(self , InputEmbedding , PositionalEncoding , Masked_Attention , FeedForward , ProjectionLayer , d_model: int , dropout: float , heads: int = 8):
    super().__init__()
    # Every sub-layer follows the d_model / dropout given to this constructor.
    # Hard-coding 512 and 0.1 here made any other d_model fail inside attention
    # and silently ignored the dropout argument.
    self.input_embedding_layer = InputEmbedding(vocab_size , d_model)
    self.positional_encoding_layer = PositionalEncoding(d_model , seq_len , dropout=dropout)
    self.attention = Masked_Attention(heads=heads , d_model=d_model , dropout=dropout)
    self.ffwd = FeedForward(d_model , d_ff , dropout=dropout)
    self.dropout = nn.Dropout(dropout)
    self.layernorm = nn.LayerNorm(d_model)
    self.projection_layer = ProjectionLayer(d_model , vocab_size)

  def forward(self , x , encoder_output):
    """Decode token ids `x` given `encoder_output`; returns the projection layer's output."""
    x = self.input_embedding_layer(x)
    x = self.positional_encoding_layer(x)
    # Self-attention: q, k and v are the same normalised tensor.
    normed = self.layernorm(x)
    x = x + self.dropout(self.attention(normed , normed , normed))
    # Attention over the encoder output: queries from the decoder, keys/values from the encoder.
    enc_normed = self.layernorm(encoder_output)
    x = x + self.dropout(self.attention(self.layernorm(x) , enc_normed , enc_normed))
    x = x + self.dropout(self.ffwd(self.layernorm(x)))
    x = self.projection_layer(x)
    return x

In [None]:
# Stand-alone decoder instance used for the smoke test in the next cell.
transformer_decoder = Decoder(InputEmbedding , PositionalEncoding , Masked_Attention , FeedForward , ProjectionLayer , d_model=512 , dropout=0.1)

In [None]:
# Smoke test: run the sample ids through the encoder stack, then the decoder.
sample_logits = transformer_decoder(x , transformer_encoder_block(x))


In [None]:


n_enc_2 = 50  # number of Encoder modules built inside the model's EncoderBlock
class Transformer(nn.Module):
  """Encoder-decoder model returning logits and, when targets are given, the cross-entropy loss.

  `EncoderBlock` and `Decoder` are the classes to instantiate; the remaining
  layer classes and `n_enc_2` are read from notebook-level names.

  NOTE(review): the encoder and the decoder both receive `input_sen`, and the
  encoder is built with NoMaskAttention. When the targets are the inputs
  shifted by one token, the encoder output may already contain the token
  being predicted - confirm this is intended.
  """
  def __init__(self , EncoderBlock , Decoder):
    super().__init__()
    self.transformer_encoder = EncoderBlock(n_enc_2 , Encoder , InputEmbedding , PositionalEncoding , NoMaskAttention , FeedForward , d_model=512 , dropout=0.1)
    self.transformer_decoder = Decoder(InputEmbedding , PositionalEncoding , Masked_Attention , FeedForward , ProjectionLayer , d_model=512 , dropout=0.1)

  def forward(self , input_sen , target_sen=None):
    """Run the model on token ids `input_sen`.

    Args:
      input_sen: integer token ids, fed to both the encoder and the decoder.
      target_sen: optional integer class ids, one per logit row. When omitted
        (e.g. during generation) no loss is computed.

    Returns:
      (logits , loss), where loss is None if `target_sen` is None.
    """
    encoder_output = self.transformer_encoder(input_sen)
    logits =  self.transformer_decoder(input_sen , encoder_output)
    if target_sen is None:
      return logits , None
    b , t , c = logits.shape
    # reshape (rather than view) also accepts non-contiguous tensors.
    loss = F.cross_entropy(logits.reshape(b*t , c) , target_sen.reshape(b*t))
    return logits , loss
  
# The model instance that the evaluation, training and generation cells below operate on.
transformer = Transformer(EncoderBlock , Decoder)

In [None]:
# Smoke test of the full model on random ids in [1, 25); displays (logits, loss).
sample_sentence = torch.randint(1 , 25 , (batch_size , seq_len) , dtype=torch.long)
target = torch.randint(1,25 , (batch_size , seq_len) , dtype=torch.long)
transformer(sample_sentence , target)

**LET'S TRAIN THE MODEL NOW**

In [None]:
eval_iters = 100     # batches averaged per split in each loss estimate
eval_interval = 100  # training steps between loss estimates

In [None]:
@torch.no_grad()
def estimate_loss():
  """Average the model's loss over `eval_iters` random batches for each split.

  Uses the notebook-level `transformer`, `get_batch` and `eval_iters`.
  The model is switched to eval mode while measuring, so dropout does not
  add noise to the estimate, and its previous mode is restored afterwards.

  Returns:
    dict with keys 'train' and 'val', each holding the mean loss as a 0-dim tensor.
  """
  out = {}
  was_training = transformer.training
  transformer.eval()
  try:
    for split in ['train' , 'val']:
      losses = torch.zeros(eval_iters)
      for k in range(eval_iters):
        x , y = get_batch(split)
        logits , loss = transformer(x , y)
        losses[k] = loss.item()
      out[split] = losses.mean()
  finally:
    # Put the model back the way the caller left it, even if a batch failed.
    transformer.train(was_training)
  return out

In [None]:
# Loss on both splits at this point in the notebook.
estimate_loss()

**LET'S GENERATE FROM THE MODEL**

In [None]:
# Generation prompt made of space characters, encoded and shaped (1, seq_len).
sample_test = "        "
encoded = encode(sample_test)
encoded = torch.tensor(encoded , dtype=torch.long)
encoded = encoded.view(1 , seq_len)  # fails unless the prompt encodes to exactly seq_len tokens
encoded.shape

In [None]:
input_ids = encoded  # context window fed to the model during generation
memory = encoded     # running record: the prompt plus every generated token

In [None]:
# Total number of parameters registered on the model, in millions.
print(sum(p.numel() for p in transformer.parameters())/1e6, 'M parameters')

In [None]:
max_iters = 50000  # number of optimizer steps in the first training stage

In [None]:
# Stage 1 optimizer: Adam over all model parameters, learning rate 1e-3.
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-3)

In [None]:
# Training stage: `max_iters` optimizer steps with periodic loss reports.
# Make sure dropout is active even if an earlier cell left the model in eval mode.
transformer.train()
# The counter is named `step` so the builtin `iter` is not shadowed for the
# rest of the kernel session.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits ,loss = transformer(xb , yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
max_iters = 150000  # step budget used by the training loops that follow

In [None]:
# Stage 2 optimizer: a new Adam instance with learning rate 1e-4.
# NOTE(review): a new optimizer starts without the previous one's accumulated state.
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-4)

In [None]:
# Training stage: `max_iters` optimizer steps with periodic loss reports.
# Make sure dropout is active even if an earlier cell left the model in eval mode.
transformer.train()
# The counter is named `step` so the builtin `iter` is not shadowed for the
# rest of the kernel session.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits ,loss = transformer(xb , yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Stage 3 optimizer: a new Adam instance with learning rate 1e-5.
# NOTE(review): a new optimizer starts without the previous one's accumulated state.
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-5)

In [None]:
# Training stage: `max_iters` optimizer steps with periodic loss reports.
# Make sure dropout is active even if an earlier cell left the model in eval mode.
transformer.train()
# The counter is named `step` so the builtin `iter` is not shadowed for the
# rest of the kernel session.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits ,loss = transformer(xb , yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Autoregressive sampling: 1000 new tokens, one per iteration.
# `input_ids` is the fixed-length context window; `memory` accumulates the full output.
was_training = transformer.training
transformer.eval()  # dropout must be off while sampling
try:
    with torch.no_grad():  # inference only: no autograd graph needed
        for _ in range(1000):
            # input_ids is also passed as a placeholder target; the returned loss is ignored.
            logits , loss = transformer(input_ids , input_ids)

            # Sample from the distribution at the last position
            # (torch.argmax over the same logits would give greedy decoding).
            logits = logits[: , -1 , :]
            probs = F.softmax(logits , dim=-1)
            next_token = torch.multinomial(probs , num_samples=1)

            memory = torch.cat([memory, next_token], dim=-1)
            # Slide the window: append the new token and drop the oldest one,
            # so the context length stays constant.
            input_ids = torch.cat([input_ids, next_token], dim=-1)[: , 1:]
finally:
    transformer.train(was_training)

In [None]:
# Turn the recorded token ids (prompt followed by generated tokens) back into text.
generated_text = memory[0].tolist()
decode(generated_text)

In [None]:
print("Model's state_dict:")
# Build the state dict once; calling state_dict() inside the loop rebuilt it for every entry.
for param_tensor, tensor_value in transformer.state_dict().items():
    print(param_tensor, "\t", tensor_value.size())

In [None]:
print("Optimizer's state_dict:")
# Build the state dict once; calling state_dict() inside the loop rebuilt it for every entry.
for var_name, var_value in optimizer.state_dict().items():
    print(var_name, "\t", var_value)

In [None]:
# Print every parameter tensor of the model.
# NOTE(review): one tensor repr per parameter - this can produce a lot of output.
for param in transformer.parameters():
    print(param)