In [22]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader
import math

In [36]:
proccessedData = torch.load("../dataset/Data/preproccessedData.pt")
proccessedDataMASK = torch.load("../dataset/Data/preproccessedDataMASKS.pt")

In [40]:
proccessedData.shape

torch.Size([146521, 512])

In [38]:
proccessedData[19]

tensor([2819, 2046, 7071, 1856, 6311,  362, 2046, 7071, 1856, 6311, 6726, 1346,
        2710, 1906, 4736,  543,  803, 7812, 5361,  897, 2651,  264,   21, 4978,
        7031, 7957,  865, 5286, 7278,  467, 4263, 2001, 1978,  500, 1357, 1978,
        1079,  273,  891,  371, 1111, 6972, 2993, 2403, 9037, 2698, 1338,  741,
        1065,  596,  311,  506, 2361, 1944,  356,  475,  539, 2710,  909, 9690,
        1734,  974, 7699, 7468, 1561, 2094,  528, 6888, 1750, 3458,  304,  715,
        2447, 6957,  344, 6927, 1604,  879,  896,  521,  281,  344,  302,  295,
        5598,  466,  544, 3107,  714, 1945, 3466, 1585, 5654,  344, 1518,  413,
         489,  821,  287,  490, 7691,  401,  311, 2260, 1937, 1011, 4862,  418,
         347,  645,  267,  734, 2260,  480,  514,  296, 2885,  589, 1054,  463,
         401, 1791,  953, 1563, 1691,  309, 2476,  401, 5449,  281,  304, 2321,
        9823, 1128,  522, 2378,  290,  605,  349,  694,  758, 1021,  281,  358,
         549,  290,  928,  285,  358, 48

In [39]:
pe = torch.zeros(4,10)
print(pe)
position = torch.arange(0,4,dtype=torch.float).unsqueeze(1)
print(position)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[0.],
        [1.],
        [2.],
        [3.]])


In [41]:
class Blocks(nn.Module):
    def __init__(self,d_model,n_head,dropoutRate):
        super().__init__()
        headsize = d_model // n_head
        self.sa_head = nn.MultiheadAttention(embed_dim=d_model,num_heads=n_head,batch_first=True)

        self.ffn = nn.Sequential(
            nn.Linear(d_model,4 * d_model),
            nn.ReLU(),
            nn.Linear(d_model * 4,d_model)
        )

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropoutRate)

    def forward(self,x,mask=None,padding=None):
        #output,weight (i ignored weights)
        att_out, _ = self.sa_head(self.ln1(x),self.ln1(x),self.ln1(x),
                             attn_mask=mask,
                             key_padding_mask=padding)
        x = x + self.dropout(att_out)

        x = x + self.dropout(self.ffn(self.ln2(x)))

        return x


In [42]:
class decoderNextWordPrediction(nn.Module):
    def __init__(self,vocab_size,d_model,block_size,n_heads,n_layers,dropRate):
        super().__init__()
        self.d_model = d_model
        self.toke_embedding_table = nn.Embedding(vocab_size,d_model)
        pe = torch.zeros(block_size,d_model)
        position = torch.arange(0,block_size,dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer("pe",pe)

        self.dropout = nn.Dropout(dropRate)

        self.blocks = nn.ModuleList([Blocks(d_model,n_heads,dropoutRate=dropRate) for _ in range(n_layers)])

        self.ln_final = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model,vocab_size)

        self.register_buffer("mask",torch.triu(torch.ones(block_size,block_size),diagonal=1).bool())

    def forward(self,idx,padding=None):
        B,T = idx.shape

        x = self.toke_embedding_table(idx) * math.sqrt(self.d_model)

        x = x + self.pe[:T,:]

        x = self.dropout(x)

        current_mask = self.mask[:T,:T]

        for block in self.blocks:
            x = block(x,mask=current_mask,padding=padding)

        x = self.ln_final(x)
        logits = self.lm_head(x)


        return logits

In [43]:
configuration_hyperParameters = {
    "batch_size": 64,
    "learning_rate":3e-4,
    "epochs":1,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

configuration_Model = {
    "block_size":proccessedData.shape[1],
    "layers":5,
    "heads":4,
    "d_model":128,
    "vocab_size":10000,
    "dropRate":0.1
}



In [44]:
split_idx = int(0.95 * len(proccessedData))
train_ds = TensorDataset(proccessedData[:split_idx], proccessedDataMASK[:split_idx])
val_ds = TensorDataset(proccessedData[split_idx:], proccessedDataMASK[split_idx:])

train_loader = DataLoader(train_ds, batch_size=configuration_hyperParameters["batch_size"], shuffle=True)
val_loader = DataLoader(val_ds, batch_size=configuration_hyperParameters["batch_size"])

In [45]:
model = decoderNextWordPrediction(configuration_Model["vocab_size"],configuration_Model["d_model"],
                                  configuration_Model["block_size"],
                                  configuration_Model["heads"],
                                  configuration_Model["layers"],
                                  configuration_Model["dropRate"]
                                  )
model = model.to(configuration_hyperParameters["device"])
optimizer = torch.optim.AdamW(model.parameters(),configuration_hyperParameters["learning_rate"])
lossFN = nn.CrossEntropyLoss(ignore_index=3)
print("number of total parameters:", sum(x.numel() for x in model.parameters()))


number of total parameters: 3561616


In [49]:
scaler = torch.amp.GradScaler("cuda")
for epoch in range(configuration_hyperParameters["epochs"]):
    
    model.train()
    
    for batchINDEX, (batch,masks) in enumerate(train_loader):
        batch = batch.to(configuration_hyperParameters["device"])
        
        masks = masks.to(configuration_hyperParameters["device"])
        
        inputs = batch[:,:-1]
        target = batch[:,1:]

        input_masks = (masks[:, :-1] == 0)

        with torch.amp.autocast("cuda"):
            
            logits = model(inputs,input_masks)
            B, T, C = logits.shape
            logits = logits.reshape(B * T, C)
            target = target.reshape(B * T)
            loss = lossFN(logits, target)
            

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        
        print("B1")
        
        if batchINDEX % 100 == 0:
            print(f"Epoch {epoch+1} | Batch {batchINDEX} | Loss: {loss.item():.4f}")
    model.eval() 
    total_val_loss = 0
    with torch.no_grad(), torch.amp.autocast("cuda"):
        for v_batch, v_masks in val_loader:
            v_batch, v_masks = v_batch.to(configuration_hyperParameters["device"]), v_masks.to(configuration_hyperParameters["device"])
            v_logits = model(v_batch[:,:-1], (v_masks[:,:-1] == 0))
            v_loss = lossFN(v_logits.reshape(-1, v_logits.size(-1)), v_batch[:,1:].reshape(-1))
            total_val_loss += v_loss.item()

    print(f"Epoch {epoch+1} , Val Loss: {total_val_loss/len(val_loader):.4f}")



    
torch.save(model.state_dict(), "my_decoder_model.pth")

B1
Epoch 1 | Batch 0 | Loss: 6.4929
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
Epoch 1 | Batch 100 | Loss: 6.4050
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
Epoch 1 | Batch 200 | Loss: 6.2789
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1
B1


In [50]:

from transformers import AutoTokenizer
retrivedTokenizer = AutoTokenizer.from_pretrained("../dataset/preproccessingDataCodes/TOKENIZER/")

In [52]:
model.eval() 
context_length = configuration_Model["block_size"]
idx = torch.tensor(retrivedTokenizer.encode("خلال ")).unsqueeze(0).to(configuration_hyperParameters["device"]) # (1, T)

with torch.no_grad():
    for _ in range(50): 
       
        idx_cond = idx[:, -context_length:] 
        
        
        logits = model(idx_cond)
        
        
        last_token_logits = logits[:, -1, :] 
        
      
        next_token = torch.argmax(last_token_logits, dim=-1).unsqueeze(1)
        
       
        idx = torch.cat((idx, next_token), dim=1)

print(retrivedTokenizer.decode(idx[0].tolist()))

خلال گونة في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم في العالم


In [18]:
# Load YOUR custom tokenizer
ids = retrivedTokenizer.encode(" الذهب")
decoded = retrivedTokenizer.decode(ids)
print(ids)
print("Decoded Text:", decoded)

[819]
Decoded Text:  الذهب
