##Imports

In [None]:
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F
from tqdm import tqdm
from torch import nn
import pandas as pd
import numpy as np
import random
import torch
import json
import math
import re

In [None]:
# Mount Google Drive so the CSV datasets and model checkpoints under
# /content/drive/MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Global hyper-parameters used by the cells below.
BATCH_SIZE=64  # samples per DataLoader batch
HIDDEN_SIZE=512*2  # passed as `hidden_size` to Encoder/Decoder (attention projection and feed-forward width)
HEAD_NUMBERS=8  # passed as `head_num` to Encoder/Decoder (number of attention heads)

##Preprocess Data

In [None]:
def string_preprocess(sen:str):
    """Normalise a raw sentence to lowercase, letter-only, single-spaced text.

    Every character that is not an ASCII letter is replaced by a space,
    single letters that stand alone between whitespace are dropped, and
    whitespace runs are collapsed to one space. The result is NOT stripped,
    so a single leading and/or trailing space can remain.

    Args:
        sen: the raw sentence.

    Returns:
        The cleaned, lower-cased sentence.
    """
    # Digits, punctuation and any other non-letter become a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', sen)

    # A lone letter surrounded by whitespace collapses into one space.
    without_lone_letters = re.sub(r"\s+[a-zA-Z]\s+", ' ', letters_only)

    # Squash whitespace runs and lowercase in one go.
    return re.sub(r'\s+', ' ', without_lone_letters).lower()

###Tokinizer

In [None]:
class Tokinizer():
  """Word-level tokenizer with a frequency-ranked vocabulary.

  The vocabulary always starts with four special tokens
  (``MASK``=0 used for padding, ``UNK``=1, ``SOS``=2, ``EOS``=3); corpus
  words are then assigned ids 4, 5, ... in order of decreasing frequency.

  Args:
    df: iterable of already-preprocessed strings used to build the vocabulary.
    max_sequance_length: fixed length of every encoded sequence. When ``None``
      the length of the longest string in ``df`` is used instead.
    vocab_size_limit: maximum vocabulary size including the four special
      tokens. ``None`` keeps every word that occurs in ``df``.
  """

  def __init__(self,df,max_sequance_length=None,vocab_size_limit=None):
    self.max_sequance_lenght=max_sequance_length
    self.longets_string_len=0

    self.vocab_size_limit=vocab_size_limit

    if self.max_sequance_lenght is not None:
      self.longets_string_len=self.max_sequance_lenght

    self.vocabolary={"MASK":0,"UNK":1,"SOS":2,"EOS":3}
    self.df=df

    self._fit_vocabalary_on_dataset()

  def __len__(self) -> int:
    """Return the vocabulary size, special tokens included."""
    return len(self.vocabolary)

  def _add_word_to_vocabalary_dict(self)->None:
    """Count corpus words and add the most frequent ones to the vocabulary."""
    word_counts={}
    for sentence in self.df:
      for word in sentence.split():
        word_counts[word]=word_counts.get(word,0)+1

    # Materialise the ranking once instead of rebuilding the key list on
    # every loop iteration.
    ranked_words=list(self._get_most_used_words_in_dict(word_counts))
    if self.vocab_size_limit is not None:
      # Four slots are already taken by the special tokens.
      ranked_words=ranked_words[:max(self.vocab_size_limit-4,0)]

    for offset,word in enumerate(ranked_words):
      self.vocabolary[word]=4+offset

  def _get_most_used_words_in_dict(self,vocabolary):
    """Return ``vocabolary`` as a dict ordered by descending count.

    The sort is stable, so words with equal counts keep first-seen order.
    """
    return dict(sorted(vocabolary.items(),reverse=True, key=lambda item: item[1]))

  def _find_lenght_of_longest_string(self,string)->None:
    """Track the longest string seen when no explicit length was configured."""
    # NOTE(review): this measures characters, not words, so the automatic
    # sequence length is an upper bound on the token count rather than the
    # exact longest token sequence.
    if self.max_sequance_lenght is None:
      self.longets_string_len=max(self.longets_string_len,len(string))

  def _fit_vocabalary_on_dataset(self)->None:
    """Derive the sequence length (if needed) and build the vocabulary."""
    for string in self.df:
      self._find_lenght_of_longest_string(string)

    self._add_word_to_vocabalary_dict()

  def __getitem__(self,idx):
    """Return the word whose token id equals ``idx``.

    Raises:
      ValueError: if no vocabulary entry has that id.
    """
    # Look the id up on this instance; the previous version read a
    # module-level `tokinizer` variable and ignored `self`.
    for word,token_id in self.vocabolary.items():
      if token_id==idx:
        return word
    raise ValueError(f"{idx} is not a known token id")

  def string_to_token(self,string) -> list:
    """Encode ``string`` as a fixed-length list of token ids.

    The result is ``SOS``, the word ids (``UNK`` for out-of-vocabulary words,
    truncated so that ``EOS`` still fits), ``EOS``, then ``MASK`` padding up
    to ``longets_string_len``.
    """
    encode=[self.vocabolary["SOS"]]
    for word in string.split():
      # Keep one slot free for the closing EOS token.
      if len(encode)==self.longets_string_len-1:
        break
      encode.append(self.vocabolary.get(word,self.vocabolary["UNK"]))

    encode.append(self.vocabolary["EOS"])

    padding=self.longets_string_len-len(encode)
    encode.extend([self.vocabolary["MASK"]]*padding)
    return encode

  def token_to_string(self,token_tensor:torch.Tensor) -> str:
    """Decode a 1-D tensor of token ids into a space-separated string.

    ``SOS`` tokens are skipped and decoding stops at the first ``EOS``.

    Raises:
      ValueError: if the tensor has more than one dimension or contains an
        id that is not in the vocabulary.
    """
    if len(token_tensor.shape)>1:
      # A bare `assert "<message>"` is always true, so raise explicitly.
      raise ValueError("tensor has too many dimensions; call multi_diminsonal_tensor_detokinizer instead")

    # Build the reverse map once per call instead of a linear search per token.
    id_to_word={}
    for word,token_id in self.vocabolary.items():
      id_to_word.setdefault(token_id,word)

    sos_id=self.vocabolary["SOS"]
    eos_id=self.vocabolary["EOS"]

    words=[]
    # tolist() works for tensors on any device, unlike numpy().
    for token in token_tensor.tolist():
      if token==sos_id:
        continue
      if token==eos_id:
        break
      if token not in id_to_word:
        raise ValueError(f"{token} is not a known token id")
      words.append(id_to_word[token])

    return " ".join(words)

  def multi_diminsonal_tensor_detokinizer(self, token_tensor)->list:
    """Decode a 2-D or 3-D tensor of token ids.

    Each sequence along the last axis becomes a one-element list holding its
    decoded string; the outer nesting mirrors the leading tensor dimensions.

    Raises:
      ValueError: if the tensor is neither 2-D nor 3-D.
    """
    tensor_dimensions=len(token_tensor.shape)

    if tensor_dimensions==2:
      return [[self.token_to_string(sequence)] for sequence in token_tensor]

    if tensor_dimensions==3:
      return [
        [[self.token_to_string(sequence)] for sequence in matrix]
        for matrix in token_tensor
      ]

    raise ValueError(f"expected a 2-D or 3-D tensor, got {tensor_dimensions} dimension(s)")


In [None]:
# Fit the tokenizer on the preprocessed "input" column of the pre-training CSV.
df=pd.read_csv("/content/drive/MyDrive/preTrainingData.csv")

# NOTE(review): `input` shadows the Python builtin of the same name for every
# later cell of the notebook.
input=[string_preprocess(str(i)) for i in df["input"]]
# Sequences are padded/truncated to 30 ids; the vocabulary is capped at 2048
# entries (the four special tokens included).
tokinizer=Tokinizer(input,max_sequance_length=30,vocab_size_limit=2048)

###Dataset

In [None]:
class promptDataset(Dataset):
  """Dataset of tokenized (question, answer) pairs loaded from a CSV file.

  The CSV at ``path`` must provide an ``input`` and a ``target`` column. Each
  cell is converted to ``str``, cleaned with ``string_preprocess`` and encoded
  with ``tokinizer.string_to_token``; each column is stored as one tensor.
  """

  def __init__(self,path:str,tokinizer):
    frame=pd.read_csv(path)

    def encode_column(column_name):
      # Clean every cell first, then encode the cleaned strings.
      cleaned=[string_preprocess(str(cell)) for cell in frame[column_name]]
      return torch.tensor([tokinizer.string_to_token(text) for text in cleaned])

    self.question_dataset_tensor=encode_column("input")
    self.answer_dataset_tensor=encode_column("target")

  def __len__(self):
    """Number of (question, answer) pairs."""
    return self.question_dataset_tensor.shape[0]

  def __getitem__(self,idx):
    """Return the ``(question, answer)`` token tensors at ``idx``."""
    question=self.question_dataset_tensor[idx]
    answer=self.answer_dataset_tensor[idx]
    return question,answer

###Create Datasets/Dataloaders

In [None]:
# Stage switch: True builds the pre-training dataloaders and runs PreTraining;
# False builds the fine-tuning dataloaders and runs TrainNN.
pretrain=True

In [None]:
if pretrain==True:
  # Pre-training stage: tokenize the pre-training CSV with the fitted tokenizer.
  pretraining_dataset=promptDataset("/content/drive/MyDrive/preTrainingData.csv",tokinizer)

  print(len(pretraining_dataset))
  # 80/20 train/test split; random_split is not seeded here, so the split
  # differs between runs.
  train_size = int(0.8 * len(pretraining_dataset))
  test_size = len(pretraining_dataset) - train_size
  train_dataset, test_dataset = torch.utils.data.random_split(pretraining_dataset, [train_size, test_size])

  pretrainingTestDataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True)
  pretrainingTrainDataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)

else:
  # Fine-tuning stage: same pipeline on the comment dataset. Note that the
  # dataloader variable names differ from the pre-training branch.
  dataset=promptDataset("/content/drive/MyDrive/shitpostCommentData.csv",tokinizer)

  train_size = int(0.8 * len(dataset))
  test_size = len(dataset) - train_size
  train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

  test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True)
  train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)

25690


In [None]:
def testDataloader(dataloader,sample):
  """Print the decoded input and target at index ``sample`` of every batch.

  Decoding uses the module-level ``tokinizer``; for each batch the input
  string is printed first, then the target string.
  """
  for batch_inputs,batch_targets in dataloader:
    for batch in (batch_inputs,batch_targets):
      decoded=tokinizer.token_to_string(batch[sample])
      print(decoded)
# testDataloader(pretrainingTestDataloader,1)

##Model

###Encoder

In [None]:
class Encoder(nn.Module):
  """Encoder stack: an ``InputLayer`` followed by six ``EncoderBlock`` modules.

  ``input_sequance_length`` is forwarded to ``InputLayer`` as its output size
  and to every ``EncoderBlock`` as both input and output width, so it also
  acts as the feature width of the stack.
  """

  def __init__(self ,input_sequance_length:int,head_num, vocabalary_size:int,hidden_size:int) -> None:
    super().__init__()
    # Modules are created in the same order as before, so nn.Sequential
    # assigns the same indices ("0" for the input layer, "1".."6" for blocks).
    layers=[InputLayer(vocabalary_size,input_sequance_length)]
    for _ in range(6):
      layers.append(
        EncoderBlock(input_sequance_length,head_num,hidden_size,input_sequance_length)
      )
    self.sequentialBlock=nn.Sequential(*layers)

  def forward(self,input:torch.LongTensor):
    """Run token ids through the embedding layer and all encoder blocks."""
    encoded=self.sequentialBlock(input)
    return encoded


###Decoder

In [None]:
class Decoder(nn.Module):
  """Decoder stack: ``InputLayer``, six ``DecoderBlock`` modules, vocabulary projection.

  The first five blocks are built with ``return_encoder_output=True`` so they
  pass ``(hidden, encoder_output)`` on to the next block; the last block
  returns only the hidden tensor, which is projected to ``vocabalary_size``
  scores per position.

  ``forward`` returns unnormalised scores (logits). The loss in this file is
  ``nn.CrossEntropyLoss`` and the sampling code applies ``F.softmax`` itself,
  so normalising here would apply softmax twice.
  """

  def __init__(self ,input_sequance_length:int, head_num,vocabalary_size:int ,hidden_size:int) -> None:
    super().__init__()

    self.inputLayer=InputLayer(vocabalary_size,input_sequance_length)

    # Same construction order as before, so state-dict keys are unchanged.
    blocks=[
      DecoderBlock(input_sequance_length,head_num,True,hidden_size, input_sequance_length)
      for _ in range(5)
    ]
    blocks.append(
      DecoderBlock(input_sequance_length,head_num,False,hidden_size, input_sequance_length)
    )
    self.sequential=nn.Sequential(*blocks)
    self.output=nn.Linear(input_sequance_length,vocabalary_size)

  def forward(self,target:torch.LongTensor, encoder_output):
    """Decode ``target`` token ids while attending to ``encoder_output``.

    Returns:
      Tensor of per-position vocabulary logits (last dimension is
      ``vocabalary_size``).
    """
    posisonalEmbedding=self.inputLayer(target)

    # Must be an ordered pair: DecoderBlock unpacks it positionally. A set
    # literal has arbitrary iteration order and could swap the two tensors.
    decoded=self.sequential((posisonalEmbedding,encoder_output))

    return self.output(decoded)

###Sub layers

In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention followed by a feed-forward sub-layer.

  Args:
    input: feature width of the incoming tensor (also the attention output width).
    head_num: number of attention heads.
    hidden_size: projection width inside attention and feed-forward.
    output: feature width produced by the feed-forward sub-layer.
  """

  def __init__(self ,input,head_num,  hidden_size, output):
    super().__init__()
    # The attention sub-layer is built without a mask (None).
    self_attention=MultiHeadedAttention(input ,head_num, None, hidden_size,input)
    feed_forward=FeedForward(input ,hidden_size ,output)
    self.sequentialBlock=nn.Sequential(self_attention,feed_forward)

  def forward(self,input):
    """Apply attention, then the feed-forward sub-layer."""
    block_output=self.sequentialBlock(input)
    return block_output


In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: self-attention, encoder attention, feed-forward.

  Args:
    input: feature width of the incoming tensor.
    head_num: number of attention heads.
    return_encoder_output: when True, ``forward`` returns
      ``(hidden, encoder_output)`` so the block can be chained inside
      ``nn.Sequential``; otherwise only ``hidden`` is returned.
    hidden_size: projection width inside attention and feed-forward.
    output: feature width produced by the feed-forward sub-layer.
  """

  def __init__(self ,input,head_num,return_encoder_output,  hidden_size, output):
    super().__init__()
    self.return_encoder_output=return_encoder_output

    # NOTE(review): despite its name, this layer is built with mask=None, so
    # no mask is applied to the decoder's self-attention here.
    self.maskedMultiHeadedAttention=MultiHeadedAttention(input ,head_num, None, hidden_size,input)
    self.multiHeadedAttention=MultiHeadedAttention(input ,head_num, None, hidden_size,input)
    self.feedForwardBlock2=FeedForward(input ,hidden_size ,output)

  def forward(self,inputData):
    """Process a ``(decoder_hidden, encoder_output)`` pair."""
    decoder_hidden,encoder_output=inputData

    self_attended=self.maskedMultiHeadedAttention(decoder_hidden)
    cross_attended=self.multiHeadedAttention(self_attended,encoder_output)
    block_output=self.feedForwardBlock2(cross_attended)

    if self.return_encoder_output==True:
      return block_output,encoder_output
    return block_output


In [None]:
class InputLayer(nn.Module):
  """Token embedding plus learned positional embedding.

  Args:
    input_size: number of rows in both embedding tables. The positional table
      uses this size too, so sequences longer than ``input_size`` cannot be
      embedded.
    output_size: embedding width.
  """

  def __init__(self,input_size,output_size):
    super().__init__()
    self.embedding=nn.Embedding(input_size,output_size)
    self.posisonalEncoding=nn.Embedding(input_size,output_size)

  def forward(self,input):
    """Embed token ids and add the embedding of each position index.

    A 1-D input is treated as a single sequence and gets a leading batch
    dimension. Any rank other than 1 or 2 raises ``ValueError`` when the
    shape is unpacked.
    """
    if input.dim()==1:
      input=input.unsqueeze(0)
    _,sequence_length=input.shape

    # Create the position ids on the input's own device rather than relying
    # on a module-level `device` variable.
    positions=torch.arange(sequence_length,device=input.device)

    return self.embedding(input)+self.posisonalEncoding(positions)

In [None]:
class FeedForward(nn.Module):
  """Position-wise feed-forward sub-layer with a residual connection.

  Computes ``LayerNorm(Linear(ReLU(Linear(x))) + x)``. Because of the
  residual sum, ``output_size`` has to be addable to the input's last
  dimension.
  """

  def __init__(self,input_size,hidden_size,output_size):
    super().__init__()
    # Created in the same order as before, so parameter initialisation and
    # state-dict keys are unchanged.
    expand=nn.Linear(input_size,hidden_size)
    activation=nn.ReLU()
    contract=nn.Linear(hidden_size,output_size)
    self.sequentialBlock=nn.Sequential(expand,activation,contract)
    self.norm=nn.LayerNorm(output_size)

  def forward(self,input):
    """Transform ``input``, add the residual and normalise."""
    residual_sum=self.sequentialBlock(input)+input
    return self.norm(residual_sum)

In [None]:
class MultiHeadedAttention(nn.Module):
  """Attention sub-layer with a residual connection and layer normalisation.

  Wraps ``AttentionMechanism`` and returns ``LayerNorm(input + attention)``.
  When ``encoder_output`` is given to ``forward`` it is handed to the
  attention mechanism together with ``input``.
  """

  def __init__(self,input_size,head_num,mask,hidden_size, output_size):
    super().__init__()

    self.attention=AttentionMechanism(input_size,head_num,mask,hidden_size, output_size)
    self.norm=nn.LayerNorm(output_size)

  def forward(self,input,encoder_output=None):
    """Attend, add the residual and normalise."""
    residual_sum=input+self.attention(input,encoder_output)
    return self.norm(residual_sum)


###Attention Mechanism

In [None]:
class AttentionMechanism(nn.Module):
  """Multi-head scaled dot-product attention.

  Args:
    input_size: feature width of the incoming tensors.
    head_num: number of attention heads; must divide ``hidden_size``.
    mask: optional additive-mask selector. Entries equal to 1 are pushed to
      ``-1e9`` before the softmax; it must broadcast against scores of shape
      ``(batch, heads, query_len, key_len)``. ``None`` disables masking.
    hidden_size: total projection width shared by all heads.
    output_size: feature width of the returned tensor.

  Raises:
    ValueError: if ``hidden_size`` is not divisible by ``head_num``.
  """

  def __init__(self,input_size,head_num,mask,hidden_size, output_size):
    super().__init__()
    if hidden_size%head_num!=0:
      raise ValueError(
        f"hidden_size ({hidden_size}) must be divisible by head_num ({head_num})"
      )
    self.head_num=head_num
    self.hidden_size=hidden_size
    self.mask=mask

    self.query=nn.Linear(input_size,hidden_size)
    self.key=nn.Linear(input_size,hidden_size)
    self.value=nn.Linear(input_size,hidden_size)

    self.output=nn.Linear(hidden_size,output_size)

  def _split_heads(self,projected):
    """Reshape ``(batch, seq, hidden)`` to ``(batch, heads, seq, hidden // heads)``."""
    batch_size,seq_len,_=projected.shape
    head_width=self.hidden_size//self.head_num
    return projected.view(batch_size,seq_len,self.head_num,head_width).transpose(1,2)

  def forward(self,input,encoder_output=None):
    """Attend from ``input`` to itself, or to ``encoder_output`` when given.

    Queries always come from ``input``; keys and values come from
    ``encoder_output`` if it is provided, otherwise from ``input``. The
    result has the query's sequence length and ``output_size`` features.
    """
    batch_size,seq_len,_=input.shape

    memory=input if encoder_output is None else encoder_output

    # Each projection is split using its own sequence length, so the
    # key/value source may be longer or shorter than the query.
    query=self._split_heads(self.query(input))
    key=self._split_heads(self.key(memory))
    value=self._split_heads(self.value(memory))

    attended=self.calculateScaledDotProduct(query,key,value)

    # Merge the heads back into one feature dimension.
    merged=attended.transpose(1,2).contiguous().view(batch_size,seq_len,self.hidden_size)

    return self.output(merged)

  def calculateScaledDotProduct(self,query,key,value):
    """Return ``softmax(Q K^T / sqrt(d_k) + mask) V``."""
    dk=key.shape[-1]
    # Scale by sqrt(d_k), not d_k, as in standard scaled dot-product attention.
    scaled_attention_logits=torch.matmul(query,key.transpose(-2,-1))/math.sqrt(dk)
    if self.mask is not None:
      # The mask is a plain attribute and does not follow Module.to(), so
      # align its device with the scores before adding it.
      mask=torch.as_tensor(self.mask,device=scaled_attention_logits.device)
      scaled_attention_logits=scaled_attention_logits+mask*-1e9
    return torch.matmul(F.softmax(scaled_attention_logits,-1),value)

  def calculateScore(self,query,key):
    """Return the dot product of two 1-D tensors."""
    return torch.dot(query,key)

# model=AttentionMechanism(1024,4,1024,1024)
# x=model(torch.rand(1,512,1024))

###Seq2Seq

In [None]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper with a sampling-based generation helper.

  Args:
    encoder: callable mapping input token ids to the encoder representation.
    decoder: callable ``decoder(target, encoder_output)`` returning
      per-position vocabulary scores.
    device: device handle stored on the instance.
    target_vocab_size: size of the target vocabulary (stored for reference).
  """

  def __init__(self, encoder, decoder, device, target_vocab_size):
    super().__init__()
    self.encoder=encoder
    self.decoder=decoder
    self.device=device
    self.target_vocab_size=target_vocab_size

  def forward(self,input:torch.LongTensor, target:torch.LongTensor,softmax=False):
    """Encode ``input`` and decode ``target`` against it.

    Returns the decoder output as-is, or its softmax over the last dimension
    when ``softmax`` is true.
    """
    decoder_output=self.decoder(target,self.encoder(input))

    if softmax:
      return F.softmax(decoder_output,-1)
    return decoder_output

  def generate_tokens(self, input, start_token, end_token, max_length=512):
    """Sample an output sequence for a single input sequence.

    A buffer of ``max_length`` ids is initialised with ``start_token``. At
    step ``i`` the decoder is run on the whole buffer, a token is sampled from
    the softmax of the scores at the LAST position, and stored at index ``i``.
    Sampling stops early once ``end_token`` is drawn.

    Only batch size 1 is supported because the sampled id is read with
    ``.item()``.

    NOTE(review): the distribution is always taken from position ``-1`` of a
    fixed-length buffer rather than from the position being filled; confirm
    this matches how the decoder is trained.

    Returns:
      1-D LongTensor of length ``max_length`` on the same device as ``input``.
    """
    # Sampling needs no gradients; skip building autograd graphs.
    with torch.no_grad():
      encoder_output=self.encoder(input)

      # Allocate on the input's device instead of a module-level `device`.
      target_sequence=torch.full(
        (max_length,),start_token,dtype=torch.long,device=input.device
      )

      for i in range(max_length):
        decoder_output=self.decoder(target_sequence,encoder_output)

        next_token_probs=F.softmax(decoder_output,-1)
        next_token=torch.multinomial(next_token_probs[:, -1, :], num_samples=1).item()

        target_sequence[i]=next_token
        if next_token==end_token:
          break
    return target_sequence


# Vocabulary size (special tokens included); passed to both halves of the model.
TOKINIZER_VOCAB=len(tokinizer)

# NOTE: the tokenizer's padded sequence length is passed as
# `input_sequance_length`, which Encoder/Decoder forward to InputLayer as the
# embedding output size.
encoder=Encoder(tokinizer.longets_string_len, HEAD_NUMBERS, TOKINIZER_VOCAB, HIDDEN_SIZE).to(device)
decoder=Decoder(tokinizer.longets_string_len, HEAD_NUMBERS, TOKINIZER_VOCAB, HIDDEN_SIZE).to(device)

seq2seq=Seq2Seq(encoder,decoder,device,TOKINIZER_VOCAB).to(device)

# seq2seq



In [None]:
def make_prediction(string:str):
  """Generate and decode a response for ``string`` with the global model.

  The string is tokenized with the module-level ``tokinizer``, fed to
  ``seq2seq.generate_tokens`` and the sampled ids are decoded back to text.
  The length of the generated id buffer is printed as a side effect.
  """
  input_sequence=torch.tensor([tokinizer.string_to_token(string)]).to(device)
  # Read the start/end ids from the vocabulary instead of hard-coding 2 and 3.
  generated_tokens = seq2seq.generate_tokens(
    input_sequence,
    tokinizer.vocabolary["SOS"],
    tokinizer.vocabolary["EOS"],
    tokinizer.longets_string_len,
  )
  print(len(generated_tokens))
  return tokinizer.token_to_string(generated_tokens.squeeze(0).cpu())
pred=make_prediction("premiering adolf hitler starring samuel jackson as adolf hilte")
print(pred)

30
characterized commission felt event face commission prowess movie religious evidence tv interest labor growing late release parks transgender left respectively content photos today support critical even australia obtained california school


##Optimizer & Loss


In [None]:
# Adam over every Seq2Seq parameter; the training loops below read this
# module-level `optimizer` directly.
optimizer=torch.optim.Adam(seq2seq.parameters(),lr=0.001)
# Token id 0 is the tokenizer's "MASK" padding id, so padded target positions
# are excluded from the loss.
loss=nn.CrossEntropyLoss(ignore_index=0)

In [None]:
def count_parameters(model):
  """Return the total element count of the model's trainable parameters.

  Parameters whose ``requires_grad`` flag is false are not counted.
  """
  total=0
  for parameter in model.parameters():
    if parameter.requires_grad:
      total+=parameter.numel()
  return total

# Report the trainable-parameter count of the assembled Seq2Seq model.
print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

The model has 3,328,652 trainable parameters


##Training Loop

###Training Util

In [None]:
class TrainingUtil():
  """Shared helpers for the training classes.

  The methods read ``self.model``, ``self.loss`` and ``self.device``; those
  attributes are expected to be assigned by the class that uses these helpers.
  """

  def accuracy(self,predictions,targets):
    """Return the percentage of positions where ``predictions == targets``.

    Every position counts, including ones the loss may be set up to ignore.
    """
    assert predictions.shape == targets.shape, "Shapes of predictions and targets must match."

    matches = (predictions == targets).sum().item()
    return matches / targets.numel() * 100

  def getLossAndAccuracy(self,prediction,target):
    """Compute ``(loss, accuracy)`` for one batch.

    ``prediction`` is flattened to ``(-1, classes)`` and ``target`` to
    ``(-1,)`` for the loss; accuracy compares the argmax over dimension 2 of
    ``prediction`` with ``target``.
    """
    prediction,target=prediction.to(self.device),target.to(self.device)

    flat_scores=prediction.view(-1,prediction.shape[-1])
    flat_targets=target.view(-1)

    batch_loss=self.loss(flat_scores,flat_targets)
    batch_accuracy=self.accuracy(prediction.argmax(2),target)
    return batch_loss,batch_accuracy

  def make_prediction(self,input,target)->float:
    """Move both tensors to ``self.device`` and return ``self.model(input, target)``.

    NOTE(review): the value returned is whatever the model returns, not a
    float as the annotation states.
    """
    return self.model(input.to(self.device),target.to(self.device))



###Training

In [None]:
class TrainNN(TrainingUtil):
  """Fine-tuning driver; constructing an instance starts training immediately.

  Every epoch iterates over ALL batches of both dataloaders, saves the model
  state dict to ``/content/drive/MyDrive/{save_path}.pth`` and prints the mean
  per-batch loss and accuracy.

  Args:
    EPOCHS: number of epochs to run.
    model: module called as ``model(input, target)``.
    loss: loss callable applied to flattened predictions and targets.
    device: device the batches are moved to.
    save_path: checkpoint file name (without extension) inside Drive.
    tokinizer: tokenizer, stored for reference.
    train_dataloader: iterable of ``(input, target)`` training batches.
    test_dataloader: iterable of ``(input, target)`` evaluation batches.
    optimizer: optimizer to step. When omitted, the module-level
      ``optimizer`` variable is used, as before.
  """

  def __init__(self, EPOCHS ,model,loss ,device ,save_path ,tokinizer, train_dataloader, test_dataloader, optimizer=None):
    self.EPOCHS=EPOCHS
    self.model=model
    self.loss=loss
    self.device=device
    self.save_path=save_path
    self.tokinizer=tokinizer
    self.train_dataloader=train_dataloader
    self.test_dataloader=test_dataloader
    self.optimizer=optimizer

    self.start_model_training_loop()

  def _get_optimizer(self):
    """Return the injected optimizer, or the module-level one as a fallback."""
    if self.optimizer is not None:
      return self.optimizer
    return optimizer

  @staticmethod
  def _epoch_mean(batch_metrics):
    """Consume ``(loss, acc)`` pairs and return their unweighted means.

    Losses are converted to Python floats so no autograd graph is retained.
    Returns ``(nan, nan)`` when there are no batches.
    """
    losses=[]
    accuracies=[]
    for batch_loss,batch_acc in batch_metrics:
      losses.append(float(batch_loss))
      accuracies.append(batch_acc)
    if not losses:
      return float("nan"),float("nan")
    return sum(losses)/len(losses),sum(accuracies)/len(accuracies)

  def start_model_training_loop(self)->None:
    """Run all epochs: train, evaluate, checkpoint and report."""
    for epoch in tqdm(range(self.EPOCHS)):
      # Exhaust the generators; taking only their first item would train and
      # evaluate on a single batch per epoch.
      train_loss,train_acc=self._epoch_mean(self.training_loop())
      test_loss,test_acc=self._epoch_mean(self.testing_loop())

      torch.save(self.model.state_dict(), f"/content/drive/MyDrive/{self.save_path}.pth")
      print(f"\n epoch: {epoch} | train_loss: {train_loss:.2f}, train_acc: {train_acc:.1f}% | test_loss: {test_loss:.2f}, test_acc: {test_acc:.1f}%")

  def training_loop(self):
    """Generator: one optimisation step per batch, yielding ``(loss, acc)``."""
    self.model.train()
    active_optimizer=self._get_optimizer()
    for batch_input,batch_target in self.train_dataloader:
      active_optimizer.zero_grad()
      prediction=self.make_prediction(batch_input,batch_target)
      train_loss,train_acc=self.getLossAndAccuracy(prediction,batch_target)

      train_loss.backward()
      active_optimizer.step()
      yield train_loss,train_acc

  def testing_loop(self):
    """Generator: evaluate each test batch, yielding ``(loss, acc)``."""
    self.model.eval()
    with torch.inference_mode():
      for batch_input,batch_target in self.test_dataloader:
        prediction=self.make_prediction(batch_input,batch_target)
        test_loss,test_acc=self.getLossAndAccuracy(prediction,batch_target)

        yield test_loss,test_acc


###Pre Training

In [None]:
class PreTraining(TrainingUtil):
  """Pre-training driver; constructing an instance starts training immediately.

  Every epoch iterates over ALL batches of the training dataloader, then over
  ALL batches of the test dataloader, saves the model state dict to
  ``/content/drive/MyDrive/{savePath}.pth`` and prints the mean per-batch loss
  and accuracy of both passes.

  Args:
    EPOCHS: number of epochs to run.
    model: module called as ``model(input, target)``.
    loss: loss callable applied to flattened predictions and targets.
    device: device the batches are moved to.
    savePath: checkpoint file name (without extension) inside Drive.
    tokinizer: tokenizer, stored for reference.
    train_dataloader: iterable of ``(input, target)`` training batches.
    test_dataloader: iterable of ``(input, target)`` evaluation batches.
    optimizer: optimizer to step. When omitted, the module-level
      ``optimizer`` variable is used, as before.
  """

  def __init__(self, EPOCHS, model, loss, device, savePath, tokinizer,train_dataloader,test_dataloader, optimizer=None):
    self.EPOCHS=EPOCHS
    self.loss=loss
    self.model=model
    self.device=device
    self.savePath=savePath
    self.tokinizer=tokinizer
    self.train_dataloader=train_dataloader
    self.test_dataloader=test_dataloader
    self.optimizer=optimizer
    self.startPreTraining()

  def _get_optimizer(self):
    """Return the injected optimizer, or the module-level one as a fallback."""
    if self.optimizer is not None:
      return self.optimizer
    return optimizer

  @staticmethod
  def _epoch_mean(batch_metrics):
    """Consume ``(loss, acc)`` pairs and return their unweighted means.

    Losses are converted to Python floats so no autograd graph is retained.
    Returns ``(nan, nan)`` when there are no batches.
    """
    losses=[]
    accuracies=[]
    for batch_loss,batch_acc in batch_metrics:
      losses.append(float(batch_loss))
      accuracies.append(batch_acc)
    if not losses:
      return float("nan"),float("nan")
    return sum(losses)/len(losses),sum(accuracies)/len(accuracies)

  def startPreTraining(self):
    """Run all epochs: train, evaluate, checkpoint and report."""
    # Use the epoch count given to this instance, not the global EPOCHS.
    for epoch in range(self.EPOCHS):
      # Exhaust the generators; taking only their first item would train and
      # evaluate on a single batch per epoch.
      train_loss,train_acc=self._epoch_mean(self.trainingLoop())
      test_loss,test_acc=self._epoch_mean(self.testingLoop())

      torch.save(self.model.state_dict(), f"/content/drive/MyDrive/{self.savePath}.pth")
      print(f"epoch: {epoch} | loss: {train_loss:.2f}, acc: {train_acc:.1f}% | loss: {test_loss:.2f}, acc: {test_acc:.1f}%")

  def trainingLoop(self):
    """Generator: one optimisation step per batch, yielding ``(loss, acc)``."""
    self.model.train()
    active_optimizer=self._get_optimizer()
    for batch_input,batch_target in self.train_dataloader:
      active_optimizer.zero_grad()

      prediction=self.make_prediction(batch_input,batch_target)

      train_loss,train_acc=self.getLossAndAccuracy(prediction,batch_target)

      train_loss.backward()
      active_optimizer.step()
      yield train_loss,train_acc

  def testingLoop(self):
    """Generator: evaluate each held-out batch, yielding ``(loss, acc)``."""
    self.model.eval()

    with torch.inference_mode():
      # Evaluate on the test split; iterating the training loader here would
      # report training metrics as test metrics.
      for batch_index,(batch_input,batch_target) in enumerate(self.test_dataloader):

        prediction=self.make_prediction(batch_input,batch_target)
        # Keep the one-preview-per-epoch output now that every batch is visited.
        if batch_index==0:
          print("prediction: ",prediction.argmax(2)[0])

        test_loss,test_acc=self.getLossAndAccuracy(prediction,batch_target)

        yield test_loss,test_acc

###Start Training

In [None]:
# Number of epochs for whichever stage runs below.
EPOCHS=500

# Both trainer classes start their loop from __init__, so constructing one
# runs the whole stage.
if pretrain==True:
  PreTraining(EPOCHS,seq2seq,loss,device,"preTrainedTransformer_3M",tokinizer,pretrainingTrainDataloader,pretrainingTestDataloader)
else:
  # Fine-tuning resumes from the checkpoint written by the pre-training stage.
  seq2seq.load_state_dict(torch.load("/content/drive/MyDrive/preTrainedTransformer_3M.pth",map_location=device))
  TrainNN(EPOCHS,seq2seq,loss,device,"transformer_model",tokinizer,train_dataloader,test_dataloader)

prediction:  tensor([1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908,
        1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908, 1908,  444,
        1908,    3, 1908, 1908, 1908, 1908], device='cuda:0')
epoch: 0 | loss: 7.62, acc: 0.0% | loss: 7.62, acc: 0.0%
prediction:  tensor([1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807,
        1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807, 1807,
        1807, 1807, 1807, 1807, 1807, 1807], device='cuda:0')
epoch: 1 | loss: 7.62, acc: 0.0% | loss: 7.62, acc: 0.0%
prediction:  tensor([3, 1, 3, 1, 1, 1, 1, 3, 1, 3, 3, 1, 1, 3, 1, 1, 3, 3, 1, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3], device='cuda:0')
epoch: 2 | loss: 7.62, acc: 0.0% | loss: 7.62, acc: 10.4%
prediction:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1], device='cuda:0')
epoch: 3 | loss: 7.62, acc: 11.1% | loss: 7.62, acc: 9.9%
prediction:  tensor([1, 1,

KeyboardInterrupt: ignored