##Imports

In [None]:
from torch.utils.data import Dataset,DataLoader
from dataclasses import dataclass
import torch.nn.functional as F
from typing import Union
from tqdm import tqdm
from torch import nn
import pandas as pd
import numpy as np
import random
import torch
import json
import math
import re

In [None]:
# Colab-only: mount Google Drive at /content/drive, where the CSV files read below live.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
# Mini-batch size used by the DataLoaders created in TrainingUtil.
BATCH_SIZE=32
# Hidden width passed to the Encoder/Decoder (attention projections and feed-forward layers).
HIDDEN_SIZE=512*2
# Number of attention heads; the hidden width is divided evenly between them.
HEAD_NUMBERS=8

## Preprocess Data

In [None]:
def string_preprocess(sen:str):
    """Normalise raw text to lower-case words separated by single spaces.

    Every character that is not an ASCII letter becomes a space, single
    letters surrounded by whitespace are dropped, and whitespace runs are
    collapsed. Leading and trailing spaces are not stripped.
    """
    substitutions = (
        ('[^a-zA-Z]', ' '),          # punctuation, digits and other symbols
        (r"\s+[a-zA-Z]\s+", ' '),    # stand-alone single characters
        (r'\s+', ' '),               # runs of whitespace
    )

    cleaned = sen
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower()

### Tokenizer

In [None]:
class TokinizerUtils():
  """Vocabulary bookkeeping for the word-level tokenizer.

  Collects raw strings in `fit_dataset`, counts whitespace-separated words and
  assigns ids to the most frequent ones until `vocab_size_limit` entries exist.
  Ids 0-4 are reserved for the special tokens listed in `vocabolary`.
  """

  # Reserved special tokens. Every instance starts from its own copy of this
  # mapping so that fitting one tokenizer never leaks words into another.
  vocabolary=dict({"PAD":0,"UNK":1,"MASK":2,"SOS":3,"EOS":4})

  def __init__(self,max_sequance_length,vocab_size_limit,longets_string_len=None):
    """Store the limits; a given `max_sequance_length` overrides `longets_string_len`."""
    self.fit_dataset=[]
    self.longets_string_len=longets_string_len

    self.max_sequance_length=max_sequance_length
    self.vocab_size_limit=vocab_size_limit

    # Per-instance copy: mutating the class-level dict would be shared by all instances.
    self.vocabolary=dict(type(self).vocabolary)

    if self.max_sequance_length is not None:
      self.longets_string_len=self.max_sequance_length

  def _extract_data_from_pandas_csv(self,dataframe_paths:list):
    """Append every cell of every column of the given CSV files to `fit_dataset` as a string."""
    for dataframe_path in dataframe_paths:
      dataframe=pd.read_csv(dataframe_path)

      _,column_numbers=dataframe.shape
      for column_number in range(column_numbers):
        column=dataframe.iloc[:,column_number]
        self.fit_dataset.extend(str(cell) for cell in column)

  def __len__(self) -> int:
    """Number of entries in the vocabulary, special tokens included."""
    return len(self.vocabolary)

  def __getitem__(self,index_or_word)->Union[int,str]:
    """Map an id to its word or a word to its id.

    Raises:
      TypeError: if the key is neither `int` nor `str`.
      ValueError: if the id or word is not in the vocabulary.
    """
    if isinstance(index_or_word,str):
      return self._get_index_from_word(index_or_word)
    # bool is a subclass of int but was never accepted as an index.
    if isinstance(index_or_word,int) and not isinstance(index_or_word,bool):
      return self._get_word_from_index(index_or_word)
    raise TypeError("data type is not suported, make sure its a int or str")

  def _get_word_from_index(self,index):
    """Return the first word stored under `index`; ValueError if there is none."""
    for word,word_index in self.vocabolary.items():
      if word_index==index:
        return word
    raise ValueError(f"{index!r} is not in the vocabulary")

  def _get_index_from_word(self,word):
    """Return the id stored for `word`; ValueError if the word is unknown."""
    try:
      return self.vocabolary[word]
    except KeyError:
      raise ValueError(f"{word!r} is not in the vocabulary") from None

  def _fit_vocabalary_on_dataset(self)->None:
    self._find_lenght_of_longest_string()

  def _find_lenght_of_longest_string(self)->None:
    """Track the longest string in `fit_dataset` when no explicit maximum was given.

    NOTE(review): the length is measured in characters, not words — confirm
    this is the intended bound for the encoded sequence length.
    """
    if self.max_sequance_length is not None:
      return
    for string in self.fit_dataset:
      # `longets_string_len` may still be None here; the first string initialises it.
      if self.longets_string_len is None or len(string)>self.longets_string_len:
        self.longets_string_len=len(string)

  def fit(self)->None:
    """Build the vocabulary from `fit_dataset`, most frequent words first."""
    self._find_lenght_of_longest_string()
    word_counts=self._make_dict_of_dataset()

    most_used_words_in_order=self._sort_dict_by_most_word_count(word_counts)
    self._add_words_to_dict_in_order(most_used_words_in_order)


  def _make_dict_of_dataset(self)->dict:
    """Count how often each whitespace-separated word occurs in `fit_dataset`."""
    word_counts=dict()
    for text in self.fit_dataset:
      for word in text.split():
        word_counts[word]=word_counts.get(word,0)+1
    return word_counts

  def _sort_dict_by_most_word_count(self,vocabolary):
    """Return the counts re-ordered from most to least frequent (ties keep insertion order)."""
    return dict(sorted(vocabolary.items(),reverse=True, key=lambda item: item[1]))

  def _add_words_to_dict_in_order(self,most_used_words_in_order)->None:
    """Assign the next free id to each new word until `vocab_size_limit` is reached.

    Words that already have an id (in particular a corpus word that spells a
    special token such as "PAD") are skipped so reserved ids are never overwritten.
    """
    for word in most_used_words_in_order:
      if len(self.vocabolary)>=self.vocab_size_limit:
        break
      if word in self.vocabolary:
        continue
      # Ids are contiguous from 0, so the current size is the next free id.
      self.vocabolary[word]=len(self.vocabolary)

In [None]:
class Tokinizer(TokinizerUtils):
  """Word-level tokenizer: text -> fixed-length list of ids, and 1-D id tensor -> text."""

  def __init__(self,max_sequance_length,vocab_size_limit,longets_string_len=None):
    super().__init__(max_sequance_length,vocab_size_limit,longets_string_len)

  def encode(self,string) -> list:
    """Return `[SOS, ids..., EOS, PAD...]` as a list of `longets_string_len` ints.

    Words beyond the available length are dropped; unknown words become UNK.
    """
    encode=[self.vocabolary["SOS"]]
    encode=self._encode_string(string,encode)
    encode.append(self.vocabolary["EOS"])

    return self._pad_encoding(encode)

  def _encode_string(self,string,encode)->list:
    """Append the id of each word of `string` to `encode`, leaving one slot for EOS."""
    unknown=self.vocabolary["UNK"]
    for word in string.split():
      if len(encode)==self.longets_string_len-1:
        break
      encode.append(self.vocabolary.get(word,unknown))
    return encode

  def _pad_encoding(self,encode):
    """Right-pad `encode` in place with PAD up to `longets_string_len`."""
    requredPaddingLength=self.longets_string_len-len(encode)
    # A negative count multiplies to an empty list, so over-long input is left untouched.
    encode.extend([self.vocabolary["PAD"]]*requredPaddingLength)
    return encode

  def decode(self,token_tensor:torch.Tensor) -> str:
    """Turn a 1-D tensor of ids back into a space-separated string.

    Raises:
      ValueError: if the tensor has more than one dimension or holds an id
        that is not in the vocabulary.
    """
    self._check_tensor_dimension(token_tensor)
    return self._decode_tensor(token_tensor)

  def _check_tensor_dimension(self,tensor):
    tensor_dimensions=len(tensor.shape)
    if tensor_dimensions>1:
      raise  ValueError(f"tensor has to many dimensions. expected 1 got {tensor_dimensions}")

  def _decode_tensor(self,tensor):
    """Decode ids up to (not including) the first EOS; SOS tokens are skipped."""
    start_token=self.vocabolary["SOS"]
    end_token=self.vocabolary["EOS"]

    # Build the reverse mapping once per call instead of once per token.
    # setdefault keeps the first word when two words share an id.
    index_to_word={}
    for word,index in self.vocabolary.items():
      index_to_word.setdefault(index,word)

    words=[]
    for token in tensor.cpu().tolist():
      if token==start_token:
        continue
      if token==end_token:
        break
      try:
        words.append(index_to_word[token])
      except KeyError:
        raise ValueError(f"{token} is not in the vocabulary") from None

    return " ".join(words)


# Load the comment dataset and keep its "input" column as plain strings.
tokinizerFitData=pd.read_csv("/content/drive/MyDrive/shitpostCommentData.csv")

# NOTE(review): this module-level name shadows the builtin `input`.
input=list(map(str,tokinizerFitData["input"]))

# Sequences are capped at 30 tokens and the vocabulary at 6000 entries.
tokinizer=Tokinizer(vocab_size_limit=6000,max_sequance_length=30)
tokinizer._extract_data_from_pandas_csv(
  [
    "/content/drive/MyDrive/shitpostCommentData.csv",
    "/content/drive/MyDrive/preTrainingData.csv",
  ]
)
# NOTE(review): the vocabulary is fitted on the raw cell text, whereas promptDataset
# runs string_preprocess (lower-casing, letters only) before encoding — confirm the
# two are meant to differ.
tokinizer.fit()

###Dataset

In [None]:
class promptDataset(Dataset):
  """Question/answer pairs read from a CSV with "input" and "target" columns.

  Each cell is stringified, cleaned with `string_preprocess` and encoded by the
  given tokenizer; the two columns are stored as tensors of token ids.
  """

  def __init__(self,path:str,tokinizer):
    frame=pd.read_csv(path)

    def encode_column(column_name):
      # str() first so missing cells are encoded as text too.
      cleaned=[string_preprocess(str(cell)) for cell in frame[column_name]]
      return torch.tensor([tokinizer.encode(text) for text in cleaned])

    self.question_dataset_tensor=encode_column("input")
    self.answer_dataset_tensor=encode_column("target")

  def __len__(self):
    """Number of question/answer pairs."""
    return len(self.question_dataset_tensor)

  def __getitem__(self,idx):
    """Return the `(question_ids, answer_ids)` pair at `idx`."""
    question=self.question_dataset_tensor[idx]
    answer=self.answer_dataset_tensor[idx]
    return question,answer

###Create Datasets/Dataloaders

In [None]:
def testDataloader(dataloader,sample):
  """Print the decoded input and target at index `sample` of every batch in `dataloader`."""
  for input,target in dataloader:
    # The tokenizer's id-to-text method is `decode`; it has no `token_to_string`.
    print(tokinizer.decode(input[sample]))
    print(tokinizer.decode(target[sample]))
# testDataloader(pretrainingTestDataloader,1)

##Model

###Encoder

In [None]:
class Encoder(nn.Module):
  """Token ids -> InputLayer -> six EncoderBlocks -> linear projection to vocabulary size.

  The embedding width equals `input_sequance_length`, which is also the width
  each EncoderBlock consumes and produces.
  """

  def __init__(self ,input_sequance_length:int,head_num, vocabalary_size:int,hidden_size:int) -> None:
    super().__init__()
    block_count=6

    # Layers are created in the order they run so parameter initialisation order is unchanged.
    layers=[InputLayer(vocabalary_size,input_sequance_length)]
    for _ in range(block_count):
      layers.append(EncoderBlock(input_sequance_length, head_num,  hidden_size, input_sequance_length))
    layers.append(nn.Linear(input_sequance_length,vocabalary_size))

    self.sequentialBlock=nn.Sequential(*layers)


  def forward(self,input:torch.LongTensor):
    """Run the whole stack on a tensor of token ids."""
    encoded=self.sequentialBlock(input)
    return encoded


###Decoder

In [None]:
class Decoder(nn.Module):
  """Target ids + encoder output -> six DecoderBlocks -> vocabulary-size logits.

  The encoder output (last dimension = vocabulary size) is first projected back
  to the embedding width so the blocks can attend over it.
  """

  def __init__(self ,input_sequance_length:int, head_num,vocabalary_size:int ,hidden_size:int) -> None:
    super().__init__()

    self.inputLayer=InputLayer(vocabalary_size,input_sequance_length)

    # The first five blocks pass the encoder output along with their result;
    # the last one returns only its result.
    self.sequential=nn.Sequential(
        *[
            DecoderBlock(input_sequance_length,head_num,True,hidden_size, input_sequance_length)
            for _ in range(5)
        ],
        DecoderBlock(input_sequance_length,head_num,False,hidden_size, input_sequance_length),
    )

    self.reshapeEncoder=nn.Linear(vocabalary_size,input_sequance_length)

    self.output=nn.Linear(input_sequance_length,vocabalary_size)

  def forward(self,target:torch.LongTensor, encoder_output):
    """Return unnormalised scores over the vocabulary for every target position.

    The scores are logits, not probabilities: the loss used in this file
    (`nn.CrossEntropyLoss`) applies log-softmax itself, and both
    `Seq2Seq.forward(softmax=True)` and `Seq2Seq.generate_tokens` apply their
    own softmax on top of this output.
    """
    reshapeEncoder=self.reshapeEncoder(encoder_output)
    posisonalEmbedding=self.inputLayer(target)

    # Must be an ordered pair: DecoderBlock unpacks it as (input, encoder_output).
    # A set has no defined order, so the two tensors could arrive swapped.
    sequential=self.sequential((posisonalEmbedding,reshapeEncoder))

    return self.output(sequential)

###Encoder/Decoder block

In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: multi-headed self-attention followed by a feed-forward sub-layer."""

  def __init__(self, input_sequance_length, head_num,  hidden_size, output):
    super().__init__()
    # The third argument (2) is the `mask` value forwarded to the attention mechanism.
    self_attention=MultiHeadedAttention(input_sequance_length, head_num, 2, hidden_size, input_sequance_length)
    feed_forward=FeedForward(input_sequance_length, hidden_size, output)

    self.sequentialBlock=nn.Sequential(self_attention,feed_forward)

  def forward(self,input):
    """Apply attention, then the feed-forward sub-layer."""
    block_output=self.sequentialBlock(input)
    return block_output


In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: self-attention, attention over the encoder output, feed-forward.

  Takes a pair `(input, encoder_output)`. When `return_encoder_output` is True
  the result is returned together with `encoder_output`, so blocks can be
  chained inside `nn.Sequential`.
  """

  def __init__(self ,input,head_num,return_encoder_output,  hidden_size, output):
    super().__init__()
    self.return_encoder_output=return_encoder_output

    # Both attention layers are built with the same arguments; `mask` is None for each.
    attention_arguments=(input ,head_num, None, hidden_size,output)
    self.maskedMultiHeadedAttention=MultiHeadedAttention(*attention_arguments)
    self.multiHeadedAttention=MultiHeadedAttention(*attention_arguments)

    self.feedForwardBlock2=FeedForward(input ,hidden_size ,output)

  def forward(self,inputData):
    decoder_state,encoder_output=inputData

    self_attended=self.maskedMultiHeadedAttention(decoder_state)
    cross_attended=self.multiHeadedAttention(self_attended,encoder_output)
    block_output=self.feedForwardBlock2(cross_attended)

    if self.return_encoder_output==True:
      return block_output,encoder_output
    return block_output


###Sub layers

In [None]:
class InputLayer(nn.Module):
  """Token embedding plus a learned positional embedding.

  Args:
    input_size: number of rows in both embedding tables (the vocabulary size
      at the call sites in this file).
    output_size: embedding width.
  """

  def __init__(self,input_size,output_size):
    super().__init__()
    self.embedding=nn.Embedding(input_size,output_size)
    # The positional table shares the token table's row count, so sequences
    # may be at most `input_size` positions long.
    self.posisonalEncoding=nn.Embedding(input_size,output_size)

  def forward(self,input):
    """Embed a `(batch, sequence)` tensor of ids; a 1-D tensor is treated as a batch of one.

    Raises:
      ValueError: if `input` is neither 1-D nor 2-D.
    """
    if input.dim()==1:
      input=input.unsqueeze(0)
    if input.dim()!=2:
      raise ValueError(f"expected a 1D or 2D tensor of token ids but got {input.dim()}D")

    _,squanceLength=input.shape

    # Create the position ids on the input's own device rather than a global one.
    positions=torch.arange(squanceLength,device=input.device)

    embedding=self.embedding(input)
    posisonalEmbedding=self.posisonalEncoding(positions)

    return embedding+posisonalEmbedding

In [None]:
class FeedForward(nn.Module):
  """Linear -> ReLU -> Linear, added back onto the input and layer-normalised."""

  def __init__(self,input_size,hidden_size,output_size):
    super().__init__()
    expand=nn.Linear(input_size,hidden_size)
    contract=nn.Linear(hidden_size,output_size)

    self.sequentialBlock=nn.Sequential(expand,nn.ReLU(),contract)
    self.norm=nn.LayerNorm(output_size)


  def forward(self,input):
    """Return `norm(block(input) + input)`; needs `output_size` to match the input width."""
    residual=input
    transformed=self.sequentialBlock(input)
    return self.norm(transformed+residual)

### Masked Language Model

In [None]:
class MaskStringVectorWithPorbebility():
  """Randomly corrupt sequences of token ids for masked-language-model training.

  Each token is selected with probability `wordChanceOfSelection`. A selected
  token is replaced by `maskToken` with probability
  `wordChanceOfSwapWithMaskToken`, by a random vocabulary id with probability
  `wordChanceOfSwapWithRandomToken`, and is otherwise left unchanged.
  """

  def __init__(self, tokinizer, maskToken, wordChanceOfSelection=.15 ,wordChanceOfSwapWithMaskToken=.8, wordChanceOfSwapWithRandomToken=.1, wordChanceOfstayingTheSame=.1):
    self.tokinizer=tokinizer
    self.maskToken=maskToken
    self.wordChanceOfSelection=wordChanceOfSelection
    self.wordChanceOfSwapWithMaskToken=wordChanceOfSwapWithMaskToken
    self.wordChanceOfSwapWithRandomToken=wordChanceOfSwapWithRandomToken
    self.wordChanceOfstayingTheSame=wordChanceOfstayingTheSame

  def _batchedDataset(self,inputData):
    """Mask every row of a 2-D collection of token ids."""
    return [self._maskSequanceVector(sequence) for sequence in inputData]

  def _maskSequanceVector(self,VectorSequance):
    """Return a list copy of one sequence with randomly selected tokens modified."""
    vectorAfterMaksingMechanisam=[]
    for idx,token in enumerate(VectorSequance):
      if random.random()<self.wordChanceOfSelection:
        token=self._selectWordWithProbAndModify(idx,VectorSequance)
      vectorAfterMaksingMechanisam.append(token)

    return vectorAfterMaksingMechanisam

  def _selectWordWithProbAndModify(self,idx,sentanceInVectorForm):
    """Choose the replacement for a selected token.

    A single random draw is compared against cumulative thresholds so the
    three outcomes occur with the configured probabilities. Drawing a new
    number for each branch would make the later branches conditional on the
    earlier ones failing.
    """
    draw=random.random()
    mask_threshold=self.wordChanceOfSwapWithMaskToken
    random_threshold=mask_threshold+self.wordChanceOfSwapWithRandomToken

    if draw<mask_threshold:
      return self.maskToken
    if draw<random_threshold:
      return random.randrange(len(self.tokinizer))
    # Remaining probability mass: keep the original token.
    return sentanceInVectorForm[idx]

  def maskSentance(self,inputData : torch.Tensor):
    """Mask a 1-D sequence or a 2-D batch of token ids.

    Returns:
      A new CPU tensor of dtype `torch.long` with the same shape as the input,
      so the result can be used directly as embedding indices.

    Raises:
      ValueError: if the input is neither 1-D nor 2-D.
    """
    dimensions=inputData.dim()
    tokens=inputData.detach().cpu().tolist()
    if dimensions==2:
      return torch.tensor(self._batchedDataset(tokens),dtype=torch.long)
    elif dimensions==1:
      return torch.tensor(self._maskSequanceVector(tokens),dtype=torch.long)
    else:
      raise ValueError(f"expected data batch size to be 1D or 2D but resived {dimensions}D")

In [None]:
class MaskedLanguageModel(nn.Module):
  """Sum of three embeddings of the arg-max token ids, soft-maxed over the last dimension.

  Token id 2 (the "MASK" entry of the tokenizer vocabulary) is handed to the
  masking mechanism as its mask token.
  """

  def __init__(self,input,tokinizer,output):
    super().__init__()
    self.maskingMechanism=MaskStringVectorWithPorbebility(tokinizer,2)
    self.tokenEmbedding=nn.Embedding(input,output)
    self.positonalEmbedding=nn.Embedding(input,output)
    self.languageEmbedding=nn.Embedding(input,output)

  def forward(self,input):
    token_ids=input.argmax(-1)

    # NOTE(review): the masked copy is computed but never used below. The call
    # is kept because it advances the `random` module's state.
    self.maskingMechanism.maskSentance(token_ids)

    # NOTE(review): all three tables are indexed by token id, including the
    # positional one — confirm that is intended.
    token_part=self.tokenEmbedding(token_ids)
    positional_part=self.positonalEmbedding(token_ids)
    language_part=self.languageEmbedding(token_ids)

    return F.softmax(language_part+positional_part+token_part,-1)

###Attention Mechanism

In [None]:
class MultiHeadedAttention(nn.Module):
  """Attention sub-layer with a residual connection and layer normalisation."""

  def __init__(self,input_size,head_num,mask,hidden_size, output_size):
    super().__init__()
    self.attention=AttentionMechanism(input_size,head_num,mask,hidden_size, output_size)
    self.norm=nn.LayerNorm(output_size)

  def forward(self,input,encoder_output=None):
    """Return `norm(input + attention(input[, encoder_output]))`.

    `encoder_output` defaults to None in the attention mechanism as well, so it
    can be forwarded as-is whether or not the caller supplied it.
    """
    attended=self.attention(input,encoder_output)
    return self.norm(input+attended)


In [None]:
class AttentionMechanism(nn.Module):
  """Multi-head scaled dot-product attention.

  Args:
    input_size: width of the incoming vectors (queries and keys/values alike).
    head_num: number of heads; must divide `hidden_size`.
    mask: how attention logits are masked.
      * None — no masking.
      * "causal" — a position may only attend to itself and earlier positions.
      * a tensor broadcastable to the score matrix — positions holding 1 are
        suppressed by adding `mask * -1e9` to the logits.
      * any other value (e.g. the scalar 2 passed by EncoderBlock) — no
        masking. Adding one constant to every logit cannot change a softmax,
        and actually adding -2e9 in float32 absorbs the logits and leaves
        uniform weights, so the shift is skipped.
    hidden_size: total width of the query/key/value projections.
    output_size: width of the returned vectors.

  Raises:
    ValueError: if `hidden_size` is not divisible by `head_num`.
  """

  def __init__(self,input_size,head_num,mask,hidden_size, output_size):
    super().__init__()
    if hidden_size%head_num!=0:
      raise ValueError(f"hidden_size ({hidden_size}) must be divisible by head_num ({head_num})")

    self.head_num=head_num
    self.hidden_size=hidden_size
    self.mask=mask

    self.query=nn.Linear(input_size,hidden_size)
    self.key=nn.Linear(input_size,hidden_size)
    self.value=nn.Linear(input_size,hidden_size)

    self.output=nn.Linear(hidden_size,output_size)


  def forward(self,input,encoder_output=None):
    """Attend from `input` over itself, or over `encoder_output` when given.

    `input` is `(batch, query_len, input_size)`. `encoder_output`, if present,
    is `(batch, memory_len, input_size)` and may have a different length.
    Returns `(batch, query_len, output_size)`.
    """
    batch_size,query_len,_=input.shape

    memory=input if encoder_output is None else encoder_output
    memory_len=memory.shape[1]
    head_size=self.hidden_size//self.head_num

    # Split the projections into heads: (batch, heads, length, head_size).
    query=self.query(input).view(batch_size, query_len, self.head_num, head_size).transpose(1, 2)
    key=self.key(memory).view(batch_size, memory_len, self.head_num, head_size).transpose(1, 2)
    value=self.value(memory).view(batch_size, memory_len, self.head_num, head_size).transpose(1, 2)

    attn_value=self.calculateScaledDotProduct(query,key,value)

    # Merge the heads back: (batch, query_len, hidden_size).
    merged=attn_value.transpose(1, 2).contiguous().view(batch_size, query_len, self.hidden_size)

    return self.output(merged)


  def calculateScaledDotProduct(self,query,key,value):
    """Return `softmax(Q Kᵀ / sqrt(d_k)) V` for tensors shaped `(batch, heads, length, d_k)`."""
    # Q·Kᵀ gives one score per (query position, key position) pair.
    scores=self.calculateVectorSimilaritis(query,key.transpose(-2, -1))
    dk=key.shape[-1]
    scaled_attention_logits=self._applyMask(scores/math.sqrt(dk))
    attn_wight=F.softmax(scaled_attention_logits, dim=-1)

    return torch.matmul(attn_wight,value)

  def _applyMask(self,logits):
    """Apply the configured mask to `(batch, heads, query_len, key_len)` logits."""
    mask=self.mask
    if mask is None:
      return logits
    if isinstance(mask,str):
      if mask!="causal":
        raise ValueError(f"unknown mask mode {mask!r}")
      query_len,key_len=logits.shape[-2:]
      query_positions=torch.arange(query_len,device=logits.device).unsqueeze(-1)
      key_positions=torch.arange(key_len,device=logits.device)
      # True where the key lies in the future of the query.
      future=key_positions>query_positions
      return logits.masked_fill(future,float("-inf"))
    if torch.is_tensor(mask):
      return logits+mask.to(device=logits.device,dtype=logits.dtype)*-1e9
    # Legacy scalar flag: a uniform shift is a no-op under softmax.
    return logits

  def calculateVectorSimilaritis(self,query,key):
    """Batched matrix product of the two operands."""
    return torch.matmul(query,key)

###Seq2Seq

In [None]:
class Seq2Seq(nn.Module):
  """Encoder/decoder wrapper used for training (`forward`) and sampling (`generate_tokens`)."""

  def __init__(self, encoder, decoder, device, target_vocab_size):
    super().__init__()
    self.encoder=encoder
    self.decoder=decoder
    self.device=device
    self.target_vocab_size=target_vocab_size

  def forward(self,input:torch.LongTensor, target:torch.LongTensor,softmax=False):
    """Return the decoder output for `target` given `input`.

    With `softmax=True` a softmax over the last dimension is applied on top.
    """
    encoderOutput=self.encoder(input)

    decoderOutput=self.decoder(target,encoderOutput)

    if softmax==True:
      return F.softmax(decoderOutput,-1)
    return decoderOutput

  def generate_tokens(self, input, start_token, end_token, max_length=512):
    """Sample one output sequence per row of `input`.

    The target buffer starts as `[start_token, PAD, PAD, ...]` and is filled
    one position at a time by sampling from the decoder's distribution at that
    position. This matches the training loop in this file, which scores the
    decoder output at position t against target token t. A row stops changing
    once it has produced `end_token`; generation ends early when every row is
    finished.

    Args:
      input: `(batch, sequence)` tensor of source token ids.
      start_token: id placed at position 0 of every generated sequence.
      end_token: id that terminates a sequence.
      max_length: length of the generated sequences, start token included.

    Returns:
      A `(batch, max_length, target_vocab_size)` one-hot long tensor, so
      `.argmax(-1)` yields the generated token ids as before. Positions after
      the end token hold id 0 (PAD in this file's vocabulary).

    Raises:
      ValueError: if `max_length` is smaller than 1.
    """
    if max_length<1:
      raise ValueError("max_length must be at least 1")

    batch_size=input.shape[0]
    pad_token=0

    with torch.no_grad():
      encoder_output=self.encoder(input)

      target_sequence=torch.full((batch_size,max_length),pad_token,dtype=torch.long,device=input.device)
      target_sequence[:,0]=start_token
      finished=torch.zeros(batch_size,dtype=torch.bool,device=input.device)

      for token_index in range(1,max_length):
        decoder_output=self.decoder(target_sequence,encoder_output)

        next_token_probs=F.softmax(decoder_output[:, token_index, :],-1)
        # One draw per row; sampling the whole vocabulary would return a permutation.
        next_token_index=torch.multinomial(next_token_probs, num_samples=1).squeeze(-1)

        # Rows that already ended keep emitting PAD.
        next_token_index=next_token_index.masked_fill(finished,pad_token)
        target_sequence[:,token_index]=next_token_index

        finished=finished|(next_token_index==end_token)
        if bool(finished.all()):
          break

    return F.one_hot(target_sequence,num_classes=self.target_vocab_size)


# Vocabulary size after fitting; it sizes the embeddings and the output projections.
TOKINIZER_VOCAB=len(tokinizer)

encoder=Encoder(
  input_sequance_length=tokinizer.longets_string_len,
  head_num=HEAD_NUMBERS,
  vocabalary_size=TOKINIZER_VOCAB,
  hidden_size=HIDDEN_SIZE,
).to(device)
# encoder.load_state_dict(torch.load("/content/drive/MyDrive/preTrainedencoder_2.pth",map_location=device))

decoder=Decoder(
  input_sequance_length=tokinizer.longets_string_len,
  head_num=HEAD_NUMBERS,
  vocabalary_size=TOKINIZER_VOCAB,
  hidden_size=HIDDEN_SIZE,
).to(device)

# The wrapper holds the same encoder/decoder objects created above.
seq2seq=Seq2Seq(encoder=encoder,decoder=decoder,device=device,target_vocab_size=TOKINIZER_VOCAB).to(device)
# seq2seq.load_state_dict(torch.load("/content/drive/MyDrive/preTrainedTransformer_2.pth",map_location=device))
# seq2seq

In [None]:
def make_prediction(string:str):
  """Generate a reply for `string` with the global `seq2seq` model and return it as text.

  The prompt is cleaned with `string_preprocess` before encoding, the same way
  `promptDataset` cleans the training data. The start/end ids are looked up in
  the tokenizer vocabulary instead of being hard-coded.
  """
  input_sequence=tokinizer.encode(string_preprocess(string))
  input_sequence=torch.tensor([input_sequence])
  generated_tokens = seq2seq.generate_tokens(
    input_sequence.to(device),
    tokinizer.vocabolary["SOS"],
    tokinizer.vocabolary["EOS"],
    tokinizer.longets_string_len,
  ).argmax(-1)
  print(generated_tokens.shape)
  return tokinizer.decode(generated_tokens.squeeze(0).cpu())
pred=make_prediction("adolf")
print(pred)

torch.Size([1, 30])
smartest algonquian snoovatar intelligence borat yellow redirect puerto cyberpunk public indexing spun updated redditgifts Ssself block left. miracle woods now, usually coming constitution maya estimate campaign sam identify screen expectancy


##parameter count

In [None]:
def parametersCount(model):
  """Return the total number of elements in `model`'s parameters that require gradients."""
  trainable_total=0
  for parameter in model.parameters():
    if parameter.requires_grad:
      trainable_total+=parameter.numel()
  return trainable_total

# One line per model; the seq2seq total includes the encoder and decoder it wraps.
print(
  f'The seq2seq model has {parametersCount(seq2seq):,} trainable parameters',
  f'The encoder model has {parametersCount(encoder):,} trainable parameters',
  f'The decoder model has {parametersCount(decoder):,} trainable parameters',
  sep="\n",
)

The seq2seq model has 4,272,354 trainable parameters
The encoder model has 1,669,386 trainable parameters
The decoder model has 2,602,968 trainable parameters


##Optimizer & Loss


In [None]:
# Adam over every seq2seq parameter (the encoder is a sub-module, so it is included).
optimizer=torch.optim.Adam(params=seq2seq.parameters(),lr=1e-3)
# Index 0 is the PAD id in the tokenizer vocabulary, so padded positions do not contribute to the loss.
loss=nn.CrossEntropyLoss(ignore_index=0)

##Training Loops

###Training Util

In [None]:
class TrainingUtil():
  """Shared training driver: builds the data loaders and runs the epoch loop.

  Subclasses provide `trainingLoop()` and `testingLoop()`, generators that
  yield one `(loss, accuracy)` pair per batch. Constructing an instance starts
  training immediately.
  """

  def __init__(self, EPOCHS ,model,loss ,device ,savePath ,tokinizer, csvFilePath, testSplit):
    self.EPOCHS=EPOCHS
    self.model=model
    self.loss=loss
    self.device=device
    self.savePath=savePath
    self.tokinizer=tokinizer

    self.train_dataloader,self.test_dataloader=self.createDatasetFromPandasCsv(csvFilePath,testSplit)

    self.currentEpoch=0
    self.startPreTraining()

  def accuracy(self,predictions,targets):
    """Percentage of positions where `predictions` equals `targets` (padding included)."""
    assert predictions.shape == targets.shape, "Shapes of predictions and targets must match."

    num_correct = (predictions == targets).sum().item()

    total_samples = targets.numel()
    accuracy_value = num_correct / total_samples
    return accuracy_value*100

  def getLossAndAccuracy(self,prediction,target):
    """Return `(loss, accuracy%)` for `(batch, sequence, vocab)` scores against `(batch, sequence)` ids."""
    prediction=prediction.to(self.device)
    target=target.to(self.device).type(torch.int64)

    # The loss expects (N, classes) scores and (N,) class ids.
    prediction_loss=self.loss(prediction.view(-1,prediction.shape[-1]),target.view(-1))
    prediction_acc=self.accuracy(prediction.argmax(2),target)

    return prediction_loss,prediction_acc

  def createDatasetFromPandasCsv(self,csvFilePath,testSplit):
    """Build shuffled train/test loaders from the CSV with a random 80/20 split.

    NOTE(review): `testSplit` is accepted but not used; the split is fixed at
    80% train — confirm which fraction the argument is meant to control.
    """
    # Use the tokenizer handed to this instance rather than the module-level one.
    dataset=promptDataset(csvFilePath,self.tokinizer)
    print(len(dataset))
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    trainDataset, testDataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    train_dataloader=DataLoader(trainDataset,batch_size=BATCH_SIZE,shuffle=True)
    test_dataloader=DataLoader(testDataset,batch_size=BATCH_SIZE,shuffle=True)

    return train_dataloader, test_dataloader

  def make_prediction(self,input,target=None):
    """Run the model on `input` (and `target`, when given) after moving them to `self.device`."""
    input=input.to(self.device)
    if target is None:
      return self.model(input)
    target=target.to(self.device)
    return self.model(input,target)

  def _averageMetrics(self,batches):
    """Consume a `(loss, accuracy)` generator and return the mean of each as floats.

    Returns `(nan, nan)` when the generator yields nothing.
    """
    loss_total=0.0
    accuracy_total=0.0
    batch_count=0
    for batch_loss,batch_accuracy in batches:
      loss_total+=float(batch_loss)
      accuracy_total+=float(batch_accuracy)
      batch_count+=1

    if batch_count==0:
      return float("nan"),float("nan")
    return loss_total/batch_count,accuracy_total/batch_count

  def startPreTraining(self)->None:
    """Run `EPOCHS` epochs, saving the model and printing averaged metrics after each one.

    Each loop generator is run to exhaustion. Taking only its first item would
    train and evaluate on a single batch per epoch.
    """
    epochsToRun=self.EPOCHS+1
    for epoch in tqdm(range(1,epochsToRun)):
      self.currentEpoch=epoch

      train_loss,train_acc=self._averageMetrics(self.trainingLoop())
      test_loss,test_acc=self._averageMetrics(self.testingLoop())

      torch.save(self.model.state_dict(), f"/content/drive/MyDrive/{self.savePath}.pth")
      print(f"\n epoch: {epoch} | train_loss: {train_loss:.2f}, train_acc: {train_acc:.1f}% | test_loss: {test_loss:.2f}, test_acc: {test_acc:.1f}%")



###Training seq2seq

In [None]:
class TrainNN(TrainingUtil):
  """Trains the full seq2seq model on (input, target) pairs.

  NOTE(review): the decoder is fed the same `target` it is scored against,
  with no one-token shift — confirm this is intended.
  """

  def __init__(self,EPOCHS ,model,loss ,device ,savePath ,tokinizer, csvFilePath, testSplit):
    super().__init__(EPOCHS ,model,loss ,device ,savePath ,tokinizer, csvFilePath, testSplit)

  def trainingLoop(self):
    """Yield `(loss, accuracy)` after one optimiser step per training batch.

    Uses the module-level `optimizer`.
    """
    self.model.train()
    for input,target in self.train_dataloader:
      optimizer.zero_grad()
      prediction=self.make_prediction(input,target)

      train_loss,train_acc=self.getLossAndAccuracy(prediction,target)

      train_loss.backward()
      optimizer.step()
      yield train_loss,train_acc

  def testingLoop(self):
    """Yield `(loss, accuracy)` per test batch.

    Every 10th epoch the first batch's first sample is printed.
    """
    self.model.eval()
    show_sample=self.currentEpoch%10 == 0
    for batch_index,(input,target) in enumerate(self.test_dataloader):
      # Inference mode wraps only the computation. Yielding from inside the
      # context would leave it active in the caller while the generator is suspended.
      with torch.inference_mode():
        prediction=self.make_prediction(input,target)
        test_loss,test_acc=self.getLossAndAccuracy(prediction,target)

      if show_sample and batch_index==0:
        print("\ninput: ",self.tokinizer.decode(input[0]))
        print("\ntarget: ",self.tokinizer.decode(target[0]))
        print("\nprediction: ",self.tokinizer.decode(prediction[0].argmax(-1)))
      yield test_loss,test_acc

### Training Encoder

In [None]:
class TrainEncoder(TrainingUtil):
  """Pre-trains the encoder alone: its output is scored against its own input ids."""

  def __init__(self, EPOCHS ,model,loss ,device ,savePath ,tokinizer, csvFilePath, testSplit):
    super().__init__(EPOCHS ,model,loss ,device ,savePath ,tokinizer, csvFilePath, testSplit)

  def trainingLoop(self):
    """Yield `(loss, accuracy)` after one optimiser step per training batch.

    Uses the module-level `optimizer`. The dataset's target column is ignored.
    """
    self.model.train()
    for input,_ in self.train_dataloader:
      optimizer.zero_grad()
      prediction=self.make_prediction(input).type(torch.float32)
      train_loss,train_acc=self.getLossAndAccuracy(prediction,input)

      train_loss.backward()
      optimizer.step()
      yield train_loss,train_acc

  def testingLoop(self):
    """Yield `(loss, accuracy)` per test batch.

    Every 10th epoch the first batch's first sample is printed.
    """
    self.model.eval()
    show_sample=self.currentEpoch%10 == 0
    for batch_index,(input,_) in enumerate(self.test_dataloader):
      # Inference mode wraps only the computation. Yielding from inside the
      # context would leave it active in the caller while the generator is suspended.
      with torch.inference_mode():
        prediction=self.make_prediction(input).type(torch.float32)
        test_loss,test_acc=self.getLossAndAccuracy(prediction,input)

      if show_sample and batch_index==0:
        print("\ninput: ",self.tokinizer.decode(input[0]))
        print("\nprediction: ",self.tokinizer.decode(prediction[0].argmax(-1)))

      yield test_loss,test_acc

###Start Training

In [None]:
# Number of epochs handed to the trainer started in the next cell.
EPOCHS=1500
# Selects the run: "encoder" pre-trains the encoder, "seq2seq" pre-trains the
# full model, and any other value trains the full model on the comment dataset.
pretrain="encoder"

In [None]:
# Constructing a trainer starts training immediately (see TrainingUtil.__init__).
if pretrain=="encoder":
  # Encoder-only pre-training.
  TrainEncoder(
    EPOCHS=EPOCHS,
    model=encoder,
    loss=loss,
    device=device,
    savePath="preTrainedEncoder_2",
    tokinizer=tokinizer,
    csvFilePath="/content/drive/MyDrive/preTrainingData.csv",
    testSplit=.8,
  )
elif pretrain=="seq2seq":
  # Full-model pre-training on the same pre-training CSV.
  TrainNN(
    EPOCHS=EPOCHS,
    model=seq2seq,
    loss=loss,
    device=device,
    savePath="preTrainedSeq2seq_2",
    tokinizer=tokinizer,
    csvFilePath="/content/drive/MyDrive/preTrainingData.csv",
    testSplit=.8,
  )
else:
  # Full-model training on the comment dataset.
  TrainNN(
    EPOCHS=EPOCHS,
    model=seq2seq,
    loss=loss,
    device=device,
    savePath="trainedSeq2seq_2",
    tokinizer=tokinizer,
    csvFilePath="/content/drive/MyDrive/shitpostCommentData.csv",
    testSplit=.8,
  )


25690


  0%|          | 1/1500 [00:01<25:56,  1.04s/it]


 epoch: 1 | train_loss: 8.87, train_acc: 0.0% | test_loss: 8.66, test_acc: 0.0%


  0%|          | 2/1500 [00:02<25:50,  1.04s/it]


 epoch: 2 | train_loss: 8.66, train_acc: 0.0% | test_loss: 8.47, test_acc: 2.7%


  0%|          | 3/1500 [00:03<30:37,  1.23s/it]


 epoch: 3 | train_loss: 8.57, train_acc: 2.4% | test_loss: 8.39, test_acc: 2.8%


  0%|          | 4/1500 [00:05<34:17,  1.38s/it]


 epoch: 4 | train_loss: 8.44, train_acc: 3.3% | test_loss: 8.31, test_acc: 3.1%


  0%|          | 5/1500 [00:06<35:46,  1.44s/it]


 epoch: 5 | train_loss: 8.31, train_acc: 2.9% | test_loss: 8.19, test_acc: 7.2%


  0%|          | 6/1500 [00:08<37:34,  1.51s/it]


 epoch: 6 | train_loss: 8.20, train_acc: 6.8% | test_loss: 8.10, test_acc: 14.9%


  0%|          | 7/1500 [00:09<36:39,  1.47s/it]


 epoch: 7 | train_loss: 8.14, train_acc: 13.0% | test_loss: 8.10, test_acc: 12.6%


  1%|          | 8/1500 [00:10<33:17,  1.34s/it]


 epoch: 8 | train_loss: 8.06, train_acc: 15.3% | test_loss: 8.00, test_acc: 16.0%


  1%|          | 9/1500 [00:11<31:43,  1.28s/it]


 epoch: 9 | train_loss: 8.02, train_acc: 15.2% | test_loss: 7.96, test_acc: 17.0%


  1%|          | 10/1500 [00:13<31:09,  1.25s/it]


input:  party came for retaining her father as her campaign manager after his arrest on child sexual abuse charges she was

prediction:  yalta yalta sexuality months the heat months the coordinate and anti and this, and foul months the cultivating corresponds granted

 epoch: 10 | train_loss: 7.94, train_acc: 16.5% | test_loss: 7.89, test_acc: 17.4%


  1%|          | 11/1500 [00:14<30:59,  1.25s/it]


 epoch: 11 | train_loss: 7.95, train_acc: 19.0% | test_loss: 7.87, test_acc: 19.1%


  1%|          | 12/1500 [00:15<30:32,  1.23s/it]


 epoch: 12 | train_loss: 7.87, train_acc: 18.4% | test_loss: 7.84, test_acc: 19.2%


  1%|          | 13/1500 [00:16<29:08,  1.18s/it]


 epoch: 13 | train_loss: 7.81, train_acc: 19.9% | test_loss: 7.74, test_acc: 20.3%


  1%|          | 14/1500 [00:17<31:25,  1.27s/it]


 epoch: 14 | train_loss: 7.83, train_acc: 18.1% | test_loss: 7.78, test_acc: 20.6%





KeyboardInterrupt: ignored