In [100]:
from datasets import load_dataset
import tensorflow as tf
import numpy as np
from tokenizers import Tokenizer
from tokenizers import normalizers
from tokenizers.models import BPE
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFD, StripAccents
from sklearn.model_selection import train_test_split
import os

In [101]:
def get_all_sentences(ds):
    """Lazily yield every item of ``ds`` in index order.

    ``ds`` must support ``len()`` and integer indexing. Items are produced
    one at a time as ``ds[0]``, ``ds[1]``, ... up to ``ds[len(ds) - 1]``.
    """
    cursor = 0
    total = len(ds)  # length is read once, when iteration starts
    while cursor < total:
        yield ds[cursor]
        cursor += 1

In [102]:
def build_or_get_tokenizer(ds,path):
    """Return a word-level tokenizer, reusing the file at ``path`` if it exists.

    When nothing exists at ``path``, a ``WordLevel`` tokenizer with a
    ``Whitespace`` pre-tokenizer is trained on the items produced by
    ``get_all_sentences(ds)`` and then saved to ``path``. Otherwise the
    tokenizer is loaded from that file and ``ds`` is not used.
    """
    # A previously saved tokenizer takes precedence over retraining.
    if os.path.exists(path):
        return Tokenizer.from_file(path)

    word_tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    word_tokenizer.pre_tokenizer = Whitespace()
    word_trainer = WordLevelTrainer(special_tokens=["[UNK]","[PAD]","[SOS]","[EOS]"])
    word_tokenizer.train_from_iterator(get_all_sentences(ds), trainer=word_trainer)
    # Persist the trained tokenizer so later calls can load it instead of retraining.
    word_tokenizer.save(path)
    return word_tokenizer

In [103]:
def get_tokenizer(ds,path):
    """Return a BPE tokenizer, reusing the file at ``path`` if it exists.

    When nothing exists at ``path``, a ``BPE`` tokenizer is configured with
    NFD normalisation plus accent stripping and a ``Whitespace``
    pre-tokenizer, trained on the items produced by
    ``get_all_sentences(ds)`` with a vocabulary size of 30000, and saved to
    ``path``. Otherwise the tokenizer is loaded from that file and ``ds`` is
    not used.
    """
    # A previously saved tokenizer takes precedence over retraining.
    if os.path.exists(path):
        return Tokenizer.from_file(path)

    bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    bpe_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bpe_tokenizer.pre_tokenizer = Whitespace()
    bpe_trainer = BpeTrainer(
        special_tokens=["[UNK]","[PAD]","[SOS]","[EOS]"],
        vocab_size=30000,
    )
    bpe_tokenizer.train_from_iterator(get_all_sentences(ds), trainer=bpe_trainer)
    # Persist the trained tokenizer so later calls can load it instead of retraining.
    bpe_tokenizer.save(path)
    return bpe_tokenizer
    

In [104]:
def filter_data(tokenizer,input,output,input_size,output_size):
    """Tokenize paired examples and keep only those that fit the size limits.

    Each prompt ``input[i]`` is encoded as ``"[SOS] " + prompt + " [EOS]"``
    and each response ``output[i]`` as ``response + " [EOS]"``. A pair is
    kept only when the prompt has at most ``input_size`` ids and the
    response has at most ``output_size`` ids.

    Returns
    -------
    tuple of (list, list)
        The id sequences of the kept prompts and of the kept responses, in
        their original order and index-aligned with each other.
    """
    kept_prompts, kept_responses = [], []

    for idx in range(len(input)):
        prompt_ids = tokenizer.encode("[SOS] "+input[idx]+" [EOS]").ids
        response_ids = tokenizer.encode(output[idx]+" [EOS]").ids

        # Drop the pair as soon as either side exceeds its budget.
        if len(prompt_ids) > input_size or len(response_ids) > output_size:
            continue

        kept_prompts.append(prompt_ids)
        kept_responses.append(response_ids)

    return kept_prompts, kept_responses

In [105]:
def padding_mapper(x,y,sos,input_size,output_size):
    """Right-pad one example to fixed lengths and build its two masks.

    Parameters
    ----------
    x : sequence
        Encoder token ids; padded up to ``input_size``.
    y : sequence
        Target token ids; padded up to ``output_size``.
    sos : sequence
        Decoder-input token ids; padded up to ``output_size``.
    input_size, output_size : int
        Target lengths. Sequences already longer than their target are left
        at their own length (padding is clamped at zero, nothing is cut).

    Returns
    -------
    tuple
        ``((encoder_input, encoder_mask, decoder_input, decoder_mask),
        decoder_output)``. Padded positions hold the value 1. Each mask has
        shape ``(1, 1, size)`` with 1 on the leading ``len(x)`` (encoder) or
        ``len(sos)`` (decoder) positions and 0 elsewhere.
    """
    def _pad_right(ids, target_len):
        # Append the pad value 1 until `ids` reaches `target_len`; never truncates.
        missing = max(0, target_len-len(ids))
        return tf.pad(ids, np.array([[0, missing]]), constant_values=1)

    def _prefix_mask(valid_len, total_len):
        # Integer mask: 1 for positions below `valid_len`, 0 afterwards.
        flags = (np.arange(total_len) < valid_len).astype(int)
        return flags.reshape(1, 1, total_len)

    # Encoder side
    encoder_input = _pad_right(x, input_size)
    encoder_mask = _prefix_mask(len(x), input_size)

    # Decoder side: `sos` feeds the decoder, `y` is the target
    decoder_input = _pad_right(sos, output_size)
    decoder_output = _pad_right(y, output_size)
    decoder_mask = _prefix_mask(len(sos), output_size)

    return ((encoder_input,encoder_mask,decoder_input,decoder_mask),decoder_output)

In [106]:
def get_ds(data_path,tokenizer_path,input_size,output_size,split='train',key='INSTRUCTION',value='RESPONSE',get_tokenization=get_tokenizer):
    """Load a dataset, obtain a tokenizer for it, and return the filtered token ids.

    Parameters
    ----------
    data_path : passed to ``load_dataset``.
    tokenizer_path : file used by ``get_tokenization`` to load or save the tokenizer.
    input_size, output_size : length limits forwarded to ``filter_data``.
    split : which split of the loaded dataset to read.
    key, value : column names holding the prompts and the responses.
    get_tokenization : callable ``(data, path) -> tokenizer``.

    Returns
    -------
    tuple
        ``(input_ids, output_ids, tokenizer, count)`` where ``count`` is the
        number of examples that passed the length filter.
    """
    dataset_split = load_dataset(data_path)[split]
    prompts = dataset_split[key]
    responses = dataset_split[value]

    # The tokenizer sees prompts and responses together, wrapped as a single batch.
    tokenizer = get_tokenization([prompts+responses], tokenizer_path)

    # Tokenize every pair and drop the ones that exceed the size limits.
    prompt_ids, response_ids = filter_data(tokenizer, prompts, responses, input_size, output_size)

    return prompt_ids, response_ids, tokenizer, len(prompt_ids)

In [107]:
def data_loader(input,output,input_size,output_size,index=0):
    """Endlessly yield batches built from growing prefixes of each output.

    For every example ``i`` (starting at ``index`` on every pass) the output
    sequence is walked in 15 windows that start 20 tokens apart. Inside a
    window up to 5 rows are built; row ``count`` uses the first
    ``j*20 + count*4`` output tokens as decoder input and the first
    ``j*20 + (count+1)*4`` output tokens as the target. A row is only built
    when the output is at least as long as its target prefix. Rows are padded
    by ``padding_mapper`` and stacked with ``np.array``.

    Parameters
    ----------
    input, output : index-aligned sequences of token-id sequences.
    input_size, output_size : padded lengths forwarded to ``padding_mapper``.
    index : first example index used on each pass over the data.

    Yields
    ------
    tuple
        ``((encoder_inputs, encoder_masks, decoder_inputs, decoder_masks),
        decoder_outputs)``, each a NumPy array stacking 1 to 5 rows.

    Notes
    -----
    The generator cycles over the data indefinitely. If a complete pass
    produces no batch at all (no examples, ``index`` past the end, or every
    output shorter than 4 tokens) it stops instead of looping forever
    without yielding, which would hang the consumer.
    """
    windows_per_example = 15   # windows walked over each output sequence
    window_stride = 20         # tokens between the starts of two windows
    rows_per_window = 5        # maximum number of rows in one yielded batch
    tokens_per_row = 4         # how far the target extends past the decoder input
    # Upper limit for the target prefix. With the constants above the largest
    # prefix is 14*20 + 5*4 = 300, so this limit is currently never reached.
    max_predict_len = 350

    while True:
        yielded_any = False

        for i in range(index,len(input)):
            for j in range(windows_per_example):
                encoder_input_list=[]
                encoder_mask_list=[]
                decoder_input_list=[]
                decoder_mask_list=[]
                decoder_output_list=[]

                for count in range(rows_per_window):
                    knowledge_bound = j*window_stride + count*tokens_per_row
                    predict_bound = min(knowledge_bound + tokens_per_row, max_predict_len)

                    # Skip rows whose target would run past the end of this output.
                    if predict_bound > len(output[i]):
                        continue

                    (encoder_input,encoder_mask,decoder_input,decoder_mask),decoder_output = padding_mapper(
                        input[i],
                        output[i][:predict_bound],
                        output[i][:knowledge_bound],
                        input_size,
                        output_size,
                    )
                    encoder_input_list.append(np.array(encoder_input))
                    encoder_mask_list.append(np.array(encoder_mask))
                    decoder_input_list.append(np.array(decoder_input))
                    decoder_mask_list.append(np.array(decoder_mask))
                    decoder_output_list.append(np.array(decoder_output))

                # Windows that produced no row are silently skipped.
                if encoder_input_list:
                    yielded_any = True
                    yield (
                        (
                            np.array(encoder_input_list),
                            np.array(encoder_mask_list),
                            np.array(decoder_input_list),
                            np.array(decoder_mask_list),
                        ),
                        np.array(decoder_output_list),
                    )

        # Nothing usable in a whole pass: another pass would be identical, so
        # end the generator rather than spin forever.
        if not yielded_any:
            return

In [113]:
# Tokenizer cache file.
# NOTE(review): the "{0}" placeholder is never formatted by the code in this
# notebook, so the file name on disk literally contains "{0}" - confirm intent.
path=r"C:\Users\Sagar\Python files_Jupiter\Git Repo Local\GAN Poetry\tokenizers\tokenizer_{0}.json"

# Load the poem dataset, build or load the tokenizer, and keep the examples
# that fit within 50 prompt tokens and 300 response tokens.
input,output,tokenizer,length = get_ds(
    data_path="iamketan25/poem-instructions-dataset",
    tokenizer_path=path,
    input_size=50,
    output_size=300,
    split='train',
    key='prompt',
    value='chosen',
)
# Alternative dataset (uses the default column names):
# input,output,tokenizer,length = get_ds("checkai/instruction-poems",path,50,200)

Found cached dataset parquet (C:/Users/Sagar/.cache/huggingface/datasets/iamketan25___parquet/iamketan25--poem-instructions-dataset-8ab844a537ba1e14/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [114]:
# Number of examples kept by the length filter (fourth value returned by get_ds).
length

436

In [95]:
# Hold out 5% of the filtered examples for evaluation; the fixed random_state
# keeps the shuffled split the same across runs.
X_train,X_test,Y_train,Y_test=train_test_split(
    input,
    output,
    test_size=0.05,
    shuffle=True,
    random_state=1,
)

# Endless batch generators over each partition (prompts padded to 50, outputs to 350).
train=data_loader(input=X_train,output=Y_train,input_size=50,output_size=350)
test=data_loader(input=X_test,output=Y_test,input_size=50,output_size=350)

In [16]:
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves the model and logs sample predictions every epoch.

    At the end of each epoch ``training_model`` is saved as
    ``<results_dir>/model_<epoch>.h5``. If evaluation data and a tokenizer
    were supplied, predictions for a limited number of evaluation batches are
    decoded and written, one text file per row, under ``<results_dir>/logs``.

    Parameters
    ----------
    training_model : object with a ``save(path)`` method; saved each epoch.
    eval_X, eval_Y : evaluation inputs / outputs passed to ``data_loader``.
        Evaluation is skipped unless both are given together with ``tokenizer``.
    tokenizer : object with a ``decode(ids)`` method.
    input_size, output_size : padded lengths passed to ``data_loader``.
    results_dir : directory prefix for saved models and logs. The directory
        (and its ``logs`` sub-directory) is not created here.
    max_eval_batches : evaluation stops once this many batches were logged
        (at least one batch is always processed).
    """

    def __init__(self,training_model,eval_X=None,eval_Y=None,tokenizer=None,input_size=50,output_size=350,
                 results_dir="C:/Users/Sagar/Python files_Jupiter/Git Repo Local/GAN Poetry/results",
                 max_eval_batches=35):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.training_model=training_model
        self.eval_X=eval_X
        self.eval_Y=eval_Y
        self.tokenizer=tokenizer
        self.input_size=input_size
        self.output_size=output_size
        self.results_dir=results_dir
        self.max_eval_batches=max_eval_batches

    def on_epoch_end(self,epoch,logs=None):
        """Save the model for this epoch and, when possible, log sample predictions."""
        # `logs` is unused; it defaults to None to avoid a shared mutable default.
        self.training_model.save(f"{self.results_dir}/model_{epoch}.h5")
        if (self.eval_X is not None) and (self.eval_Y is not None) and (self.tokenizer is not None):
            # The method is named `model_evaluation`; the previous call to the
            # non-existent `model_evaluate` raised AttributeError.
            self.model_evaluation(epoch)

    def model_evaluation(self,epoch=0):
        """Write decoded predictions and targets for up to ``max_eval_batches`` batches.

        Each row of each batch goes to
        ``<results_dir>/logs/transformer_log_epoch_<epoch>_<batch>_<row>.txt``.
        """
        count=0
        for item in data_loader(self.eval_X,self.eval_Y,self.input_size,self.output_size):
            # NOTE(review): prediction uses `self.model` (the attribute Keras
            # callbacks expose), whereas saving uses `self.training_model` -
            # confirm these are meant to be different objects.
            y_predict=self.model.predict(item[0])
            # Most likely token id per position, converted to a NumPy array.
            y_predict=np.asarray(tf.math.argmax(y_predict,axis=-1))
            for i in range(len(y_predict)):
                # The tokenizer is given plain Python lists of ints rather
                # than tensor / ndarray rows.
                y_hat=self.tokenizer.decode(y_predict[i].tolist())
                y=self.tokenizer.decode(np.asarray(item[1][i]).tolist())
                # Explicit UTF-8: the decoded text can contain characters that
                # the platform default encoding cannot represent.
                with open(f"{self.results_dir}/logs/transformer_log_epoch_{epoch}_{count}_{i}.txt","w",encoding="utf-8") as text_file:
                    print(f"y_hat:\n{y_hat}\ny:\n{y}",file=text_file)
            count+=1
            # `data_loader` cycles endlessly, so the loop needs an explicit stop.
            if count>=self.max_eval_batches:
                break

In [99]:
# Sanity check: turn the token ids of the first entry in `output` back into text.
tokenizer.decode(output[0])

'I \' ve written a poem for you about Living , Death , The Body , Nature . The title is "[ ready to receive remains . . .]": ready to receive remains built for death , ready to receive the flat ly desolate super ficial deeply commission ed intellectual offer of suggest ive actions , for the hunger assassin to fall back on and become force full psych ological damage , bottled for drink able agitation . riding a back seat writing construction , contest ing the oncoming molten universe , immersed in villagers , city dwell ers , trembling , laughing , ( white teeth red one for the perfect test of time ), to inhale flesh and stone from long ago , forgetting the horrors of holy oil inf usion clocks and gritty body galleries , leaving behind the mourning river ’ s crimson fragrance smoldering from the previous unbearable fever . in a posture of myself on a speeding body , without hands and feet , I am ready to receive the vomit of consciousness and proceed down the avenues of suggestion to be