In [1]:
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        trainer = BpeTrainer(vocab_size=50000, show_progress=True, inital_alphabet=ByteLevel.alphabet(), special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>"
        ])
        self.tokenizer.train(trainer, paths)

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)

In [2]:
from pathlib import Path
import os
# the folder 'text' contains all the files
paths = [str(x) for x in Path("./dataset/").glob("**/*.txt")]
tokenizer = BPE_token()
# train the tokenizer model
tokenizer.bpe_train(paths)
# saving the tokenized data in our specified folder 
save_path = 'tokenized_data'
tokenizer.save_tokenizer(save_path)

#Model initialization

In [None]:
import tensorflow as tf
import torch.optim.lr_scheduler
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
# loading tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)
# creating the model
model = TFGPT2LMHeadModel(config)

KeyboardInterrupt: 

In [None]:
single_string = ''
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
   x = f.read()
  single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

In [None]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [None]:
num_epoch = 5

checkpoint_filepath = '/content/check/chec2'
history = model.fit(dataset, epochs=num_epoch)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


NotFoundError: ignored

In [None]:
text = "Necesito 3 cosas"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 240,
  num_beams = 5,
  temperature = 0.7,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(beam_output[0]))

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


Necesito 3 cosas de la gente. "
"Conferencia de prensa matutina, desde Palacio Nacional. Conferencia de México. 

En la Ciudad de los mexicanos, a la corrupción, en el pueblo.

El pueblo y a los pueblos del pueblo, el derecho a las pueblos y la salud de todos los más y en la construcción de las país, pero es el país. Es la transformación, no es la democracia, la justicia, que se lo que no se pueblo de nuestro país y el presidente de manera. Les en lasCOVID19. En el compromiso de nuestros pueblos, con la prensa en vivo. Se la vida. Hoy los los que el bienestar. El pueblo del presidente del Bienestar, como la Conferencia matutina. La pueblo es un pueblo en México, es los es una prensa, lo
Les comparto el desarrollo del gobierno, Sonora. No en losCOVID que lo más la la política y los trabajadores de Estados Nacional, presidente pueblos de #COVID, nos es que la República, seCOVID
La salud, los derechos con el gobierno es las México y no no lo los la Guardia Nacional y con los pueblo para e

In [None]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os
output_dir = '/content/check'
# creating directory if it is not present
if not os.path.exists(output_dir):
  os.mkdir(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# save model and model configs
model.save_pretrained(output_dir)
model_to_save.config.to_json_file(output_config_file)
# save tokenizer
tokenizer.save_pretrained(output_dir)

('/content/check/tokenizer_config.json',
 '/content/check/special_tokens_map.json',
 '/content/check/vocab.json',
 '/content/check/merges.txt',
 '/content/check/added_tokens.json')

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model  =TFGPT2LMHeadModel.from_pretrained(output_dir)

In [None]:
!pip --version