In [1]:
import os
import tensorflow as tf
import string
from transformers import BertTokenizer, TFBertForMaskedLM, XLNetTokenizer, TFXLNetModel, TFAutoModelWithLMHead, AutoTokenizer, logging
logging.set_verbosity_error()

In [2]:
# Module-level defaults for the prediction settings. The driver cell replaces
# them with the values returned by set_model_config(); until then they hold
# usable values (binding them to the globals() dict would make
# int(no_words_to_be_predicted) raise TypeError).
no_words_to_be_predicted = 1  # how many candidate words to return
select_model = "bert"         # one of 'bert', 'gpt' or 'xlnet'
enter_input_text = ""         # text whose next word should be predicted

In [3]:
def set_model_config(**kwargs):
    """Echo the supplied settings and return them as a fixed-order tuple.

    Every keyword argument is printed as ``name = value``. Each setting is then
    looked up by its keyword name, so the order in which the caller writes the
    keyword arguments does not matter. For backward compatibility, a setting
    whose name is absent falls back to the value found at its usual position
    among the keyword arguments (first, second, third).

    Args:
        **kwargs: expected keys are
            ``no_words_to_be_predicted`` (integer number of words),
            ``select_model`` ('bert', 'gpt' or 'xlnet') and
            ``enter_input_text`` (any string).

    Returns:
        Tuple ``(no_words_to_be_predicted, select_model, enter_input_text)``.

    Raises:
        IndexError: if a setting is neither passed by name nor available at
            its fallback position.
    """
    for key, value in kwargs.items():
        print("{0} = {1}".format(key, value))

    positional_values = list(kwargs.values())

    def _setting(name, position):
        # Prefer the explicit keyword; only fall back to call order when absent.
        if name in kwargs:
            return kwargs[name]
        return positional_values[position]

    no_words_to_be_predicted = _setting("no_words_to_be_predicted", 0)
    select_model = _setting("select_model", 1)
    enter_input_text = _setting("enter_input_text", 2)
    return no_words_to_be_predicted, select_model, enter_input_text

In [4]:
def load_model(model_name):
    """Load the pretrained tokenizer/model pair for the requested model family.

    Args:
        model_name: 'bert' or 'gpt' (case-insensitive). Any other name falls
            through to the XLNet branch, as before.

    Returns:
        Tuple ``(tokenizer, model)``:
        'bert' -> BertTokenizer / TFBertForMaskedLM for 'bert-base-uncased',
        'gpt'  -> AutoTokenizer / TFAutoModelWithLMHead for 'gpt2',
        other  -> AutoTokenizer / TFAutoModelWithLMHead for 'xlnet-base-cased'.

    Raises:
        RuntimeError: if anything fails while loading. The original exception
            is chained as ``__cause__``. Failures are no longer swallowed:
            returning None here only made the caller's tuple unpacking fail
            with an unrelated message.
    """
    try:
        requested = model_name.lower()
        if requested == "bert":
            bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            bert_model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
            return bert_tokenizer, bert_model
        elif requested == "gpt":
            gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
            gpt_model = TFAutoModelWithLMHead.from_pretrained("gpt2")
            return gpt_tokenizer, gpt_model
        else:
            xlnet_tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
            xlnet_model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
            return xlnet_tokenizer, xlnet_model
    except Exception as e:
        raise RuntimeError(
            "Failed to load model {!r}: {}".format(model_name, e)
        ) from e

In [5]:
# bert encode
def encode_bert(tokenizer, text_sentence, add_special_tokens=True):
    """Encode a sentence containing a mask placeholder and locate the mask.

    Every '<mask>' placeholder is replaced by ``tokenizer.mask_token``. When the
    mask is the last whitespace-separated word, ' .' is appended so the model
    is not pushed towards predicting punctuation.

    Args:
        tokenizer: object exposing ``mask_token``, ``mask_token_id`` and
            ``encode(text, add_special_tokens=...)``.
        text_sentence: input text containing '<mask>' (or the tokenizer's own
            mask token).
        add_special_tokens: forwarded to ``tokenizer.encode``.

    Returns:
        Tuple ``(input_ids, mask_idx)``: a tensor holding one encoded sequence
        and the position of the first mask token within it.

    Raises:
        ValueError: if the text is empty/whitespace-only or the encoded
            sequence contains no mask token.
    """
    text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    words = text_sentence.split()
    if not words:
        # Indexing words[-1] on an empty list would raise a bare IndexError.
        raise ValueError("text_sentence is empty; it must contain a '<mask>' placeholder")
    # if the mask is the last token, append a "." so that models dont predict punctuation.
    if words[-1] == tokenizer.mask_token:
        text_sentence += ' .'
    input_ids = tf.convert_to_tensor([tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)])
    mask_positions = tf.where(input_ids == tokenizer.mask_token_id)
    if mask_positions.shape[0] == 0:
        raise ValueError("text_sentence contains no mask token: {!r}".format(text_sentence))
    # Each row is (batch index, sequence index); take the first match.
    mask_idx = mask_positions[0][1].numpy()
    return input_ids, mask_idx

In [6]:
# bert decode
def decode_bert(tokenizer, pred_idx, top_clean):
    """Decode predicted token ids into at most ``top_clean`` words.

    Each id is decoded with ``tokenizer.decode`` and stripped of internal
    whitespace. A token is skipped when it is empty, is exactly '[PAD]', or
    consists only of punctuation characters. The test is made per token; a
    substring test against ``string.punctuation + '[PAD]'`` would also discard
    real words such as 'A' or 'PAD'.

    Args:
        tokenizer: object exposing ``decode(token_id)``.
        pred_idx: iterable of token ids, best candidate first.
        top_clean: maximum number of words to keep.

    Returns:
        The kept words joined by newlines, with the '##' word-piece marker
        removed.
    """
    punctuation_chars = set(string.punctuation)
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if not token or token == '[PAD]':
            continue
        if all(ch in punctuation_chars for ch in token):
            continue
        tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])

In [7]:
def get_prediction_end_of_sentence(input_text, model_name):
    """Predict the word(s) that follow ``input_text`` with the chosen model.

    For 'bert' a ' <mask>' placeholder is appended to the text; the other
    models receive the text unchanged. The text actually sent to the model is
    printed, then the request is delegated to ``get_all_predictions`` with
    ``top_clean`` taken from the module-level ``no_words_to_be_predicted``.

    Args:
        input_text: the text to complete.
        model_name: 'bert', 'gpt' or 'xlnet' (case-insensitive).

    Returns:
        Whatever ``get_all_predictions`` returns for the chosen model.

    Raises:
        Any exception raised while predicting. Errors are no longer swallowed:
        returning None only deferred the failure to the caller, with a
        misleading message.
    """
    # Only the masked-LM path needs an explicit slot for the missing word.
    if model_name.lower() == "bert":
        input_text += ' <mask>'
    print(input_text)
    return get_all_predictions(input_text, model_name, top_clean=int(no_words_to_be_predicted))

In [8]:
def get_bert_predictions(text_sentence, top_clean):
    """Return the top BERT candidates for the masked position in a sentence.

    Uses the module-level ``bert_tokenizer`` and ``bert_model``.

    Args:
        text_sentence: text containing a '<mask>' placeholder.
        top_clean: number of words wanted in the result.

    Returns:
        ``{'bert': words}`` where ``words`` is the newline-joined output of
        ``decode_bert`` (at most ``top_clean`` words).
    """
    input_ids, mask_idx = encode_bert(bert_tokenizer, text_sentence)

    # Plain forward pass: nothing is differentiated here, so no GradientTape.
    outputs = bert_model(input_ids)
    predictions = outputs[0]  # presumably the per-position vocabulary scores

    mask_predictions = predictions[0, mask_idx]

    # decode_bert drops punctuation/padding and then truncates to top_clean.
    # Fetching exactly top_clean candidates could therefore yield fewer words
    # than requested, so over-fetch by a fixed margin (capped at the size of
    # the score vector).
    extra_candidates = 32
    k = min(top_clean + extra_candidates, int(mask_predictions.shape[-1]))
    top_tokens = tf.math.top_k(mask_predictions, k=k)
    top_tokens_ids = top_tokens.indices.numpy()

    return {
        'bert': decode_bert(bert_tokenizer, top_tokens_ids, top_clean)
    }

In [9]:
def get_gpt_predictions(text_sentence, top_clean):
    """Sample one-token continuations of ``text_sentence`` with the GPT model.

    Uses the module-level ``gpt_tokenizer`` and ``gpt_model``. ``generate`` is
    called with sampling enabled and asked for ``top_clean`` sequences, each
    one token longer than the prompt.

    Args:
        text_sentence: prompt text.
        top_clean: number of sequences to return; also used as ``top_k``.

    Returns:
        ``{'gpt': text}`` where ``text`` is the decoded continuations joined by
        newlines (special tokens skipped).
    """
    prompt_ids = gpt_tokenizer.encode(text_sentence, return_tensors='tf')
    prompt_length = prompt_ids.shape[1]

    generation_options = dict(
        max_length=prompt_length + 1,
        num_return_sequences=top_clean,
        do_sample=True,
        top_k=top_clean,
        top_p=0.95,
    )
    sequences = gpt_model.generate(prompt_ids, **generation_options)

    # Keep only what was generated after the prompt.
    continuations = []
    for sequence in sequences:
        new_tokens = sequence[prompt_length:]
        continuations.append(gpt_tokenizer.decode(new_tokens, skip_special_tokens=True))

    return {'gpt': '\n'.join(continuations)}

In [10]:
def get_xlnet_predictions(text_sentence, top_clean):
    """Sample one-token continuations of ``text_sentence`` with the XLNet model.

    Uses the module-level ``xlnet_tokenizer`` and ``xlnet_model``. ``generate``
    is called with sampling enabled and asked for ``top_clean`` sequences, each
    one token longer than the encoded prompt.

    Args:
        text_sentence: prompt text.
        top_clean: number of sequences to return; also used as ``top_k``.

    Returns:
        ``{'xlnet': text}`` where ``text`` is the decoded continuations joined
        by newlines (special tokens skipped).
    """
    prompt_ids = xlnet_tokenizer.encode(text_sentence, return_tensors='tf')
    prompt_length = prompt_ids.shape[1]

    generation_options = dict(
        max_length=prompt_length + 1,
        num_return_sequences=top_clean,
        do_sample=True,
        top_k=top_clean,
        top_p=0.95,
    )
    sequences = xlnet_model.generate(prompt_ids, **generation_options)

    # Keep only what was generated after the prompt.
    continuations = []
    for sequence in sequences:
        new_tokens = sequence[prompt_length:]
        continuations.append(xlnet_tokenizer.decode(new_tokens, skip_special_tokens=True))

    return {'xlnet': '\n'.join(continuations)}

In [11]:
def get_all_predictions(text_sentence, model_type, top_clean):
    """Route a prediction request to the helper for the requested model.

    Args:
        text_sentence: text to complete.
        model_type: 'bert', 'gpt' or 'xlnet' (case-insensitive).
        top_clean: number of predictions wanted.

    Returns:
        The dict returned by the matching ``get_*_predictions`` helper.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    kind = model_type.lower()

    # Reject unknown names first so the happy paths read as plain guards.
    if kind not in ('bert', 'gpt', 'xlnet'):
        raise ValueError(f"Unsupported model type: {model_type}")

    if kind == 'bert':
        return get_bert_predictions(text_sentence, top_clean)
    if kind == 'gpt':
        return get_gpt_predictions(text_sentence, top_clean)
    return get_xlnet_predictions(text_sentence, top_clean)

In [12]:
try:
    print("Next Word Prediction with TensorFlow using BERT, GPT, and XLNet")

    # Demo settings: predict one word after "He is" using BERT.
    no_words_to_be_predicted, select_model, enter_input_text = set_model_config(
        no_words_to_be_predicted=1,
        select_model="bert",
        enter_input_text="He is  ",
    )

    # The get_*_predictions helpers read these names as globals, so all three
    # tokenizer/model pairs are loaded regardless of the selected model.
    bert_tokenizer, bert_model = load_model("bert")
    gpt_tokenizer, gpt_model = load_model("gpt")
    xlnet_tokenizer, xlnet_model = load_model("xlnet")

    res = get_prediction_end_of_sentence(enter_input_text, select_model)
    print("result is: {}".format(res))

    # The result dict is keyed by model name; flatten the newline-separated
    # predictions of the selected model into one space-separated line.
    chosen_model = select_model.lower()
    if chosen_model == "bert":
        answer_bert = res['bert'].split("\n")
        answer_as_string_bert = " ".join(answer_bert)
        print("output answer is: {}".format(answer_as_string_bert))
    elif chosen_model == "gpt":
        answer_gpt = res['gpt'].split("\n")
        answer_as_string_gpt = " ".join(answer_gpt)
        print("output answer is: {}".format(answer_as_string_gpt))
    elif chosen_model == "xlnet":
        answer_xlnet = res['xlnet'].split("\n")
        answer_as_string_xlnet = " ".join(answer_xlnet)
        print("output answer is: {}".format(answer_as_string_xlnet))
except Exception as e:
    # Report the failure instead of leaving a traceback in the notebook.
    print('Some problem occurred:', str(e))

Next Word Prediction with TensorFlow using BERT, GPT, and XLNet
no_words_to_be_predicted = 1
select_model = bert
enter_input_text = He is  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/565M [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

He is   <mask>
result is: {'bert': 'dead'}
output answer is: dead


In [13]:
import tensorflow as tf
from transformers import TFBertForMaskedLM, BertTokenizer, BertConfig
from sklearn.model_selection import train_test_split
import numpy as np


In [14]:
# Load the text data
with open('1661-0.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text on '.' and trim each piece BEFORE applying the length filter.
# Filtering on the raw length lets whitespace-only pieces (e.g. runs of blank
# lines) through, and they then become empty "sentences" after strip().
sentences = [s.strip() for s in text.split('.')]
sentences = [s for s in sentences if len(s) > 10]


In [15]:
# One checkpoint name shared by the tokenizer, the config and the model that is
# fine-tuned in the cells below.
pretrained_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
model = TFBertForMaskedLM.from_pretrained(pretrained_name)


In [16]:
def create_masked_input(sentence, tokenizer, max_length=128):
    """Tokenize a sentence and mask one randomly chosen token.

    The sentence is padded/truncated to ``max_length``. One position is then
    replaced by ``tokenizer.mask_token_id``. The position is drawn from the
    tokens marked by the attention mask, excluding the first and last of them,
    so the mask does not land on padding. (Drawing over the whole padded length
    mostly hits padding when sentences are short.)

    Args:
        sentence: text to encode.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors that expose ``.numpy()``.
        max_length: padded/truncated sequence length.

    Returns:
        Dict with 1-D arrays:
        ``input_ids`` (one position masked), ``attention_mask`` and ``labels``
        (the unmasked ids at every position).

    Raises:
        ValueError: from ``np.random.randint`` when there is no position to
            choose from (e.g. ``max_length`` of 2 or less).
    """
    inputs = tokenizer(
        sentence,
        return_tensors='tf',
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    input_ids = inputs['input_ids'].numpy().squeeze()
    attention_mask = inputs['attention_mask'].numpy().squeeze()

    # Labels are the untouched ids, copied before the mask is written.
    labels = input_ids.copy()

    # Assumes the attention mask is 1 on real tokens and that the first and
    # last real tokens are special markers ([CLS]/[SEP]) - TODO confirm for
    # tokenizers other than BERT.
    real_token_count = int(attention_mask.sum())
    if real_token_count > 2:
        upper = real_token_count - 1      # exclusive bound: skips the last real token
    else:
        upper = len(input_ids) - 1        # nothing to mask: keep the old full range
    mask_idx = np.random.randint(1, upper)
    input_ids[mask_idx] = tokenizer.mask_token_id  # Replace with [MASK]

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Prepare the dataset
max_length = 128
# create_masked_input draws the mask position from NumPy's global RNG; seed it
# so a Restart & Run All rebuilds exactly the same masked dataset.
np.random.seed(42)
data = [create_masked_input(sentence, tokenizer, max_length) for sentence in sentences]


In [21]:
# Sequential 80/20 split: the first 80% of the examples train the model, the
# remainder is held out for validation (no shuffling).
train_size = int(len(data) * 0.8)
train_data, val_data = data[:train_size], data[train_size:]

def data_generator(data, batch_size=8):
    """Yield batches from ``data`` forever, restarting after each full pass.

    Args:
        data: sequence of dicts with 'input_ids', 'attention_mask' and
            'labels' entries.
        batch_size: number of examples per batch; the last batch of a pass may
            be smaller.

    Yields:
        ``((input_ids, attention_mask), labels)`` as NumPy arrays stacked
        along a new first axis.

    Raises:
        ValueError: on the first ``next()`` if ``data`` is empty or
            ``batch_size`` is not positive. Without this check the endless
            loop would spin without ever yielding.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if len(data) == 0:
        raise ValueError("data is empty; there is nothing to batch")

    while True:  # Loop infinitely to avoid running out of data
        for start in range(0, len(data), batch_size):
            batch = data[start:start + batch_size]
            input_ids = np.array([d['input_ids'] for d in batch])
            attention_mask = np.array([d['attention_mask'] for d in batch])
            labels = np.array([d['labels'] for d in batch])
            yield (input_ids, attention_mask), labels



In [22]:
# Fine-tuning setup: Adam with a small step size, and a sparse categorical
# cross-entropy computed directly on the model's raw scores (from_logits=True).
learning_rate = 3e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(loss=loss, optimizer=optimizer)


In [None]:
batch_size = 8
# The generators never stop, so fit() must be told how many batches make up
# one pass over each split.
train_steps = len(train_data) // batch_size
val_steps = len(val_data) // batch_size

# Keep the object returned by fit(): plot_training_history(history) further
# down reads the per-epoch losses from it, and would raise NameError on a
# fresh kernel if the result were discarded.
history = model.fit(
    x=data_generator(train_data, batch_size),
    validation_data=data_generator(val_data, batch_size),
    steps_per_epoch=train_steps,
    validation_steps=val_steps,
    epochs=3
)


Epoch 1/3
Epoch 2/3
Epoch 3/3

In [26]:
def predict_next_word(sentence):
    """Predict the word that follows ``sentence`` with the fine-tuned model.

    A mask token followed by ' .' is appended to the sentence. The trailing
    period keeps the mask from being the final token, a position where the
    model tends to predict punctuation rather than a word.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: text whose next word should be predicted.

    Returns:
        The decoded token with the highest score at the mask position.
    """
    # Add a mask token (plus a closing period) after the input sentence
    sentence_with_mask = sentence + ' ' + tokenizer.mask_token + ' .'
    inputs = tokenizer(sentence_with_mask, return_tensors='tf')

    # Get the input IDs and attention mask
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Run the model to get the logits
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

    # Locate the mask by id rather than assuming a fixed position
    mask_token_index = tf.where(input_ids == tokenizer.mask_token_id)[0, 1]
    mask_logits = logits[0, mask_token_index, :]

    # Get the predicted token index
    predicted_index = tf.argmax(mask_logits, axis=-1).numpy()
    predicted_token = tokenizer.decode([predicted_index])

    return predicted_token

# Example usage
print(predict_next_word("The book"))



“


In [None]:
import matplotlib.pyplot as plt

# Plot training and validation loss
def plot_training_history(history):
    """Draw the per-epoch training loss, plus validation loss when recorded.

    Args:
        history: object whose ``history`` attribute is a dict containing a
            'loss' series and, optionally, a 'val_loss' series.
    """
    losses = history.history
    _, ax = plt.subplots(figsize=(10, 5))

    # Training loss is always expected; validation loss only if it was tracked.
    ax.plot(losses['loss'], label='Training Loss', marker='o')
    if 'val_loss' in losses:
        ax.plot(losses['val_loss'], label='Validation Loss', marker='o')

    # Title, axis labels, legend and grid
    ax.set_title('Training and Validation Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True)

    plt.show()

# Call the function to plot the graph
plot_training_history(history)