In [1]:
# !pip install datasets

In [2]:
import sys
sys.path.append('../src')
from transformer_encoder import TransformerEncoderV3
import seaborn as sns
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizer
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
from tensorflow.keras.models import Model

2024-05-10 12:02:47.872725: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-10 12:02:47.900092: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-10 12:02:48.674048: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-10 12:02:51.489027: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def mask_tokens(inputs, vocab_size, mask_token_id=103, mlm_probability=0.15,
                special_token_ids=(0, 101, 102)):
    """Apply BERT-style masked-language-model corruption to a batch of token IDs.

    Each eligible position is selected independently with probability
    ``mlm_probability``. A selected position is then replaced by
    ``mask_token_id`` 80% of the time, by a uniformly random ID in
    ``[0, vocab_size)`` 10% of the time, and left unchanged 10% of the time.

    Args:
        inputs: Token IDs (any shape); cast to ``tf.int32``.
        vocab_size: Exclusive upper bound for randomly drawn replacement IDs.
        mask_token_id: ID written at positions that receive the mask token.
        mlm_probability: Probability of selecting an eligible position.
        special_token_ids: Iterable of IDs that must never be selected
            (defaults match the [PAD]/[CLS]/[SEP] IDs of the BERT uncased
            vocabulary, like the default ``mask_token_id``). Pass ``None`` or
            an empty tuple to make every position eligible.

    Returns:
        Tuple ``(outputs, mask_positions, labels)``, all ``tf.int32`` with the
        shape of ``inputs``: the corrupted IDs, a 0/1 indicator of selected
        positions, and the original IDs at selected positions (-1 elsewhere).
    """
    # Ensure inputs are in the correct dtype, typically tf.int32 for token IDs
    inputs = tf.cast(inputs, tf.int32)
    mask_token_id = tf.constant(mask_token_id, dtype=tf.int32)
    shape = tf.shape(inputs)

    # Randomly choose the token positions to be corrupted
    rand = tf.random.uniform(shape=shape, dtype=tf.float32)
    selected = tf.less(rand, mlm_probability)

    # Special tokens carry structural meaning and must never be corrupted
    if special_token_ids is not None:
        for special_id in special_token_ids:
            selected = tf.logical_and(selected, tf.not_equal(inputs, special_id))
    mask_positions = tf.cast(selected, dtype=tf.int32)

    # A second, independent draw decides which corruption strategy applies
    mask_types = tf.random.uniform(shape=shape, minval=0, maxval=1, dtype=tf.float32)

    inputs_masked = tf.where(
        tf.less(mask_types, 0.8),  # 80% - Mask token
        tf.fill(shape, mask_token_id),  # tf.fill keeps the int32 dtype
        tf.where(
            tf.less(mask_types, 0.9),  # 10% - Replace with a random token
            tf.random.uniform(shape=shape, minval=0, maxval=vocab_size, dtype=tf.int32),
            inputs  # 10% - Leave unchanged
        )
    )

    # Only replace the positions that were selected for masking
    outputs = tf.where(selected, inputs_masked, inputs)

    # Labels hold the original ID at selected positions, -1 everywhere else
    labels = tf.where(selected, inputs, -1)

    return outputs, mask_positions, labels



# Demo: run the masking routine on one short, hand-written sequence
vocab_size = 30522  # Example vocabulary size for BERT
tokens = tf.constant([[101, 1024, 768, 205, 1996, 3849, 102]])  # one sequence of example token IDs
masked_tokens, mask_positions, labels = mask_tokens(tokens, vocab_size)
# Show the three returned tensors, one per line
print(masked_tokens, mask_positions, labels, sep="\n")

tf.Tensor([[ 101 1024  768  205  103 3849  102]], shape=(1, 7), dtype=int32)
tf.Tensor([[0 0 0 0 1 0 0]], shape=(1, 7), dtype=int32)
tf.Tensor([[  -1   -1   -1   -1 1996   -1   -1]], shape=(1, 7), dtype=int32)


In [4]:
def prepare_nsp_data(sentences, tokenizer, max_length=128):
    """Build next-sentence-prediction (NSP) examples from an ordered sentence list.

    For every adjacent pair ``(sentences[i], sentences[i + 1])`` a positive
    example (label 1) is emitted, followed by a negative example (label 0)
    that pairs ``sentences[i]`` with a randomly chosen sentence that is
    neither ``sentences[i]`` itself nor its true successor.

    Args:
        sentences: Ordered sequence of sentences.
        tokenizer: Object exposing ``encode_plus(first, second, max_length=...,
            truncation=...)`` that returns a mapping with an ``'input_ids'`` entry.
        max_length: Maximum encoded length forwarded to the tokenizer.

    Returns:
        Tuple ``(examples, labels)``: the ``'input_ids'`` of each encoded pair
        and the matching 1/0 labels. When fewer than three sentences are
        given no valid negative exists, so only positives are returned.
    """
    examples = []
    labels = []
    num_sentences = len(sentences)

    def _encode(first, second):
        # Encode a sentence pair and keep only its token IDs
        encoded = tokenizer.encode_plus(first, second, max_length=max_length, truncation=True)
        return encoded['input_ids']

    for i in range(num_sentences - 1):
        # Positive example (consecutive sentences)
        examples.append(_encode(sentences[i], sentences[i + 1]))
        labels.append(1)

        # A negative needs a sentence other than i and i + 1
        if num_sentences < 3:
            continue

        # Draw uniformly from the num_sentences - 2 allowed indices, then
        # shift draws at or past i by two so i and i + 1 are skipped.
        random_index = int(np.random.randint(0, num_sentences - 2))
        if random_index >= i:
            random_index += 2

        # Negative example (guaranteed non-consecutive, non-identical index)
        examples.append(_encode(sentences[i], sentences[random_index]))
        labels.append(0)

    return examples, labels


In [6]:


# # Load tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Example sentences (simulating extracted sentences from larger documents)
# sentences = ["This is the first sentence.", "This is the second sentence.", "This is an unrelated sentence."]

# # Prepare NSP data
# nsp_examples, nsp_labels = prepare_nsp_data(sentences, tokenizer)

# # Masking tokens in the examples (assuming we flatten examples for simplicity)
# flat_examples = np.concatenate(nsp_examples)
# masked_inputs, _, _ = mask_tokens(flat_examples, vocab_size=tokenizer.vocab_size)



In [None]:
import random

def prepare_mlm_and_nsp_data(texts, tokenizer, mlm_probability=0.15, max_length=128):
    """Turn raw texts into padded sentence-pair tensors with NSP labels.

    Each text is split on '.', and for every adjacent sentence pair either the
    true next sentence (label 1) or a different sentence from the same text
    (label 0) is chosen with equal probability. Pairs are encoded with the
    tokenizer, padded/truncated to ``max_length``.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``encode_plus`` that returns ``'input_ids'``,
            ``'token_type_ids'`` and ``'attention_mask'`` entries.
        mlm_probability: Currently unused by this function; kept so existing
            callers that pass it keep working. No token masking happens here.
        max_length: Length every encoded pair is padded/truncated to.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_masks, nsp_labels)`` of
        ``tf.int32`` tensors.
    """
    examples = []

    for text in texts:
        # Drop the empty / whitespace-only fragments that split('.') leaves
        # behind (e.g. after the final period) so they never form a pair.
        sentences = [fragment.strip() for fragment in text.split('.')]
        sentences = [sentence for sentence in sentences if sentence]
        num_sentences = len(sentences)

        for i in range(num_sentences - 1):
            # With fewer than three sentences there is no valid negative
            # target, so fall back to the positive pair.
            if random.random() < 0.5 or num_sentences < 3:
                examples.append((sentences[i], sentences[i + 1], 1))
            else:
                # Pick among indices other than i and i + 1: draw from the
                # num_sentences - 2 allowed slots and shift past the pair.
                target_index = random.randrange(num_sentences - 2)
                if target_index >= i:
                    target_index += 2
                examples.append((sentences[i], sentences[target_index], 0))

    input_ids, token_type_ids, attention_masks, nsp_labels = [], [], [], []

    for first, second, label in examples:
        # Tokenize and concatenate the first and second sentences with [SEP] token
        tokenized = tokenizer.encode_plus(first, second, add_special_tokens=True, max_length=max_length, padding="max_length", truncation=True)
        input_ids.append(tokenized['input_ids'])
        token_type_ids.append(tokenized['token_type_ids'])
        attention_masks.append(tokenized['attention_mask'])
        nsp_labels.append(label)

    # Convert lists to tensors; the explicit dtype keeps an empty batch
    # integer-typed instead of defaulting to float32.
    input_ids = tf.constant(input_ids, dtype=tf.int32)
    token_type_ids = tf.constant(token_type_ids, dtype=tf.int32)
    attention_masks = tf.constant(attention_masks, dtype=tf.int32)
    nsp_labels = tf.constant(nsp_labels, dtype=tf.int32)

    return input_ids, token_type_ids, attention_masks, nsp_labels

# `texts` and `tokenizer` are not defined by any earlier live cell, so define
# them here to keep this cell runnable on a fresh kernel.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Small inline sample. To use real articles instead, load the dataset first
# (it is only loaded in a later cell) and take a small slice of its text:
# texts = dataset['text'][:10]  # Using a subset for this example
texts = [
    "This is the first sentence. This is the second sentence. This is an unrelated sentence.",
]
input_ids, token_type_ids, attention_masks, nsp_labels = prepare_mlm_and_nsp_data(texts, tokenizer)


In [8]:
from datasets import load_dataset

# Load the English Wikipedia dump identified by the "20220301.en" config.
# `split` is given as a list, so the result is indexed with [0] below to
# reach the train split.
dataset = load_dataset("wikipedia", "20220301.en", split=["train"])
# print(dataset[0])
# Slice the first 1000 rows before selecting the column: indexing the whole
# 'text' column first can materialize every article just to keep 1000 of them.
article_texts_dataset = dataset[0][:1000]['text']

# Extract text and write to a file
with open('input_text.txt', 'w', encoding='utf-8') as f:
    for article in article_texts_dataset:
        # Write each Wikipedia article on a single line (internal newlines become spaces)
        f.write(article.replace('\n', ' ') + '\n')


In [None]:
# print(dataset[0]['text'][1])

In [9]:
# Run the pretraining-data script on input_text.txt to produce output.tfrecord.
# NOTE(review): the recorded output of this cell is a TypeError raised inside
# create_pretraining_data.py (create_training_instances() is called without its
# required 'nsp_enabled' argument), so output.tfrecord is not produced by this
# run until that script is fixed.
!python create_pretraining_data.py --vocab_file vocab.txt --input_text input_text.txt --output_tfrecord output.tfrecord --do_lower_case --nsp


Traceback (most recent call last):
  File "/mnt/d/MyDev/attention/transformerlab/bert/create_pretraining_data.py", line 103, in <module>
    instances = create_training_instances(
TypeError: create_training_instances() missing 1 required positional argument: 'nsp_enabled'


In [None]:
import tensorflow as tf

def _parse_function(proto):
    """Parse one serialized record into a dict of dense tensors.

    The schema below has to mirror the one used when the TFRecord was
    written: two fixed-length (128) int64 vectors, two variable-length int64
    lists and one scalar int64 label.
    """
    seq_len = 128  # fixed length assumed for input_ids and segment_ids
    schema = {
        'input_ids': tf.io.FixedLenFeature([seq_len], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([seq_len], tf.int64),
        'masked_lm_positions': tf.io.VarLenFeature(tf.int64),
        'masked_lm_labels': tf.io.VarLenFeature(tf.int64),
        'next_sentence_labels': tf.io.FixedLenFeature([], tf.int64),
    }

    # Decode a single example according to the schema
    record = tf.io.parse_single_example(proto, schema)

    # Variable-length features come back sparse; densify them, filling with 0
    for sparse_key in ('masked_lm_positions', 'masked_lm_labels'):
        record[sparse_key] = tf.sparse.to_dense(record[sparse_key], default_value=0)

    return record
# Read the TFRecord file
def load_dataset(file_path):
    """Open a TFRecord file and return a dataset of parsed feature dicts.

    NOTE(review): this name shadows the `load_dataset` imported from
    `datasets` at the top of the notebook; after this cell runs, earlier cells
    that call the Hugging Face loader will hit this function instead.
    """
    # Read the raw records, then decode each one with _parse_function
    return tf.data.TFRecordDataset(file_path).map(_parse_function)

# Location of the TFRecord file to inspect
tfrecord_file_path = 'output.tfrecord'

# Build the parsed dataset from that file
parsed_dataset = load_dataset(tfrecord_file_path)

# Print every field of the first 2 records
for parsed_record in parsed_dataset.take(2):
    for caption, key in (
        ('Input IDs:', 'input_ids'),
        ('Segment IDs:', 'segment_ids'),
        ('Masked LM Positions:', 'masked_lm_positions'),
        ('Masked LM Labels:', 'masked_lm_labels'),
        ('Next Sentence Label:', 'next_sentence_labels'),
    ):
        print(caption, parsed_record[key].numpy())
    print('---')
