In [27]:
import sys
import random
import re
sys.path.append('../src')
import pandas as pd
from transformer_encoder import TransformerEncoderV3  
from positional_encoding import encode_pos_sin_cosine
import seaborn as sns
import numpy as np
import nltk
from datasets import load_dataset
from transformers import BertTokenizer
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, BinaryCrossentropy


In [2]:
from datasets import load_dataset
# Load an example dataset, 'wikipedia' for English, 2020-03-01 version
dataset = load_dataset("wikipedia", "20220301.en", split="train")
print('how the dataset looks:', dataset[0].keys())
num_of_articles = 1000
# article_texts_dataset = dataset[0]['text'][:num_of_articles]
# # Extract text and write to a file
# with open('input_text.txt', 'w', encoding='utf-8') as f:
#     for article  in article_texts_dataset:
#         # Write each Wikipedia article on a new line
#         f.write(article.replace('\n', ' ') + '\n')

how the dataset looks: dict_keys(['id', 'url', 'title', 'text'])


In [3]:
first_article = dataset[0]['text']
print('words in first_article:', len(first_article))
print('how the article look:\n',first_article[:1500])
print('..........................................\n........................')
print(first_article[-500:])

words in first_article: 43985
how the article look:
 Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.

Humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of th

Document and Sentence Segmentation:

Document Boundary: Each Wikipedia article can be treated as a single document. This aligns with the BERT requirement where each document is separated by blank lines.

Sentence Tokenization: Use a sentence tokenizer to convert each paragraph into distinct sentences. This is crucial because BERT's NSP task assumes that two consecutive sentences in the data might be used as training pairs.

In [4]:
def save_articles_with_doc_boundary(dataset_name, config_name, split_name, output_file_path, num_of_articles=1000):
    nltk.download('punkt')
    dataset = load_dataset(dataset_name, config_name, split=split_name)
    articles_to_process = dataset.select(range(num_of_articles))
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for i, article in enumerate(articles_to_process):
            title = article.get('title', f"No Title Available for Article {i}")
            art_id = article.get('id', "No ID")
            art_url = article.get('url', "No URL")
            sentences = nltk.sent_tokenize(article['text'])
            full_article_text = f"ARTICLE-{i}-{art_id}-{art_url}-{title}\n" + '\n'.join(sentences) + '\n\n'
            file.write(full_article_text)
# Call the function to process and save articles
output_file_path = 'wiki_articles_with_seperator.txt'
save_articles_with_doc_boundary('wikipedia', '20220301.en', 'train', output_file_path)

[nltk_data] Downloading package punkt to /home/bhujay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def display_article_lines(file_path, num_lines=5, num_articles=5):    
    with open(file_path, 'r', encoding='utf-8') as file:
        article_count = 0
        line_count = 0
        for line in file:
            if line.startswith('ARTICLE-'):  # New article detected
                if article_count >= num_articles:
                    break                          
                article_count += 1
                line_count = 0  # Reset line count for the new article
            if line_count < num_lines:
                print(line)
                line_count += 1
            else:
                continue  # Skip further lines until the next article starts
output_file_path = 'wiki_articles_with_seperator.txt'
display_article_lines(output_file_path, num_lines=5, num_articles=5)

ARTICLE-0-12-https://en.wikipedia.org/wiki/Anarchism-Anarchism

Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy.

Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful.

As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.

Humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires.

ARTICLE-1-25-https://en.wikipedia.org/wiki/Autism-Autism

Autism is a neurodevelopmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior.

Parents often notice signs during th

In [20]:
class TrainingInstance:
    """A single training instance (sentence pair)."""
    def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, is_random_next):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels
        self.is_random_next = is_random_next

    def __str__(self):
        tokens_str = " ".join([str(token) for token in self.tokens])
        segment_ids_str = " ".join(map(str, self.segment_ids))
        masked_lm_positions_str = " ".join(map(str, self.masked_lm_positions))
        masked_lm_labels_str = " ".join([str(label) for label in self.masked_lm_labels])
        return f"Tokens: {tokens_str}\nSegment IDs: {segment_ids_str}\n" \
               f"Is Random Next: {self.is_random_next}\n" \
               f"Masked LM Positions: {masked_lm_positions_str}\n" \
               f"Masked LM Labels: {masked_lm_labels_str}\n"

    def __repr__(self):
        return self.__str__()

def mask_tokens(tokens, tokenizer, max_predictions_per_seq, rng):
    """Masks tokens and returns masked tokens and corresponding labels."""
    output_tokens = tokens[:]
    output_labels = [-1] * len(tokens)  # Initialize labels with -1 (no change)

    # Determine which tokens can be masked
    candidate_indices = [
        i for i, token in enumerate(tokens) 
        if token not in [tokenizer.cls_token, tokenizer.sep_token]
    ]
    rng.shuffle(candidate_indices)
    num_masked = min(max_predictions_per_seq, len(candidate_indices) * 15 // 100)
    
    for index in candidate_indices[:num_masked]:
        random_choice = rng.random()
        # 80% replace with [MASK], 10% random token, 10% unchanged
        if random_choice < 0.8:
            output_tokens[index] = tokenizer.mask_token
        elif random_choice < 0.9:
            output_tokens[index] = random.choice(list(tokenizer.vocab.keys()))
        
        output_labels[index] = tokenizer.convert_tokens_to_ids(tokens[index])

    return output_tokens, output_labels

def truncate_and_process(tokens_a, tokens_b, max_seq_length, tokenizer, max_predictions_per_seq, instances, rng, is_random_next):
    # Truncate tokens_a and tokens_b if their combined length is too long
    while len(tokens_a) + len(tokens_b) + 3 > max_seq_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

    masked_tokens, masked_labels = mask_tokens(tokens, tokenizer, max_predictions_per_seq, rng)

    # Convert masked_tokens to IDs
    token_ids = tokenizer.convert_tokens_to_ids(masked_tokens)  # Ensure this returns integers
    instance = TrainingInstance(
        tokens=token_ids,
        segment_ids=segment_ids,
        masked_lm_positions=[i for i, label in enumerate(masked_labels) if label != -1],
        masked_lm_labels=[label for label in masked_labels if label != -1],
        is_random_next=int(is_random_next)
    )
    # instance = {
    #     'tokens': token_ids,
    #     'segment_ids': segment_ids,
    #     'masked_lm_positions': [i for i, label in enumerate(masked_labels) if label != -1],
    #     'masked_lm_labels': [label for label in masked_labels if label != -1],
    #     'is_random_next': int(is_random_next)
    # }
    instances.append(instance)



In [21]:
def create_bert_pretraining_instances_in_chunks(file_path, chunk_size=1048576, 
                          doc_boundary_pattern=r'ARTICLE-\d+-\d+-https:\/\/\S+',
                         test_print=10, max_seq_length=128, max_predictions_per_seq=20, 
                         dupe_factor=5, random_seed=12345, nsp_enabled=True):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    rng = random.Random(random_seed)
    buffer = ''
    instances = []    
    with open(file_path, 'r', encoding='utf-8') as file:
        while True:
            chunk = file.read(chunk_size)
            if not chunk:
                break
            buffer += chunk
            documents = re.split(doc_boundary_pattern, buffer, flags=re.MULTILINE)            
            if documents and not re.match(doc_boundary_pattern, documents[-1]):
                buffer = documents.pop()
            else:
                buffer = ''            
            for doc in documents:
                if not doc.strip():
                    continue
                tokenized_doc = tokenizer.tokenize(doc)
                sequences = [tokenized_doc[i:i+max_seq_length] for i in range(0, len(tokenized_doc), max_seq_length)]                
                for j in range(len(sequences) - 1):
                    tokens_a = sequences[j]
                    if rng.random() > 0.5 or not nsp_enabled:
                        is_random_next = True
                        tokens_b = sequences[rng.randint(0, len(sequences) - 1)]
                    else:
                        is_random_next = False
                        tokens_b = sequences[j + 1]

                    truncate_and_process(tokens_a, tokens_b, max_seq_length, tokenizer, max_predictions_per_seq, instances, rng, is_random_next)

                    if test_print > 0:
                        print(f"Tokens A: {tokens_a}, len: {len(tokens_a)}")
                        print(f"Tokens B: {tokens_b[:10]}")
                        print(f"Is random next: {is_random_next}\n")
                        test_print -= 1
                        if test_print == 0:
                            return instances
    return instances  # Return all instances for further processing or training
file_path = 'wiki_articles_with_seperator.txt'
res = create_bert_pretraining_instances_in_chunks(file_path)

Tokens A: ['ana', '##rch', '##ism', 'is', 'a', 'political', 'philosophy', 'and', 'movement', 'that', 'is', 'sc', '##ept', '##ical', 'of', 'authority', 'and', 'rejects', 'all', 'involuntary', ',', 'coe', '##rc', '##ive', 'forms', 'of', 'hierarchy', '.', 'ana', '##rch', '##ism', 'calls', 'for', 'the', 'abolition', 'of', 'the', 'state', ',', 'which', 'it', 'holds', 'to', 'be', 'unnecessary', ',', 'und', '##es', '##ira', '##ble', ',', 'and', 'harmful', '.', 'as', 'a', 'historically', 'left', '-', 'wing', 'movement', ',', 'placed'], len: 63
Tokens B: [',', 'or', 'empires', '.', 'with', 'the', 'rise', 'of', 'organised', 'hierarchical']
Is random next: False

Tokens A: [',', 'or', 'empires', '.', 'with', 'the', 'rise', 'of', 'organised', 'hierarchical', 'bodies', ',', 'sc', '##ept', '##ici', '##sm', 'toward', 'authority', 'also', 'rose', '.', 'although', 'traces', 'of', 'anarchist', 'thought', 'are', 'found', 'throughout', 'history', ',', 'modern', 'ana', '##rch', '##ism', 'emerged', 'from', 

In [23]:
print(res[0])

Tokens: 101 9617 11140 2964 2003 103 2576 4695 103 2929 2008 2003 8040 23606 7476 1997 3691 1998 103 2035 26097 1010 24873 11890 3512 3596 4090 12571 1012 9617 11140 2964 4455 2005 1996 15766 1997 1996 2110 1010 103 2009 4324 2000 2022 14203 1010 103 2229 7895 3468 1010 1998 17631 1012 2004 1037 7145 2187 1011 3358 30221 1010 2872 102 1010 2030 23560 1012 2007 1996 4125 1997 7362 25835 103 1010 8040 23606 103 6491 2646 3691 2036 3123 1012 103 103 1997 103 2245 2024 2179 103 103 1010 2715 9617 11140 2964 6003 2013 1996 16724 1012 2076 1996 3732 2431 1997 17571 3708 1998 1996 2034 5109 1997 1996 3983 2301 1010 1996 18448 103 17893 1999 2087 102
Segment IDs: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Is Random Next: 0
Masked LM Positions: 5 8 18 26 40 47 51 61 63 75 79 86 87 89 9

In [28]:
def save_instances_as_parquet(instances, file_path):
    data = [instance.__dict__ for instance in instances]  # Convert instance objects to dictionaries
    df = pd.DataFrame(data)
    df.to_parquet(file_path, engine='pyarrow')  # Save as Parquet

# Example usage
instances = create_bert_pretraining_instances_in_chunks(file_path, test_print=0)
pres = save_instances_as_parquet(instances, 'pretraining_bert_data.parquet')

In [31]:
### Convert these articles to pretraining data 
# !python create_pretraining_data.py --vocab_file vocab.txt --input_text input_text.txt --output_tfrecord output.tfrecord --do_lower_case --nsp


In [2]:
vocab_size = 30000  # Smaller vocabulary size for simplicity
num_layers = 2  # Fewer layers
d_model = 256  # Smaller dimensionality
num_heads = 4
dff = 512
segment_size = 2
max_seq_length = 128

In [6]:
def load_and_print_dataset(filepath):
    raw_dataset = tf.data.TFRecordDataset(filepath)
    for i, raw_record in enumerate(raw_dataset):  # Adjust the number based on how many you want to check
        # print("Raw record:", raw_record.numpy())
        try:
            example = tf.io.parse_single_example(
                raw_record,
                {
                    # 'input_ids': tf.io.FixedLenFeature([128], tf.int64),
                    'input_ids': tf.io.VarLenFeature(tf.int64),
                    'segment_ids': tf.io.VarLenFeature(tf.int64),
                    'masked_lm_positions': tf.io.VarLenFeature(tf.int64),
                    'masked_lm_labels': tf.io.VarLenFeature(tf.int64),
                    'next_sentence_labels': tf.io.FixedLenFeature([], tf.int64)
                }
            )
            # print("Parsed example:", example)
        except Exception as e:
            print(f"Failed to parse record {i}: {e}")
            print("Raw record:", raw_record.numpy())
            break  # Or continue based on how you want to handle errors

# Example usage
load_and_print_dataset('output.tfrecord')


2024-05-12 17:40:22.878046: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [23]:
def parse_tfrecord(serialized_example):
    feature_description = {
        'input_ids': tf.io.VarLenFeature(tf.int64),
        'segment_ids': tf.io.VarLenFeature(tf.int64),
        'masked_lm_positions': tf.io.VarLenFeature(tf.int64),
        'masked_lm_labels': tf.io.VarLenFeature(tf.int64),
        'next_sentence_labels': tf.io.FixedLenFeature([], tf.int64)
    }
    # print(serialized_example)
    example = tf.io.parse_single_example(serialized_example, feature_description)
    # print(example)
    input_ids = tf.cast(example['input_ids'], tf.int32)
    input_ids = tf.sparse.to_dense(input_ids)
    segment_ids = tf.cast(example['segment_ids'], tf.int32)
    segment_ids = tf.sparse.to_dense(segment_ids)
    masked_lm_positions = tf.sparse.to_dense(example['masked_lm_positions'])
    masked_lm_labels = tf.sparse.to_dense(example['masked_lm_labels'])
    next_sentence_labels = tf.cast(example['next_sentence_labels'], tf.int32)
    inputs = {'input_ids': input_ids, 'segment_ids': segment_ids}
    labels = {'masked_lm_positions': masked_lm_positions,
              'mlm_labels': masked_lm_labels, 'nsp_labels': next_sentence_labels}
    return (inputs, labels)

def load_dataset(filepath, batch_size):
    raw_dataset = tf.data.TFRecordDataset(filepath)
    parsed_dataset = raw_dataset.map(parse_tfrecord)
    # Define padding shapes for each component of the dataset
    padded_shapes = ({
        'input_ids': [None],  # Dynamic padding for input_ids
        'segment_ids': [None]  # Dynamic padding for segment_ids
    }, {
        'masked_lm_positions': [None],  # Dynamic padding for positions
        'mlm_labels': [None],  # Dynamic padding for mlm labels
        'nsp_labels': []  # No padding needed for scalar labels
    })

    # Use padded_batch to handle variable sequence lengths
    batched_dataset = parsed_dataset.padded_batch(batch_size, padded_shapes=padded_shapes)
    # batched_dataset = parsed_dataset.batch(batch_size)    
    return batched_dataset

# Usage
batch_size = 32
train_dataset = load_dataset('output.tfrecord', batch_size)

In [24]:
single_test_instance = iter(train_dataset.take(1)).next()
single_input_tuple = single_test_instance[0]['input_ids'], single_test_instance[0]['segment_ids']
print(single_test_instance)
print()

({'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[  101,  9617, 11140, ...,  2591,  4813,   102],
       [  101, 19465,  2003, ...,   103,  1997,   102],
       [  101,  2632, 28759, ...,  3774,  1997,   102],
       ...,
       [  101,   103, 11332, ..., 12626,  1997,   102],
       [  101,  1996, 16951, ...,  1996,  7842,   102],
       [  101, 17694, 15396, ...,   103, 24988,   102]], dtype=int32)>, 'segment_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]], dtype=int32)>}, {'masked_lm_positions': <tf.Tensor: shape=(32, 18), dtype=int64, numpy=
array([[  5,   8,  18,  26,  40,  47,  51,  61,  63,  75,  79,  86,  87,
         89,  93,  94, 110, 123],
       [ 19,  22,  31,  33,  37,  45,  46,  49,  56,  61,  64,  85,  95,
         98, 108, 111, 112, 125],
 

In [25]:
class PositionalAndSegmentEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, segment_size, d_model, max_pos=2048, pos_dropout=0.1, **kwargs):
        super().__init__(**kwargs)  # Initialize the superclass (Layer)
        self.d_model = d_model  # Store the dimensionality of the model embeddings
        self.token_embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.segment_embedding = tf.keras.layers.Embedding(segment_size, d_model)
        self.pos_encoding = encode_pos_sin_cosine(max_pos, d_model, debug=False)
        self.dropout = tf.keras.layers.Dropout(pos_dropout)

    def compute_mask(self, inputs, *args, **kwargs):
        # Assuming the input structure is a tuple of (tokens, segments)
        token_inputs, _ = inputs
        return self.token_embedding.compute_mask(token_inputs, *args, **kwargs)

    def call(self, inputs, training=False):
        # Expect inputs to be a tuple (token_inputs, segment_inputs)
        token_inputs, segment_inputs = inputs
        tokens = self.token_embedding(token_inputs)  # Token embeddings
        segments = self.segment_embedding(segment_inputs)  # Segment embeddings       
        x = tokens + segments
        # Scale the embeddings by the square root of the embedding dimension size
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # Add positional encoding to the combined embeddings, sliced to match the input length
        length = tf.shape(x)[1]
        # pos_encodings = tf.reshape(self.pos_encoding, (1, -1, self.d_model))[:, :length, :]
        pos_encodings = tf.cast(tf.reshape(self.pos_encoding, (1, -1, self.d_model))[:, :tf.shape(x)[1], :], tf.float32)
        x += pos_encodings
        x = self.dropout(x, training=training)
        return x

embedding_layer = PositionalAndSegmentEmbedding(vocab_size=vocab_size, segment_size=2, d_model=256)

# Extract a single batch from the dataset
for inputs, labels in train_dataset.take(1):
    # The inputs dictionary contains 'input_ids' and 'segment_ids'
    input_ids = inputs['input_ids']
    segment_ids = inputs['segment_ids']

    # Call the embedding layer
    embeddings = embedding_layer((input_ids, segment_ids))

    # Print the output shape
    print("Output shape:", embeddings.shape)
    print("Output shape:", embeddings._keras_mask)
    break

Output shape: (32, 128, 256)
Output shape: tf.Tensor(
[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]], shape=(32, 128), dtype=bool)


In [26]:
class TransformerEncoderV4(TransformerEncoderV3):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, segment_size, max_pos=2048, pos_dropout=0.1, **kwargs):
        super(TransformerEncoderV4, self).__init__(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=vocab_size, max_pos=max_pos, **kwargs)
        # Use the custom embedding layer that handles tokens, segments, and positional encodings
        self.embedding_layer = PositionalAndSegmentEmbedding(vocab_size, segment_size, d_model, max_pos, pos_dropout)

    def call(self, inputs, training=False):
        input_ids, segment_ids = inputs
        # The embedding layer now handles everything including token, segment, and positional embeddings
        x = self.embedding_layer((input_ids, segment_ids), training=training)
        x = self.enc_layers_0(x, training=training)
        for i in range(self.remaining_layers):
            x = self.enc_layers[i](x, training=training)
        return x
tren = TransformerEncoderV4(num_layers, d_model, num_heads, dff, vocab_size, segment_size, max_pos=max_seq_length)

encoder_out = tren(single_input_tuple)
print(encoder_out.shape)

(32, 128, 256)


In [27]:
class BERT(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, segment_size, max_seq_length=128, rate=0.1):
        super(BERT, self).__init__()
        self.encoder = TransformerEncoderV4(num_layers=num_layers, d_model=d_model, num_heads=num_heads,
                                            dff=dff, vocab_size=vocab_size, segment_size=segment_size,
                                            max_pos=max_seq_length, pos_dropout=rate)
        self.mlm_dense = tf.keras.layers.Dense(vocab_size, activation='softmax')  # Ensures output shape is [batch, seq_length, vocab_size]
        self.nsp_dense = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs, training=False):
        x = self.encoder((inputs['input_ids'], inputs['segment_ids']), training=training)
        mlm_output = self.mlm_dense(x)  # Check shapes here
        nsp_output = self.nsp_dense(x[:, 0, :])
        return {'mlm_output': mlm_output, 'nsp_output': nsp_output}


bert_model = BERT(num_layers, d_model, num_heads, 
                  dff, vocab_size, segment_size)
bert_out = bert_model(single_test_instance[0])
print(bert_out)

{'mlm_output': <tf.Tensor: shape=(32, 128, 30000), dtype=float32, numpy=
array([[[3.8340859e-05, 3.7446305e-05, 2.8395381e-05, ...,
         2.6639331e-05, 3.7298159e-05, 3.4331169e-05],
        [3.5365043e-05, 3.5282104e-05, 3.7169062e-05, ...,
         2.2325734e-05, 3.5428879e-05, 3.0616156e-05],
        [3.5435303e-05, 3.2537922e-05, 3.5552261e-05, ...,
         2.3641700e-05, 3.7744659e-05, 3.3079072e-05],
        ...,
        [3.8897797e-05, 3.2174790e-05, 3.5057983e-05, ...,
         2.0151998e-05, 3.3009997e-05, 3.7184676e-05],
        [3.8864127e-05, 2.8284470e-05, 3.2879325e-05, ...,
         2.0593974e-05, 3.2062126e-05, 4.0074581e-05],
        [4.0870749e-05, 3.0497569e-05, 3.2746706e-05, ...,
         1.9698917e-05, 3.7209582e-05, 3.7094494e-05]],

       [[3.8267041e-05, 3.6553694e-05, 2.8442262e-05, ...,
         2.6696673e-05, 3.8556202e-05, 3.5093311e-05],
        [3.3343262e-05, 3.7650851e-05, 3.7500002e-05, ...,
         2.4165498e-05, 3.2304510e-05, 3.7988360e-05],


In [28]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_object_mlm = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, 
                                                                reduction=tf.keras.losses.Reduction.NONE)
loss_object_nsp = tf.keras.losses.BinaryCrossentropy(from_logits=True)


def compute_mlm_loss(masked_positions, masked_labels, logits):
    # Gather the logits at the masked positions
    masked_logits = tf.gather(logits, masked_positions, batch_dims=1, axis=1)
    
    # Ensure that the masked_labels used here are the correct length and match the number of masked_positions
    mlm_loss = tf.keras.losses.sparse_categorical_crossentropy(masked_labels, masked_logits, from_logits=True)

    # Reduce mean across batches if needed or sum as appropriate
    return tf.reduce_mean(mlm_loss)


@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        predictions = bert_model(inputs, training=True)  # Predictions will have 'mlm_output' and 'nsp_output'
        # Compute the MLM loss using the positions and labels
        loss_mlm = compute_mlm_loss(labels['masked_lm_positions'], 
                                    labels['mlm_labels'], predictions['mlm_output'])
        # NSP loss remains the same
        loss_nsp = loss_object_nsp(labels['nsp_labels'], predictions['nsp_output'])
        total_loss = loss_mlm + loss_nsp

    gradients = tape.gradient(total_loss, bert_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, bert_model.trainable_variables))
    return total_loss, loss_mlm, loss_nsp

# Ensure that your dataset loading function is correctly parsing and returning 'masked_lm_positions' and 'masked_lm_labels'

batch_size = 16
train_dataset = load_dataset('output.tfrecord', batch_size)
# Training loop
epochs = 3
for epoch in range(epochs):
    step = 0
    for inputs, labels in train_dataset:
        loss_values = train_step(inputs, labels)
        if step % 10 == 0:
            print(f"Epoch {epoch + 1}, Step {step}, Total Loss: {loss_values[0].numpy():.4f}, MLM Loss: {loss_values[1].numpy():.4f}, NSP Loss: {loss_values[2].numpy():.4f}")
        step += 1

Epoch 1, Step 0, Total Loss: 11.0329, MLM Loss: 10.3090, NSP Loss: 0.7240
Epoch 1, Step 10, Total Loss: 11.0875, MLM Loss: 10.3090, NSP Loss: 0.7785
Epoch 1, Step 20, Total Loss: 11.0651, MLM Loss: 10.3090, NSP Loss: 0.7562
Epoch 1, Step 30, Total Loss: 10.9832, MLM Loss: 10.3090, NSP Loss: 0.6742
Epoch 1, Step 40, Total Loss: 11.0547, MLM Loss: 10.3090, NSP Loss: 0.7458
Epoch 1, Step 50, Total Loss: 11.1379, MLM Loss: 10.3090, NSP Loss: 0.8290
Epoch 1, Step 60, Total Loss: 11.2094, MLM Loss: 10.3090, NSP Loss: 0.9005
Epoch 1, Step 70, Total Loss: 11.1596, MLM Loss: 10.3090, NSP Loss: 0.8507
Epoch 1, Step 80, Total Loss: 11.0163, MLM Loss: 10.3090, NSP Loss: 0.7074
Epoch 1, Step 90, Total Loss: 11.0615, MLM Loss: 10.3090, NSP Loss: 0.7526
Epoch 1, Step 100, Total Loss: 11.0120, MLM Loss: 10.3090, NSP Loss: 0.7030
Epoch 1, Step 110, Total Loss: 11.0890, MLM Loss: 10.3089, NSP Loss: 0.7801
Epoch 1, Step 120, Total Loss: 11.0855, MLM Loss: 10.3089, NSP Loss: 0.7765
Epoch 1, Step 130, Tota

2024-05-12 18:16:21.953513: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 2, Step 0, Total Loss: 11.0036, MLM Loss: 10.3088, NSP Loss: 0.6948
Epoch 2, Step 10, Total Loss: 10.9292, MLM Loss: 10.3088, NSP Loss: 0.6204
Epoch 2, Step 20, Total Loss: 11.0623, MLM Loss: 10.3088, NSP Loss: 0.7535
Epoch 2, Step 30, Total Loss: 11.0720, MLM Loss: 10.3087, NSP Loss: 0.7633
Epoch 2, Step 40, Total Loss: 10.9847, MLM Loss: 10.3088, NSP Loss: 0.6759
Epoch 2, Step 50, Total Loss: 10.9817, MLM Loss: 10.3087, NSP Loss: 0.6730
Epoch 2, Step 60, Total Loss: 11.0598, MLM Loss: 10.3087, NSP Loss: 0.7510
Epoch 2, Step 70, Total Loss: 10.9881, MLM Loss: 10.3087, NSP Loss: 0.6794
Epoch 2, Step 80, Total Loss: 11.0788, MLM Loss: 10.3086, NSP Loss: 0.7701
Epoch 2, Step 90, Total Loss: 11.0113, MLM Loss: 10.3086, NSP Loss: 0.7026
Epoch 2, Step 100, Total Loss: 11.0149, MLM Loss: 10.3086, NSP Loss: 0.7062
Epoch 2, Step 110, Total Loss: 11.1322, MLM Loss: 10.3085, NSP Loss: 0.8236
Epoch 2, Step 120, Total Loss: 10.9869, MLM Loss: 10.3085, NSP Loss: 0.6784
Epoch 2, Step 130, Tota

2024-05-12 18:22:21.464438: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 3, Step 0, Total Loss: 10.9019, MLM Loss: 10.2695, NSP Loss: 0.6324
Epoch 3, Step 10, Total Loss: 10.9768, MLM Loss: 10.2712, NSP Loss: 0.7056
Epoch 3, Step 20, Total Loss: 10.9484, MLM Loss: 10.2651, NSP Loss: 0.6833
Epoch 3, Step 30, Total Loss: 10.9837, MLM Loss: 10.2471, NSP Loss: 0.7367
Epoch 3, Step 40, Total Loss: 10.9918, MLM Loss: 10.2748, NSP Loss: 0.7170
Epoch 3, Step 50, Total Loss: 10.9657, MLM Loss: 10.2524, NSP Loss: 0.7132
Epoch 3, Step 60, Total Loss: 11.0250, MLM Loss: 10.2734, NSP Loss: 0.7517
Epoch 3, Step 70, Total Loss: 10.9326, MLM Loss: 10.2591, NSP Loss: 0.6735
Epoch 3, Step 80, Total Loss: 10.9131, MLM Loss: 10.2483, NSP Loss: 0.6648
Epoch 3, Step 90, Total Loss: 10.9206, MLM Loss: 10.2701, NSP Loss: 0.6505
Epoch 3, Step 100, Total Loss: 10.9700, MLM Loss: 10.2643, NSP Loss: 0.7057
Epoch 3, Step 110, Total Loss: 10.9996, MLM Loss: 10.2656, NSP Loss: 0.7340
Epoch 3, Step 120, Total Loss: 10.9620, MLM Loss: 10.2717, NSP Loss: 0.6904
Epoch 3, Step 130, Tota

2024-05-12 18:28:38.523818: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [6]:
# def create_bert_pretraining_instances_in_chunks(file_path, chunk_size=1048576, 
#                           doc_boundary_pattern=r'ARTICLE-\d+-\d+-https:\/\/\S+',
#                          test_print=2):   
#     '''
#     the chunk can read more than one documents and hence the buffer can hold more that one doc . Hence the documents can also hold more than one doc
#     but when chunk is smaller than the smallest  doc in file, the documents is essentially end up hoding one doc at a time 
#     '''
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     buffer = ''
#     with open(file_path, 'r', encoding='utf-8') as file:
#         while True:
#             chunk = file.read(chunk_size)
#             if not chunk:
#                 break
#             buffer += chunk
#             # Split buffer by document boundary and keep the content only
#             documents = re.split(doc_boundary_pattern, buffer, flags=re.MULTILINE)            
#             # If the last part might not be a complete document, keep it in the buffer
#             if documents and not re.match(doc_boundary_pattern, documents[-1]):
#                 buffer = documents.pop()
#             else:
#                 buffer = ''            
#             # Process each document found in this chunk
#             for i, doc in enumerate(documents):
#                 if not doc.strip():  # Skip any empty results from split
#                     continue
#                 sentences = nltk.sent_tokenize(doc)
#                 tokens = tokenizer.tokenize(doc)
#                 if test_print > 0:
#                     print(f"First few tokens: {tokens[:10]}") 
#                     print(f"First sentence: {sentences[:3] if sentences else 'No content'}\n")
#                 if i == test_print: return
# file_path = 'wiki_articles_with_seperator.txt'
# create_bert_pretraining_instances_in_chunks(file_path)

In [12]:
# def create_bert_pretraining_instances_in_chunks(file_path, chunk_size=1048576, 
#                           doc_boundary_pattern=r'ARTICLE-\d+-\d+-https:\/\/\S+',
#                          test_print=10, max_seq_length=128, max_predictions_per_seq=20, 
#                          dupe_factor=5, random_seed=12345, nsp_enabled=True):
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     buffer = ''
#     rng = random.Random(random_seed)
#     instances = []
#     with open(file_path, 'r', encoding='utf-8') as file:
#         while True:
#             chunk = file.read(chunk_size)
#             if not chunk:
#                 break
#             buffer += chunk
#             documents = re.split(doc_boundary_pattern, buffer, flags=re.MULTILINE)
#             if documents and not re.match(doc_boundary_pattern, documents[-1]):
#                 buffer = documents.pop()
#             else:
#                 buffer = ''
            
#             for i, doc in enumerate(documents):
#                 if not doc.strip():
#                     continue
#                 tokenized_doc = tokenizer.tokenize(doc)
#                 # Split tokenized document into sequences, not implemented here, assumes tokenized_doc is already split
#                 sequences = [tokenized_doc[i:i+max_seq_length] for i in range(0, len(tokenized_doc), max_seq_length)]
#                 for j in range(len(sequences) - 1):
#                     tokens_a = sequences[j]
#                     if rng.random() > 0.5 or not nsp_enabled:
#                         is_random_next = True
#                         # Randomly pick any sequence from any document
#                         tokens_b = sequences[rng.randint(0, len(sequences) - 1)]
#                     else:
#                         is_random_next = False
#                         # Ensure tokens_b is the next sequence in the same document
#                         tokens_b = sequences[j + 1]
                    
#                     if test_print > 0:
#                         print(f"Tokens A: {tokens_a}, len:: {len(tokens_a) }")
#                         print(f"Tokens B: {tokens_b[:10]}")
#                         print(f"Is random next: {is_random_next}\n")
#                         test_print -= 1
#                         if test_print == 0:
#                             return  # Early exit for test purposes

# file_path = 'wiki_articles_with_seperator.txt'
# res = create_bert_pretraining_instances_in_chunks(file_path)

Tokens A: ['ana', '##rch', '##ism', 'is', 'a', 'political', 'philosophy', 'and', 'movement', 'that', 'is', 'sc', '##ept', '##ical', 'of', 'authority', 'and', 'rejects', 'all', 'involuntary', ',', 'coe', '##rc', '##ive', 'forms', 'of', 'hierarchy', '.', 'ana', '##rch', '##ism', 'calls', 'for', 'the', 'abolition', 'of', 'the', 'state', ',', 'which', 'it', 'holds', 'to', 'be', 'unnecessary', ',', 'und', '##es', '##ira', '##ble', ',', 'and', 'harmful', '.', 'as', 'a', 'historically', 'left', '-', 'wing', 'movement', ',', 'placed', 'on', 'the', 'far', '##thest', 'left', 'of', 'the', 'political', 'spectrum', ',', 'it', 'is', 'usually', 'described', 'alongside', 'communal', '##ism', 'and', 'libertarian', 'marxism', 'as', 'the', 'libertarian', 'wing', '(', 'libertarian', 'socialism', ')', 'of', 'the', 'socialist', 'movement', ',', 'and', 'has', 'a', 'strong', 'historical', 'association', 'with', 'anti', '-', 'capitalism', 'and', 'socialism', '.', 'humans', 'lived', 'in', 'societies', 'without'

In [None]:
# class Human:
#     def __init__(self, name, legs=2):
#         self.name = name
#         self.legs = legs

# class MaleHuman(Human):
#     def __init__(self, *args, **kwargs):
#         super(MaleHuman, self).__init__(*args, **kwargs)
#         # self.beard = beard

# m1 = MaleHuman('Bhujay')
# m1.name

In [None]:
# dataset = dataset.padded_batch(
    #     batch_size, 
    #     padded_shapes={'input_ids': [None], 
    #                    'segment_ids': [128], 
    #                    'masked_lm_positions': [None], 
    #                    'masked_lm_labels': [None], 
    #                    'next_sentence_labels': []})

In [None]:

# try:
#     for idx, (inputs, labels) in enumerate(train_dataset):
#         print(f"instance {idx + 1}:")
#         # print(f"Inputs: {inputs}")
#         # print(f"Labels: {labels}")
#         if idx == 10:  # Limit to 2 batches for demonstration
#             break
# except Exception as e:
#     print(f"Error while processing the dataset: {e}")

In [None]:
# def test_dataset(filepath):
#     # dataset = load_dataset(filepath, batch_size=16)  # Using batch size of 1 for simplicity
#     try:
#         for i, (inputs, labels) in enumerate(train_dataset):        
#             print(f"Successfully parsed entry {i+1}")        
#             if i == 100:  # Optionally limit the number of entries to check
#                 break
#     except Exception as e:
#             print(f'record: {i}  , Err: {e}')

# test_dataset('output.tfrecord')

In [None]:
# def parse_tfrecord(serialized_example):
#     feature_description = {
#         'input_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
#         'segment_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
#         'masked_lm_positions': tf.io.VarLenFeature(tf.int64),
#         'masked_lm_labels': tf.io.VarLenFeature(tf.int64),
#         'next_sentence_labels': tf.io.FixedLenFeature([], tf.int64)
#     }
#     try:
#         example = tf.io.parse_single_example(serialized_example, feature_description)    
#         input_ids = tf.cast(example['input_ids'], tf.int32)
#         segment_ids = tf.cast(example['segment_ids'], tf.int32)
#         # segment_ids = tf.sparse.to_dense(segment_ids, default_value=0)  # Default to 0
#         # segment_ids = tf.reshape(segment_ids, [max_seq_length]) 
#         masked_lm_positions = tf.sparse.to_dense(example['masked_lm_positions'])
#         masked_lm_labels = tf.sparse.to_dense(example['masked_lm_labels'])   
#         next_sentence_labels = tf.cast(example['next_sentence_labels'], tf.int32)
#         inputs = {'input_ids': input_ids, 'segment_ids': segment_ids, }
#         labels = {'masked_lm_positions': masked_lm_positions, 
#                   'mlm_labels': masked_lm_labels, 'nsp_labels': next_sentence_labels}
#     except Exception as e:
#         print(f"Failed to parse example: {e}")
#         # Return dummy/default data to allow pipeline to continue
#         inputs = {
#             'input_ids': tf.constant([0]*args.max_seq_length, dtype=tf.int32),
#             'segment_ids': tf.constant([0]*args.max_seq_length, dtype=tf.int32)
#             }
#         labels = {
#             'masked_lm_positions': tf.constant([-1]*20, dtype=tf.int32),  # Adjust 20 to your max_predictions_per_seq
#             'masked_lm_labels': tf.constant([-1]*20, dtype=tf.int32),
#             'nsp_labels': tf.constant([0], dtype=tf.int32)
#             } 
#         return (None, None)
#     return inputs, labels




# def load_dataset(filepath, batch_size):
#     raw_dataset = tf.data.TFRecordDataset(filepath)
#     parsed_dataset = raw_dataset.map(parse_tfrecord)
#     filtered_dataset = parsed_dataset.filter(lambda x: x[0] is not None and x[1] is not None)
#     batched_dataset = filtered_dataset.batch(batch_size)
#     return batched_dataset
# # Usage
# batch_size = 16
# train_dataset = load_dataset('output.tfrecord', batch_size)

# single_test_instance = iter(train_dataset.take(1)).next()
# single_input_tuple = single_test_instance[0]['input_ids'], single_test_instance[0]['segment_ids']
# print(single_test_instance)
# print()
# # print(single_input_tuple)

In [None]:
# def _parse_function(proto):
#     # Define your tfrecord again. It must be the same as the one used for saving your data.
#     feature_description = {
#         'input_ids': tf.io.FixedLenFeature([128], tf.int64),  # Assuming input_ids are of length 128
#         'segment_ids': tf.io.FixedLenFeature([128], tf.int64),  # Assuming segment_ids are of length 128
#         'masked_lm_positions': tf.io.VarLenFeature(tf.int64),
#         'masked_lm_labels': tf.io.VarLenFeature(tf.int64),
#         'next_sentence_labels': tf.io.FixedLenFeature([], tf.int64),
#     }

#     # Load one example
#     parsed_features = tf.io.parse_single_example(proto, feature_description)
    
#     # Turn your sparse array into a dense array with default values as 0
#     parsed_features['masked_lm_positions'] = tf.sparse.to_dense(parsed_features['masked_lm_positions'], default_value=0)
#     parsed_features['masked_lm_labels'] = tf.sparse.to_dense(parsed_features['masked_lm_labels'], default_value=0)

#     return parsed_features
# # Read the TFRecord file
# def load_dataset(file_path):
#     dataset = tf.data.TFRecordDataset(file_path)
#     dataset = dataset.map(_parse_function)  # Parse the record into tensors.
#     return dataset

# # Path to the TFRecord file
# tfrecord_file_path = 'output.tfrecord'

# # Load the dataset
# parsed_dataset = load_dataset(tfrecord_file_path)
# # Display a few examples from the dataset
# for parsed_record in parsed_dataset.take(2):  # Only take first 5 examples
#     print('Input IDs:', parsed_record['input_ids'].numpy())
#     print('Segment IDs:', parsed_record['segment_ids'].numpy())
#     print('Masked LM Positions:', parsed_record['masked_lm_positions'].numpy())
#     print('Masked LM Labels:', parsed_record['masked_lm_labels'].numpy())
#     print('Next Sentence Label:', parsed_record['next_sentence_labels'].numpy())
#     print('---')


In [None]:
# def parse_tfrecord(serialized_example):
#     feature_description = {
#         'input_ids': tf.io.FixedLenFeature([128], tf.int64),
#         'segment_ids': tf.io.FixedLenFeature([128], tf.int64),
#         'masked_lm_positions': tf.io.VarLenFeature(tf.int64),
#         'masked_lm_labels': tf.io.VarLenFeature(tf.int64),
#         'next_sentence_labels': tf.io.FixedLenFeature([], tf.int64)
#     }
#     example = tf.io.parse_single_example(serialized_example, feature_description)

#     input_ids = tf.cast(example['input_ids'], tf.int32)
#     segment_ids = tf.cast(example['segment_ids'], tf.int32)
#     masked_lm_positions = tf.sparse.to_dense(example['masked_lm_positions'])
#     masked_lm_labels = tf.sparse.to_dense(example['masked_lm_labels'])
#     next_sentence_labels = tf.cast(example['next_sentence_labels'], tf.int32)

#     inputs = {'input_ids': input_ids, 'segment_ids': segment_ids}
#     labels = {'mlm_output': masked_lm_labels, 'nsp_output': next_sentence_labels}

#     return inputs, labels

In [None]:
# optimizer = Adam(learning_rate=2e-5)
# loss = {
#     'mlm_output': SparseCategoricalCrossentropy(from_logits=True),
#     'nsp_output': BinaryCrossentropy(from_logits=True)
# }
# metrics = {
#     'mlm_output': 'accuracy',
#     'nsp_output': 'accuracy'
# }



# def masked_sparse_categorical_crossentropy(y_true, y_pred):
#     # Create a mask to ignore `-1` in labels for loss calculation
#     mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)
#     # Compute sparse categorical crossentropy loss
#     loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
#     # Apply the mask
#     loss *= mask
#     # Calculate mean loss only over non-masked elements
#     return tf.reduce_sum(loss) / tf.reduce_sum(mask)

# # Ensure the custom loss function is used when compiling the model
# bert_model.compile(
#     optimizer=optimizer,
#     loss={'mlm_output': masked_sparse_categorical_crossentropy, 'nsp_output': BinaryCrossentropy(from_logits=True)},
#     metrics=metrics
# )
# # epochs = 3 
# # bert_model.fit(train_dataset, epochs=epochs)

In [None]:
# class SmallBERT(Model):
#     def __init__(self, vocab_size, num_layers, d_model, num_heads, dff, max_pos=512, rate=0.1):
#         super(SmallBERT, self).__init__()
#         self.token_embedding = Embedding(vocab_size, d_model)
#         self.position_embedding = Embedding(max_pos, d_model)
#         self.segment_embedding = Embedding(2, d_model)  # Only 2 segments assumed
        
#         self.enc_layers = [TransformerEncoderV3(num_layers=1, d_model=d_model, num_heads=num_heads, dff=dff,
#                                                 vocab_size=vocab_size, max_pos=max_pos) for _ in range(num_layers)]
        
#         self.dropout = Dropout(rate)
#         self.final_layer = Dense(vocab_size)  # Prediction layer for MLM
#         self.nsp_classifier = Dense(2, activation='softmax')  # NSP output

#     def call(self, input_ids, segment_ids, training=False):
#         seq_length = tf.shape(input_ids)[1]
#         position_ids = tf.range(seq_length)
        
#         x = self.token_embedding(input_ids) + self.position_embedding(position_ids) + self.segment_embedding(segment_ids)
#         x = self.dropout(x, training=training)
        
#         for encoder in self.enc_layers:
#             x = encoder(x)
        
#         logits = self.final_layer(x)
#         pooled_output = self.dropout(sequence_output[:, 0, :], training=training)  # Use the output of the [CLS] token
#         nsp_output = self.nsp_classifier(pooled_output)
#         return logits, nsp_output  

# vocab_size = 10000  # Smaller vocabulary size for simplicity
# num_layers = 2  # Fewer layers
# d_model = 128  # Smaller dimensionality
# num_heads = 4
# dff = 512
# small_bert = SmallBERT(vocab_size, num_layers, d_model, num_heads, dff)