In [1]:
!pip install numpy==1.26.0
!pip install torch==2.2.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install torchtext==0.17.2
!pip install torchdata==0.7.1
!pip install portalocker==2.8.2
!pip install pandas==2.2.1
!pip install transformers==4.35.2

Collecting numpy==1.26.0
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m106.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but y

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch==2.2.0
  Downloading https://download.pytorch.org/whl/cpu/torch-2.2.0%2Bcpu-cp311-cp311-linux_x86_64.whl (186.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m186.8/186.8 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.22.1%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.22.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.21.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.20.1%2Bcpu-cp311-cp311-linux_x86_64.whl (1.8 MB)
[2K     [90m━━━

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import Vocab
from torch import Tensor
from torch.nn import Transformer
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from itertools import chain
from itertools import islice
from torchtext.datasets import IMDB
from copy import deepcopy
import random
import csv
import json
from tqdm import tqdm
import pandas as pd

# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [2]:
# we will first perform basic preprocessing that we do for all NLP tasks: Tokenization and vocabulary building:
# Tokenization:

#tokenizer is initialized to tokenize text using basic English tokenization rules, converting text samples into lists of tokens.

#yield_tokens is a generator function that iterates through the data, yielding tokenized versions of the text samples. This function facilitates vocabulary building by providing a stream of tokens.

#word_dict defines special tokens used in text processing, such as padding [PAD], class [CLS], separator [SEP], mask [MASK], and unknown [UNK] tokens, with their corresponding indices.

#Special symbols and their indices are explicitly defined for clarity and used throughout data preparation.

#text_to_index and index_to_en functions are utility converters.
#The former converts text into a list of numerical indices based on the vocabulary, and the latter reverses this process, translating a sequence of indices back into readable English text.

#CLS (Classification Token): This token serves as the Start of Sentence (SOS) marker.
#It represents the overall meaning of the entire sentence. Commonly used in tasks that require understanding the entire input, like classification.

#SEP (Separator Token): Used as the End of Sentence (EOS) marker.
#It also acts as a delimiter in scenarios where a model needs to understand and differentiate between multiple sentences, like in question-answering or sentence-pair tasks.

#PAD (Padding Token): This token is added to sequences to ensure all inputs are of equal length.
#During training, it's important to note that the [PAD] token, typically with an ID of 0, does not contribute to the gradient calculations.

#MASK (Masked Token): Utilized for word replacement in tasks like masked language modeling.
# It allows models to predict the identity of masked-out words, facilitating learning of bidirectional representations.

#UNK (Unknown Token): Acts as a placeholder for words that are not found in the tokenizer's vocabulary.
#This token replaces any unknown or out-of-vocabulary item in the input data.

## Vocab Building:

#This section focuses on building the vocabulary from the given dataset.
#The vocabulary is built using the build_vocab_from_iterator function, incorporating special symbols ([PAD], [CLS], [SEP], [MASK], [UNK]) at the beginning.
#The UNK_IDX is set as the default index for unknown words, and the total vocabulary size is printed.


In [3]:
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for label, data_sample in data_iter:
        yield tokenizer(data_sample)

# Define special symbols and indices
PAD_IDX,CLS_IDX, SEP_IDX,  MASK_IDX,UNK_IDX= 0, 1, 2, 3, 4

# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['[PAD]','[CLS]', '[SEP]','[MASK]','[UNK]']

In [4]:
#create data splits
train_iter, test_iter = IMDB(split=('train', 'test'))
all_data_iter = chain(train_iter, test_iter)
#check tokenizer
# list(yield_tokens(all_data_iter))[5][:20]
fifth_item_tokens = next(islice(yield_tokens(all_data_iter), 5, None))
print(fifth_item_tokens[:20])

['i', 'would', 'put', 'this', 'at', 'the', 'top', 'of', 'my', 'list', 'of', 'films', 'in', 'the', 'category', 'of', 'unwatchable', 'trash', '!', 'there']


In [5]:
#create vocab : vocab is only built using train data
vocab=build_vocab_from_iterator(yield_tokens(all_data_iter),specials=special_symbols,special_first=True)

vocab.set_default_index(UNK_IDX)
VOCAB_SIZE=len(vocab)
print(VOCAB_SIZE)

147150


In [6]:
text_to_index=lambda text: [vocab(token) for token in tokenizer(text)]
index_to_en = lambda seq_en: " ".join([vocab.get_itos()[index] for index in seq_en])

In [7]:
seq_en = [0, 1, 2, 3, 4, 5, 6]  # Example input sequence
english_sentence = index_to_en(seq_en)
print(english_sentence)
seq2=[6,16,26131]
english_sentence = index_to_en(seq2)

print(english_sentence)

text = "I've seen R-rated films with male nudity. Nowhere, because they don't exist."  # Example input text
text_to_index = lambda text: [vocab[token] for token in tokenizer(text)]
index_sequence = text_to_index(text)

print(index_sequence)

[PAD] [CLS] [SEP] [MASK] [UNK] the .
. i speculative
[16, 12, 149, 119, 11363, 117, 22, 928, 1047, 6, 1251, 7, 96, 42, 99, 12, 30, 1877, 6]


In [8]:
# Now that we have the vocab, the next stem is masking for BERT, with BERT we want to carry out next sentence prediction (NSP), and
#masked language modelling (MLM).
# here we dont want to utlilize all of the data in masking becuase we will be deviating from the groundtruth, so first we will define a Bernauli_true_false function whcih will generate
# true or false based on the probability given as input.
# we want to utilize around 20% data for masking, with in which there is 50 percent chance the token will be masked and 50% chance token will be unchanged.
# we will wither replace the  bert label with the true token for the masked input token or with some random token for better training
# Lets ROll!

In [9]:
#The Masking function applies BERT's MLM strategy, deciding whether each token in a sequence should be masked, left unchanged, or replaced with a random token.
# This process is essential for training the model to predict masked words based on their context.

#First, define a function that returns random 0/1 from bernouli distribution for random sampling.

def bernoulli_true_false(p):
    # Create a Bernoulli distribution with probability p
    bernoulli_dist = torch.distributions.Bernoulli(torch.tensor([p]))
    # Sample from this distribution and convert 1 to True and 0 to False
    return bernoulli_dist.sample().item() == 1

In [10]:
# define the masking function

def Masking(token):
    # Decide whether to mask this token (20% chance)
    mask = bernoulli_true_false(0.2)

    # If mask is False, immediately return with '[PAD]' label
    if not mask:
        return token, '[PAD]'

    # If mask is True, proceed with further operations
    # Randomly decide on an operation (50% chance each)
    random_opp = bernoulli_true_false(0.5)
    random_swich = bernoulli_true_false(0.5)

    # Case 1: If mask, random_opp, and random_swich are True
    if mask and random_opp and random_swich:
        # Replace the token with '[MASK]' and set label to a random token
        mask_label = index_to_en(torch.randint(0, VOCAB_SIZE, (1,)))
        token_ = '[MASK]'

    # Case 2: If mask and random_opp are True, but random_swich is False
    elif mask and random_opp and not random_swich:
        # Leave the token unchanged and set label to the same token
        token_ = token
        mask_label = token

    # Case 3: If mask is True, but random_opp is False
    else:
        # Replace the token with '[MASK]' and set label to the original token
        token_ = '[MASK]'
        mask_label = token

    return token_, mask_label

In [11]:
# lets check the masking for a sample

torch.manual_seed(100)
for l in range(20):
  token="apple"
  token_,label=Masking(token)
  if token==token_ and label=="[PAD]":
    print(token_,label,f"\t Actual token *{token}* is left unchanged")
  elif token_=="[MASK]" and label==token:
    print(token_,label,f"\t Actual token *{token}* is masked with '{token_}'")
  else:
    print(token_,label,f"\t Actual token *{token}* is replaced with random token #{label}#")

[MASK] apple 	 Actual token *apple* is masked with '[MASK]'
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
[MASK] whored 	 Actual token *apple* is replaced with random token #whored#
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
[MASK] apple 	 Actual token *apple* is masked with '[MASK]'
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PAD] 	 Actual token *apple* is left unchanged
apple [PA

In [12]:
# now lets prepare data for MLM
#prepare_for_mlm prepares tokenized text for MLM training by applying the masking strategy.
# It returns sequences of masked tokens along with their corresponding labels, optionally including the original (raw) tokens for reference.

def prepare_for_mlm(tokens, include_raw_tokens=False):
    """
    Prepares tokenized text for BERT's Masked Language Model (MLM) training.

    """
    bert_input = []  # List to store sentences processed for BERT's MLM
    bert_label = []  # List to store labels for each token (mask, random, or unchanged)
    raw_tokens_list = []  # List to store raw tokens if needed
    current_bert_input = []
    current_bert_label = []
    current_raw_tokens = []

    for token in tokens:
        # Apply BERT's MLM masking strategy to the token
        masked_token, mask_label = Masking(token)

        # Append the processed token and its label to the current sentence and label list
        current_bert_input.append(masked_token)
        current_bert_label.append(mask_label)

        # If raw tokens are to be included, append the original token to the current raw tokens list
        if include_raw_tokens:
            current_raw_tokens.append(token)

        # Check if the token is a sentence delimiter (., ?, !)
        if token in ['.', '?', '!']:
            # If current sentence has more than two tokens, consider it a valid sentence
            if len(current_bert_input) > 2:
                bert_input.append(current_bert_input)
                bert_label.append(current_bert_label)
                # If including raw tokens, add the current list of raw tokens to the raw tokens list
                if include_raw_tokens:
                    raw_tokens_list.append(current_raw_tokens)

                # Reset the lists for the next sentence
                current_bert_input = []
                current_bert_label = []
                current_raw_tokens = []
            else:
                # If the current sentence is too short, discard it and reset lists
                current_bert_input = []
                current_bert_label = []
                current_raw_tokens = []

    # Add any remaining tokens as a sentence if there are any
    if current_bert_input:
        bert_input.append(current_bert_input)
        bert_label.append(current_bert_label)
        if include_raw_tokens:
            raw_tokens_list.append(current_raw_tokens)

    # Return the prepared lists for BERT's MLM training
    return (bert_input, bert_label, raw_tokens_list) if include_raw_tokens else (bert_input, bert_label)

In [13]:
# lets check!

torch.manual_seed(100)
original_input="The sun sets behind the distant mountains."
tokens=tokenizer(original_input)
bert_input, bert_label= prepare_for_mlm(tokens, include_raw_tokens=False)
print("Without raw tokens: \t ","\n \t original_input is: \t ", original_input,"\n \t bert_input is: \t ", bert_input,"\n \t bert_label is: \t ", bert_label)
print("-"*200)
torch.manual_seed(100)
bert_input, bert_label, raw_tokens_list= prepare_for_mlm(tokens, include_raw_tokens=True)
print("With raw tokens: \t ","\n \t original_input is: \t ", original_input,"\n \t bert_input is: \t ", bert_input,"\n \t bert_label is: \t ", bert_label,"\n \t raw_tokens_list is: \t ", raw_tokens_list)

Without raw tokens: 	  
 	 original_input is: 	  The sun sets behind the distant mountains. 
 	 bert_input is: 	  [['[MASK]', 'sun', 'sets', 'behind', 'the', '[MASK]', 'mountains', '.']] 
 	 bert_label is: 	  [['the', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 'whored', '[PAD]', '[PAD]']]
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
With raw tokens: 	  
 	 original_input is: 	  The sun sets behind the distant mountains. 
 	 bert_input is: 	  [['[MASK]', 'sun', 'sets', 'behind', 'the', '[MASK]', 'mountains', '.']] 
 	 bert_label is: 	  [['the', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 'whored', '[PAD]', '[PAD]']] 
 	 raw_tokens_list is: 	  [['the', 'sun', 'sets', 'behind', 'the', 'distant', 'mountains', '.']]


In [14]:
#Now for NSP
#process_for_nsp prepares data for the NSP task by creating pairs of sentences.
# It labels these pairs to indicate whether the second sentence is the subsequent sentence in the original text, facilitating the model's learning of sentence relationships.

#it will either choose counsecutive sentences as and say isnext =1 (true), or choose to random sentences and set isnext as 0 for that particular sample

def process_for_nsp(input_sentences, input_masked_labels):
    """
    Prepares data for Next Sentence Prediction (NSP) task in BERT training.

    Args:
    input_sentences (list): List of tokenized sentences.
    input_masked_labels (list): Corresponding list of masked labels for the sentences.

    Returns:
    bert_input (list): List of sentence pairs for BERT input.
    bert_label (list): List of masked labels for the sentence pairs.
    is_next (list): Binary label list where 1 indicates 'next sentence' and 0 indicates 'not next sentence'.
    """
    if len(input_sentences) < 2:
       raise ValueError("must have two same number of items.")


    # Verify that both input lists are of the same length and have a sufficient number of sentences
    if len(input_sentences) != len(input_masked_labels):
        raise ValueError("Both lists must have the same number of items.")

    bert_input = []
    bert_label = []
    is_next = []

    available_indices = list(range(len(input_sentences)))

    while len(available_indices) >= 2:
        if random.random() < 0.5:
            # Choose two consecutive sentences to simulate the 'next sentence' scenario
            index = random.choice(available_indices[:-1])  # Exclude the last index
            # append list and add  '[CLS]' and  '[SEP]' tokens
            bert_input.append([['[CLS]']+input_sentences[index]+ ['[SEP]'],input_sentences[index + 1]+ ['[SEP]']])
            bert_label.append([['[PAD]']+input_masked_labels[index]+['[PAD]'], input_masked_labels[index + 1]+ ['[PAD]']])
            is_next.append(1)  # Label 1 indicates these sentences are consecutive

            # Remove the used indices
            available_indices.remove(index)
            if index + 1 in available_indices:
                available_indices.remove(index + 1)
        else:
            # Choose two random distinct sentences to simulate the 'not next sentence' scenario
            indices = random.sample(available_indices, 2)
            bert_input.append([['[CLS]']+input_sentences[indices[0]]+['[SEP]'],input_sentences[indices[1]]+ ['[SEP]']])
            bert_label.append([['[PAD]']+input_masked_labels[indices[0]]+['[PAD]'], input_masked_labels[indices[1]]+['[PAD]']])
            is_next.append(0)  # Label 0 indicates these sentences are not consecutive

            # Remove the used indices
            available_indices.remove(indices[0])
            available_indices.remove(indices[1])



    return bert_input, bert_label, is_next

In [15]:
#lets check

#flatten the tensor
flatten = lambda l: [item for sublist in l for item in sublist]
# Sample input sentences
input_sentences = [["i", "love", "apples"], ["she", "enjoys", "reading", "books"], ["he", "likes", "playing", "guitar"]]
# Create masked labels for the sentences
input_masked_labels=[]
for sentence in input_sentences:
  _, current_masked_label= prepare_for_mlm(sentence, include_raw_tokens=False)
  print("CLM: ",current_masked_label )
  print("flatten: ",flatten(current_masked_label) )

  input_masked_labels.append(flatten(current_masked_label))
# Create NSP pairs and labels
random.seed(100)
bert_input, bert_label, is_next = process_for_nsp(input_sentences, input_masked_labels)

# Print the output
print("BERT Input:")
for pair in bert_input:
    print(pair)
print("BERT Label:")
for pair in bert_label:
    print(pair)
print("Is Next: ", is_next)
print("-"*200)
random.seed(1000)
bert_input, bert_label, is_next = process_for_nsp(input_sentences, input_masked_labels)

# Print the output
print("BERT Input:")
for pair in bert_input:
    print(pair)
print("BERT Label:")
for pair in bert_label:
    print(pair)
print("Is Next: ", is_next)


CLM:  [['[PAD]', '[PAD]', '[PAD]']]
flatten:  ['[PAD]', '[PAD]', '[PAD]']
CLM:  [['[PAD]', '[PAD]', '[PAD]', '[PAD]']]
flatten:  ['[PAD]', '[PAD]', '[PAD]', '[PAD]']
CLM:  [['he', '[PAD]', '[PAD]', '[PAD]']]
flatten:  ['he', '[PAD]', '[PAD]', '[PAD]']
BERT Input:
[['[CLS]', 'she', 'enjoys', 'reading', 'books', '[SEP]'], ['he', 'likes', 'playing', 'guitar', '[SEP]']]
BERT Label:
[['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['he', '[PAD]', '[PAD]', '[PAD]', '[PAD]']]
Is Next:  [1]
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
BERT Input:
[['[CLS]', 'he', 'likes', 'playing', 'guitar', '[SEP]'], ['i', 'love', 'apples', '[SEP]']]
BERT Label:
[['[PAD]', 'he', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[PAD]', '[PAD]', '[PAD]', '[PAD]']]
Is Next:  [0]


In [16]:
#Finalize the bert input

#prepare_bert_final_inputs consolidates the prepared data for MLM and NSP into a format suitable for BERT training,
#including converting tokens to indices, padding sequences for uniform length, and generating segment labels to distinguish between pairs of sentences.
#This function is the final step in preparing data for BERT, ensuring it is in the correct format for effective model training.


def prepare_bert_final_inputs(bert_inputs, bert_labels, is_nexts,to_tenor=True):
    """
    Prepare the final input lists for BERT training.
    """
    def zero_pad_list_pair(pair_, pad='[PAD]'):
        pair=deepcopy(pair_)
        max_len = max(len(pair[0]), len(pair[1]))
        #append [PAD] to each sentence in the pair till the maximum length reaches
        pair[0].extend([pad] * (max_len - len(pair[0])))
        pair[1].extend([pad] * (max_len - len(pair[1])))
        return pair[0], pair[1]

    #flatten the tensor
    flatten = lambda l: [item for sublist in l for item in sublist]
    #transform tokens to vocab indices
    tokens_to_index=lambda tokens: [vocab[token] for token in tokens]

    bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final = [], [], [], []

    for bert_input, bert_label,is_next in zip(bert_inputs, bert_labels,is_nexts):
        # Create segment labels for each pair of sentences
        segment_label = [[1] * len(bert_input[0]), [2] * len(bert_input[1])]

        # Zero-pad the bert_input and bert_label and segment_label
        bert_input_padded = zero_pad_list_pair(bert_input)
        bert_label_padded = zero_pad_list_pair(bert_label)
        segment_label_padded = zero_pad_list_pair(segment_label,pad=0)

        #convert to tensors
        if to_tenor:

            # Flatten the padded inputs and labels, transform tokens to their corresponding vocab indices, and convert them to tensors
            bert_inputs_final.append(torch.tensor(tokens_to_index(flatten(bert_input_padded)),dtype=torch.int64))
            #bert_labels_final.append(torch.tensor(tokens_to_index(flatten(bert_label_padded)),dtype=torch.int64))
            bert_labels_final.append(torch.tensor(tokens_to_index(flatten(bert_label_padded)),dtype=torch.int64))
            segment_labels_final.append(torch.tensor(flatten(segment_label_padded),dtype=torch.int64))
            is_nexts_final.append(is_next)

        else:
          # Flatten the padded inputs and labels
            bert_inputs_final.append(flatten(bert_input_padded))
            bert_labels_final.append(flatten(bert_label_padded))
            segment_labels_final.append(flatten(segment_label_padded))
            is_nexts_final.append(is_next)

    return bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final


In [17]:
# lets check using the bert_input, bert_label and is_next from previous example:

bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final=prepare_bert_final_inputs(bert_input, bert_label, is_next,to_tenor=True)
torch.set_printoptions(linewidth=10000)# this assures that whole output is printed in one line
print("input:\t\t",bert_input,"\ninputs_final:\t",bert_inputs_final,"\nbert labels final:\t",bert_labels_final,"\nsegment labels final:\t",segment_labels_final,"\nis nexts final:\t",is_nexts_final)

input:		 [[['[CLS]', 'he', 'likes', 'playing', 'guitar', '[SEP]'], ['i', 'love', 'apples', '[SEP]']]] 
inputs_final:	 [tensor([    1,    33,  1155,   404,  4833,     2,    16,   123, 14227,     2,     0,     0])] 
bert labels final:	 [tensor([ 0, 33,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])] 
segment labels final:	 [tensor([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0])] 
is nexts final:	 [0]


In [18]:
#Sentences are zero-padded and each token is mapped to its vocab index([CLS]>>1, he>>33, ..., [SEP]>>2,[PAD]>>0])

In [19]:
#Mask labels are also padded and mapped to vocab indices. In this case, all tokens are **unchanged** except the token, `he` which is masked:

print("input:\t\t",bert_input,"\nmask_label:\t",bert_label, "\nlabels_final: \t",bert_labels_final)


input:		 [[['[CLS]', 'he', 'likes', 'playing', 'guitar', '[SEP]'], ['i', 'love', 'apples', '[SEP]']]] 
mask_label:	 [[['[PAD]', 'he', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[PAD]', '[PAD]', '[PAD]', '[PAD]']]] 
labels_final: 	 [tensor([ 0, 33,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])]


In [20]:
#Finally, segment labels are created, where tokens of the first sentence are labeled with 1, tokens of the second sentence are labeled with 2 and zero-paddings are labeled with 0.


In [21]:
print("\ninputs_final:\t",bert_inputs_final,"\nsegment_labels:\t",segment_labels_final)


inputs_final:	 [tensor([    1,    33,  1155,   404,  4833,     2,    16,   123, 14227,     2,     0,     0])] 
segment_labels:	 [tensor([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0])]


In [22]:
# so finally, we will need this data as csv file to use it as input for the BERT training,
# this is a long process might take 2-3 hrs!

#A CSV file is created to store the data set prepared for BERT training and testing.
#Each row contains the original text, BERT inputs, labels, segment labels, and the NSP task label.

#The data from the IMDB data set is tokenized, processed for MLM, and then for NSP.
#The results are formatted and written to the CSV file, providing a comprehensive data set for BERT model training.

#This process is critical for ensuring the data is in the right format for effective training of BERT on the IMDB data set,
#focusing on understanding text context and relationships between sentences.

In [25]:
next(iter(train_iter))

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

In [30]:
# in order to save the time for computations, the following code has a counter variable it will stop automatically after 100 counts
# to preprocess the entire data set remove the counter

In [31]:
counter =0

csv_file_path ='train_bert_data_new_sample.csv'
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(['Original Text', 'BERT Input', 'BERT Label', 'Segment Label', 'Is Next'])


    # Wrap train_iter with tqdm for a progress bar
    for n, (_, sample) in enumerate(tqdm(train_iter, desc="Processing samples")):
        if counter==101:
              break
        print(counter)
        # Tokenize the sample input
        tokens = tokenizer(sample)
        # Create MLM inputs and labels
        bert_input, bert_label = prepare_for_mlm(tokens, include_raw_tokens=False)
        if len(bert_input) < 2:
            continue
        # Create NSP pairs, token labels, and is_next label
        bert_inputs, bert_labels, is_nexts = process_for_nsp(bert_input, bert_label)
        # add zero-paddings, map tokens to vocab indices and create segment labels
        bert_inputs, bert_labels, segment_labels, is_nexts = prepare_bert_final_inputs(bert_inputs, bert_labels, is_nexts)
        # convert tensors to lists, convert lists to JSON-formatted strings
        for bert_input, bert_label, segment_label, is_next in zip(bert_inputs, bert_labels, segment_labels, is_nexts):
            bert_input_str = json.dumps(bert_input.tolist())
            bert_label_str = json.dumps(bert_label.tolist())
            segment_label_str = ','.join(map(str, segment_label.tolist()))
            # Write the data to a CSV file row-by-row
            csv_writer.writerow([sample, bert_input_str, bert_label_str, segment_label_str, is_next])



        counter+=1


Processing samples: 0it [00:00, ?it/s]

0


Processing samples: 1it [00:00,  2.43it/s]

1


Processing samples: 2it [00:00,  2.30it/s]

2


Processing samples: 4it [00:01,  3.68it/s]

3
4


Processing samples: 6it [00:01,  4.11it/s]

5
6


Processing samples: 7it [00:01,  4.74it/s]

7


Processing samples: 8it [00:02,  4.29it/s]

8


Processing samples: 9it [00:02,  2.95it/s]

9


Processing samples: 10it [00:03,  3.07it/s]

10


Processing samples: 12it [00:03,  3.46it/s]

11
12


Processing samples: 13it [00:03,  3.75it/s]

13


Processing samples: 14it [00:04,  3.63it/s]

14


Processing samples: 15it [00:04,  2.97it/s]

15


Processing samples: 16it [00:04,  3.09it/s]

16
17


Processing samples: 19it [00:06,  2.77it/s]

18
19


Processing samples: 20it [00:06,  3.07it/s]

20


Processing samples: 22it [00:06,  3.72it/s]

21
22


Processing samples: 23it [00:07,  3.72it/s]

23


Processing samples: 24it [00:07,  3.56it/s]

24


Processing samples: 26it [00:07,  4.41it/s]

25
26


Processing samples: 28it [00:08,  4.43it/s]

27
28


Processing samples: 29it [00:08,  4.27it/s]

29


Processing samples: 30it [00:08,  3.54it/s]

30


Processing samples: 32it [00:09,  4.11it/s]

31
32


Processing samples: 34it [00:09,  5.13it/s]

33
34


Processing samples: 37it [00:10,  4.25it/s]

35
36
37


Processing samples: 38it [00:10,  4.01it/s]

38


Processing samples: 40it [00:11,  4.54it/s]

39
40


Processing samples: 42it [00:11,  4.63it/s]

41
42


Processing samples: 43it [00:13,  1.46it/s]

43


Processing samples: 44it [00:13,  1.76it/s]

44


Processing samples: 46it [00:14,  2.07it/s]

45
46


Processing samples: 48it [00:14,  3.14it/s]

47
48


Processing samples: 50it [00:15,  4.22it/s]

49
50


Processing samples: 52it [00:15,  4.41it/s]

51
52


Processing samples: 53it [00:16,  2.89it/s]

53


Processing samples: 54it [00:16,  2.58it/s]

54


Processing samples: 55it [00:17,  2.56it/s]

55


Processing samples: 56it [00:17,  2.49it/s]

56


Processing samples: 58it [00:17,  3.52it/s]

57
58
59


Processing samples: 60it [00:18,  5.22it/s]

60


Processing samples: 61it [00:18,  4.42it/s]

61


Processing samples: 62it [00:18,  4.52it/s]

62


Processing samples: 63it [00:18,  4.48it/s]

63


Processing samples: 65it [00:19,  4.49it/s]

64
65


Processing samples: 66it [00:19,  3.60it/s]

66


Processing samples: 68it [00:20,  3.88it/s]

67
68


Processing samples: 69it [00:20,  4.47it/s]

69


Processing samples: 71it [00:21,  2.91it/s]

70
71
72


Processing samples: 74it [00:21,  4.45it/s]

73
74


Processing samples: 76it [00:22,  4.34it/s]

75
76


Processing samples: 78it [00:22,  4.76it/s]

77
78
79


Processing samples: 80it [00:23,  3.60it/s]

80


Processing samples: 81it [00:25,  1.61it/s]

81


Processing samples: 82it [00:25,  1.91it/s]

82


Processing samples: 83it [00:26,  1.80it/s]

83


Processing samples: 84it [00:26,  2.10it/s]

84


Processing samples: 85it [00:26,  2.27it/s]

85


Processing samples: 86it [00:26,  2.62it/s]

86


Processing samples: 87it [00:27,  2.74it/s]

87


Processing samples: 88it [00:28,  2.08it/s]

88


Processing samples: 89it [00:28,  2.21it/s]

89


Processing samples: 90it [00:29,  2.04it/s]

90


Processing samples: 91it [00:29,  2.12it/s]

91


Processing samples: 92it [00:29,  2.50it/s]

92


Processing samples: 95it [00:31,  2.68it/s]

93
94
95


Processing samples: 96it [00:31,  2.19it/s]

96


Processing samples: 98it [00:32,  2.76it/s]

97
97


Processing samples: 99it [00:32,  2.41it/s]

98


Processing samples: 101it [00:33,  3.18it/s]

99
100


Processing samples: 102it [00:33,  3.04it/s]


In [32]:
# in the case above we have developed vocab manually from scratch, we can also use
#  Hugging Face's pretrained BERT tokenizer  from transformers library to which is pre configured with vocabulary:

from transformers import BertTokenizer

# Load a pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [33]:
def yield_tokens(data_iter):
    for _, data_sample in data_iter:
        tokens = tokenizer(data_sample, return_tensors='pt', truncation=True, max_length=512)['input_ids'][0]
        yield tokens.tolist()


In [34]:
from torchtext.data.functional import to_map_style_dataset
from torchtext.datasets import IMDB

# Define special symbols and indices
PAD_IDX, CLS_IDX, SEP_IDX, MASK_IDX, UNK_IDX = tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.mask_token_id, tokenizer.unk_token_id
special_symbols = ['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']

# Load IMDB dataset
train_iter, test_iter = IMDB(split=('train', 'test'))

# Convert to map-style datasets to be compatible with transformers' tokenizers
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Since you are using a pre-trained tokenizer, you don't need to build the vocab from scratch.
# Instead, you can directly use the tokenizer's vocab.
VOCAB_SIZE = len(tokenizer)

print("Vocabulary Size:", VOCAB_SIZE)

Vocabulary Size: 30522
