In [1]:
import torch
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, AdamW
import os
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load data from text files in the folder
def load_data_from_folder(folder_path):
    data = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            data.append(text)
    return data

In [3]:
# Function to perform MLM on input tensor
def mlm(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)  # Masking 15% of tokens with value greater than 2 (special tokens)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero())
        tensor[i, selection] = 4  # Replace masked tokens with the <mask> token (token_id=4)
    return tensor



In [12]:
# Load your local dataset (text files) from a folder
data_folder = "C:\\Users\\Dell\\Desktop\\Arun\\BERT_pretrain\\pubmed_data\\"
text_data = load_data_from_folder(data_folder)


In [15]:
# Initialize the tokenizer and create MLM input tensors
tokenizer = RobertaTokenizerFast.from_pretrained('microsoft/codebert-base-mlm', model_max_length=512)
random.shuffle(text_data)  # Shuffle the data for better training
input_ids = tokenizer(text_data, padding='max_length', truncation=True, return_tensors='pt')['input_ids']
input_ids_mlm = mlm(input_ids.detach().clone())


Downloading (…)okenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<?, ?B/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.08MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.12MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 150/150 [00:00<?, ?B/s] 
Downloading (…)lve/main/config.json: 100%|██████████| 504/504 [00:00<?, ?B/s] 


In [16]:
# Prepare DataLoader
encodings = {'input_ids': input_ids_mlm, 'attention_mask': (input_ids > 0).int(), 'labels': input_ids}
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

dataset = Dataset(encodings)
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)


In [18]:
# Initialize the model
model = RobertaForMaskedLM.from_pretrained('microsoft/codebert-base-mlm')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


Downloading pytorch_model.bin: 100%|██████████| 501M/501M [00:27<00:00, 17.9MB/s] 
Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [19]:
# Training loop
model.train()
optim = AdamW(model.parameters(), lr=1e-4)
epochs = 2

for epoch in range(epochs):
    for batch in loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()



In [20]:
# Save the trained model and tokenizer
model.save_pretrained('mlm_model')
tokenizer.save_pretrained('mlm_model')

('mlm_model\\tokenizer_config.json',
 'mlm_model\\special_tokens_map.json',
 'mlm_model\\vocab.json',
 'mlm_model\\merges.txt',
 'mlm_model\\added_tokens.json',
 'mlm_model\\tokenizer.json')

In [21]:
# Reload the trained model and tokenizer
model = RobertaForMaskedLM.from_pretrained('mlm_model')
tokenizer = RobertaTokenizerFast.from_pretrained('mlm_model')

In [22]:
# Function to generate masked predictions
def generate_predictions(texts, tokenizer, model, mask_token="[MASK]"):
    inputs = tokenizer(texts, padding='max_length', truncation=True, return_tensors='pt')
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Generate predictions for masked tokens
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_masked_tokens = torch.argmax(logits, dim=2)

    # Decode the predicted masked tokens
    decoded_texts = []
    for i in range(len(texts)):
        text = tokenizer.decode(predicted_masked_tokens[i])
        masked_text = texts[i].replace(mask_token, text)
        decoded_texts.append(masked_text)

    return decoded_texts

In [25]:

# Test the MLM model on new text samples
new_texts = [
    "This is a test sentence. [MASK] is amazing!",
    "A Qualitative [MASK] of Current Practice",
    "I love [MASK] with my friends.",
]

In [26]:
# Generate masked predictions
decoded_texts = generate_predictions(new_texts, tokenizer, model)

# Print the generated masked predictions
for i, text in enumerate(new_texts):
    print("Original Text:", text)
    print("Generated Masked Text:", decoded_texts[i])
    print()

Original Text: This is a test sentence. [MASK] is amazing!
Generated Masked Text: This is a test sentence. 
This is a test sentence. [MASK] is awesome!
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #