<a href="https://colab.research.google.com/github/Daksh024/NSP/blob/master/TrainingBERTMultilinguar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install transformers



In [13]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForMaskedLM, AdamW

# Checkpoint shared by the tokenizer and the masked-LM model.
model_name = "bert-base-multilingual-cased"

# Both objects are loaded from the same checkpoint name; the two loads are independent of each other.
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:

# Read the raw text corpus into a list with one entry per line of the file
# (line terminators are kept). Tokenization happens in a later cell.
corpus_file_path = "/content/drive/MyDrive/tinyCorpus.txt"
with open(corpus_file_path, mode="r", encoding="utf-8") as f:
    lines = list(f)


In [15]:

# Build (current_token, next_token) training pairs from every corpus line.
# Pairs never span two lines: each line is tokenized and paired on its own.
data = []
for line in lines:
    tokens = tokenizer.tokenize(line)
    # zip(tokens, tokens[1:]) walks adjacent tokens: (t0, t1), (t1, t2), ...
    data.extend(zip(tokens, tokens[1:]))


In [16]:

# Dataset of (current token, next token) pairs, served as token-id tensors
class NextWordDataset(Dataset):
    """Map-style dataset over ``(input_token, target_token)`` string pairs.

    Args:
        data: indexable collection of ``(input_token, target_token)`` pairs.
        tokenizer: object exposing ``convert_tokens_to_ids(token)``.

    Each item is a pair of tensors built from the two token ids:
    ``(input_id_tensor, target_id_tensor)``.
    """

    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Return the number of stored token pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Look up pair ``idx`` and convert both tokens to id tensors."""
        current_tok, next_tok = self.data[idx]
        to_id = self.tokenizer.convert_tokens_to_ids
        current_id = to_id(current_tok)
        next_id = to_id(next_tok)
        return torch.tensor(current_id), torch.tensor(next_id)

# Create DataLoader for the dataset (shuffled mini-batches of 32 token pairs)
train_dataset = NextWordDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define optimizer and loss function.
# transformers.AdamW is deprecated in favour of the PyTorch implementation, so
# torch.optim.AdamW is used here. eps and weight_decay are passed explicitly to
# keep the values the transformers implementation used by default
# (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
)
loss_fn = torch.nn.CrossEntropyLoss()





In [None]:

# Fine-tuning loop: teach the model to predict the next token from the current one.
import time

num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# from_pretrained() hands the model back in evaluation mode (dropout disabled);
# switch to training mode before fine-tuning.
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Start the timer
        start_time = time.time()

        optimizer.zero_grad()
        input_ids, target_ids = batch

        # Move data to the training device
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)

        # The dataset yields scalar token ids, so a collated batch has shape
        # (batch,). The model expects (batch, seq_len): add the sequence axis
        # at dim 1 so every sample is its own one-token sequence. Adding it at
        # dim 0 would instead feed the whole shuffled batch as ONE sequence,
        # letting unrelated tokens attend to each other.
        input_ids = input_ids.unsqueeze(1) if input_ids.dim() == 1 else input_ids

        # First element of the model output holds the vocabulary logits,
        # shape (batch, seq_len, vocab_size).
        outputs = model(input_ids)[0]

        # Flatten both outputs and targets so CrossEntropyLoss sees
        # (N, vocab_size) logits against (N,) class indices.
        outputs_flat = outputs.view(-1, outputs.shape[-1])
        target_ids_flat = target_ids.view(-1)

        loss = loss_fn(outputs_flat, target_ids_flat)
        loss.backward()
        optimizer.step()

        # Stop the timer
        end_time = time.time()

        # Calculate the elapsed time
        elapsed_time = end_time - start_time

        print(f"batch took {elapsed_time:.6f} seconds")

# Persist the fine-tuned weights so they can be reloaded later with
# BertForMaskedLM.from_pretrained("fine_tuned_bert").
model.save_pretrained("fine_tuned_bert")




batch took 6.077526 seconds
batch took 3.356697 seconds
batch took 2.659770 seconds
batch took 3.054612 seconds
batch took 3.473520 seconds
batch took 3.271823 seconds
batch took 2.693155 seconds
batch took 3.012774 seconds
batch took 2.855958 seconds
batch took 2.985354 seconds
batch took 3.484536 seconds
batch took 3.865864 seconds


In [None]:
# Number of batches the DataLoader yields per epoch.
len(train_dataloader)

In [None]:
# Load the fine-tuned model.
# Requires that a checkpoint named "fine_tuned_bert" is available, e.g. a local
# directory previously written with model.save_pretrained("fine_tuned_bert").
fine_tuned_model = BertForMaskedLM.from_pretrained("fine_tuned_bert")



In [None]:
# Inference: predict the token that most likely follows `input_text`.
input_text = "मैं"
# The training pairs were built from bare word-piece tokens (no [CLS]/[SEP]),
# so encode the prompt the same way. With special tokens added, the last
# position would be [SEP] rather than the final token of the prompt.
input_ids = tokenizer.encode(input_text, add_special_tokens=False)
with torch.no_grad():
    outputs = fine_tuned_model(torch.tensor(input_ids).unsqueeze(0))
    # The model returns an output object, not a tensor, so it cannot be indexed
    # with [0, -1] directly. Its first element is the logits tensor of shape
    # (batch, seq_len, vocab_size); take batch 0, last position.
    logits = outputs[0]
    predicted_token_id = torch.argmax(logits[0, -1]).item()
    predicted_word = tokenizer.convert_ids_to_tokens(predicted_token_id)

print("Predicted next word:", predicted_word)