<a href="https://colab.research.google.com/github/Daksh024/NSP/blob/Colab/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForMaskedLM, AdamW

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:

# Load and tokenize text corpus from a file
corpus_file_path = "/content/drive/MyDrive/tinyCorpus.txt"
with open(corpus_file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()


In [4]:

data = []
for line in lines:
    tokens = tokenizer.tokenize(line)
    for i in range(len(tokens) - 1):
        data.append((tokens[i], tokens[i+1]))


In [5]:

# Define custom dataset
class NextWordDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_token, target_token = self.data[idx]
        input_ids = self.tokenizer.convert_tokens_to_ids(input_token)
        target_token_id = self.tokenizer.convert_tokens_to_ids(target_token)
        return torch.tensor(input_ids), torch.tensor(target_token_id)

# Create DataLoader for the dataset
train_dataset = NextWordDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()





In [7]:

# # Fine-tuning loop
# num_epochs = 10
# for batch in train_dataloader:
#     optimizer.zero_grad()
#     input_ids, target_ids = batch

#     # Ensure input_ids is a 2D tensor
#     input_ids = input_ids.unsqueeze(0) if input_ids.dim() == 1 else input_ids

#     outputs = model(input_ids)[0]
#     loss = loss_fn(outputs.view(-1, outputs.shape[-1]), target_ids)
#     loss.backward()
#     optimizer.step()

import time

num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Start the timer
start_time = time.time()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, target_ids = batch

        # Move data to GPU
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)

        # print(input_ids)

        # Ensure input_ids is a 2D tensor
        input_ids = input_ids.unsqueeze(0) if input_ids.dim() == 1 else input_ids

        outputs = model(input_ids)[0]

        # Flatten both outputs and targets
        outputs_flat = outputs.view(-1, outputs.shape[-1])
        target_ids_flat = target_ids.view(-1)

        loss = loss_fn(outputs_flat, target_ids_flat)
        loss.backward()
        optimizer.step()

end_time = time.time()


In [12]:
elapsed_time = end_time - start_time
print(f"batch took {elapsed_time:.6f} seconds")

batch took 147.466052 seconds


In [8]:
model.save_pretrained("fine_tuned_bert_model")
tokenizer.save_pretrained("fine_tuned_bert_model")

('fine_tuned_bert_model/tokenizer_config.json',
 'fine_tuned_bert_model/special_tokens_map.json',
 'fine_tuned_bert_model/vocab.txt',
 'fine_tuned_bert_model/added_tokens.json')

In [9]:
# Load the fine-tuned model
fine_tuned_model = BertForMaskedLM.from_pretrained("fine_tuned_bert_model")



In [10]:
# # Inference
# input_text = "मैं"
# input_ids = tokenizer.encode(input_text, add_special_tokens=True)
# with torch.no_grad():
#     outputs = fine_tuned_model(torch.tensor(input_ids).unsqueeze(0))
#     predicted_token_id = torch.argmax(outputs[0, -1]).item()
#     predicted_word = tokenizer.convert_ids_to_tokens(predicted_token_id)

# print("Predicted next word:", predicted_word)

# print(outputs.logits)

# Input text
input_text = "मैं"

# Encode input text and perform inference
input_ids = tokenizer.encode(input_text, add_special_tokens=True)
with torch.no_grad():
    outputs = fine_tuned_model(torch.tensor(input_ids).unsqueeze(0))
    print(outputs.logits)
    predicted_token_id = torch.argmax(outputs.logits[0, -1]).item()
    predicted_word = tokenizer.convert_ids_to_tokens(predicted_token_id)

print("Predicted word:", predicted_word)

tensor([[[-6.7139, -7.6789, -6.9048,  ..., -6.3974, -6.1907, -6.0965],
         [-6.9662, -7.6907, -7.6814,  ..., -6.7100, -5.8716, -6.6599],
         [-5.7604, -7.0342, -6.8200,  ..., -6.3531, -5.1109, -5.8730],
         [-6.7318, -7.6566, -7.1837,  ..., -6.7691, -5.6489, -7.0067]]])
Predicted word: स


In [19]:
# Input text
input_text = "मैं"

# Encode input text
input_ids = tokenizer.encode(input_text, add_special_tokens=True)

generated_ids = input_ids.copy()
with torch.no_grad():
    for _ in range(50):
        input_tensor = torch.tensor(generated_ids).unsqueeze(0)
        outputs = fine_tuned_model(input_tensor)
        predicted_token_id = torch.argmax(outputs.logits[0, -1]).item()
        generated_ids.append(predicted_token_id)
        if predicted_token_id == tokenizer.sep_token_id:
            break

# Convert the generated IDs to words
generated_words = tokenizer.convert_ids_to_tokens(generated_ids)

# Print the generated sequence
generated_sequence = " ".join(generated_words)
print("Generated sequence:", generated_sequence)
tokenizer.convert_tokens_to_string(generated_words)

Generated sequence: [CLS] म ##ैं [SEP] स ##ु ##म ##झ ##े की त ##हत ##ा ##उन ##ल , स ##ु ##ध ##्या पर न ##ाग ##रिक ##रिक ##ता है । ये त ##हत ##ा ##उन ##ल में म ##ौ ##जूद एस ##ई दे ##खने के लिए केंद्र ##ीय ##ीय सरकार व ##र ##न ##ून ##ून ##ी


'[CLS] मैं [SEP] सुमझे की तहताउनल , सुध्या पर नागरिकरिकता है । ये तहताउनल में मौजूद एसई देखने के लिए केंद्रीयीय सरकार वरनूनूनी'

In [13]:
from transformers import BertTokenizer

# Load the tokenizer for a multilingual BERT model
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Hindi input text
hindi_text = "मैंने खाना खाया"

# Tokenize the Hindi text
tokens = tokenizer.tokenize(hindi_text)
print(tokens)

# Convert tokens to IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)


['म', '##ैं', '##ने', 'खान', '##ा', 'खा', '##या']
[889, 99007, 13466, 101415, 11208, 64566, 15168]


In [14]:
from transformers import BertTokenizer

# Load the tokenizer for a multilingual BERT model
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# List of token IDs for Hindi words
hindi_word_ids = [tokenizer.convert_tokens_to_ids(token) for token in ['म', '##ैं', '##ने', 'खान', '##ा', 'खा', '##या']]

# Convert token IDs back to words
hindi_words = tokenizer.convert_ids_to_tokens(hindi_word_ids)
print(hindi_words)

['म', '##ैं', '##ने', 'खान', '##ा', 'खा', '##या']


In [17]:
tokenizer.convert_tokens_to_string(['म', '##ैं', '##ने', 'खान', '##ा', 'खा', '##या'])

'मैंने खाना खाया'