<a href="https://colab.research.google.com/github/Daksh024/NSP/blob/Colab/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.5 MB/s[0m eta [36m0:00:0

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForMaskedLM, AdamW

# The tokenizer and the masked-LM model are both built from the same
# pre-trained checkpoint name (tokenizer first, then the model).
model_name = "bert-base-multilingual-cased"
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertForMaskedLM.from_pretrained(model_name),
)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:

# Read the text corpus into a list of lines (line endings are kept);
# tokenization happens in a later cell.
corpus_file_path = "/content/drive/MyDrive/tinyCorpus.txt"
with open(corpus_file_path, mode="r", encoding="utf-8") as corpus_file:
    lines = list(corpus_file)


In [None]:

# Build (token, following token) pairs from every corpus line.
# Pairs never span two lines: each line is tokenized on its own.
data = []
for line in lines:
    tokens = tokenizer.tokenize(line)
    # zip against the list shifted by one to pair each token with its successor
    data.extend(zip(tokens, tokens[1:]))


In [None]:

# Define custom dataset
class NextWordDataset(Dataset):
    """Dataset over (current token, next token) pairs.

    Args:
        data: Indexable collection of 2-item (input token, target token) entries.
        tokenizer: Object exposing ``convert_tokens_to_ids``; used to map each
            token of a pair to its vocabulary id.
    """

    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Number of token pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input id tensor, target id tensor)`` for the pair at ``idx``."""
        current_token, next_token = self.data[idx]
        lookup = self.tokenizer.convert_tokens_to_ids
        current_id = lookup(current_token)
        next_id = lookup(next_token)
        return torch.tensor(current_id), torch.tensor(next_id)

# Create DataLoader for the dataset
train_dataset = NextWordDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define optimizer and loss function.
# `transformers.AdamW` is deprecated in favour of `torch.optim.AdamW`.
# `eps` and `weight_decay` are passed explicitly with the values the deprecated
# class used by default, so the optimisation hyperparameters do not change
# (torch's own defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)
loss_fn = torch.nn.CrossEntropyLoss()





In [None]:

# Fine-tuning loop
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# `from_pretrained` hands the model back in evaluation mode (dropout disabled);
# switch to training mode for the duration of the fine-tuning.
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, target_ids = batch

        # Move data to the same device as the model
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)

        # Each dataset item is a single token id, so a collated batch is 1-D
        # with shape (batch,). The model expects (batch, sequence_length):
        # add the sequence axis at position 1 so every sample is its own
        # length-1 sequence. (unsqueeze(0) would instead feed the whole batch
        # as ONE sequence of unrelated, shuffled tokens.)
        if input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(1)

        # First element of the model output holds the vocabulary logits
        outputs = model(input_ids)[0]

        # Flatten logits to (N, vocab_size) and targets to (N,) for the loss
        outputs_flat = outputs.view(-1, outputs.shape[-1])
        target_ids_flat = target_ids.view(-1)

        loss = loss_fn(outputs_flat, target_ids_flat)
        loss.backward()
        optimizer.step()

# Leave the model in evaluation mode again, as it was before training.
model.eval()



In [None]:
# Write the model weights and the tokenizer files into the same directory.
save_dir = "fine_tuned_bert_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('fine_tuned_bert_model/tokenizer_config.json',
 'fine_tuned_bert_model/special_tokens_map.json',
 'fine_tuned_bert_model/vocab.txt',
 'fine_tuned_bert_model/added_tokens.json')

In [None]:
# Re-create the masked-LM model from the locally saved checkpoint directory.
fine_tuned_dir = "fine_tuned_bert_model"
fine_tuned_model = BertForMaskedLM.from_pretrained(fine_tuned_dir)



In [None]:
# Single-step prediction: encode the prompt (special tokens included), run the
# fine-tuned model once and take the highest-scoring vocabulary entry at the
# last position of the sequence.
input_text = "मैं"
input_ids = tokenizer.encode(input_text, add_special_tokens=True)

with torch.no_grad():
    # Wrap the id list in an outer list to get a (1, sequence_length) batch
    outputs = fine_tuned_model(torch.tensor([input_ids]))
    print(outputs.logits)
    last_position_logits = outputs.logits[0, -1]
    predicted_token_id = last_position_logits.argmax().item()
    predicted_word = tokenizer.convert_ids_to_tokens(predicted_token_id)

print("Predicted word:", predicted_word)

tensor([[[-6.7031, -7.6373, -7.1604,  ..., -6.2894, -5.8818, -5.5827],
         [-8.3172, -8.8643, -8.5447,  ..., -7.9228, -7.1353, -7.3976],
         [-7.6103, -7.9862, -7.6492,  ..., -7.9528, -7.1725, -7.0978],
         [-8.7623, -9.6253, -8.5179,  ..., -7.8879, -7.0320, -7.7574]]])
Predicted word: प


In [None]:
# Greedy generation: starting from the encoded prompt, repeatedly append the
# top-scoring token at the last position. Stops after 50 appended tokens or as
# soon as the appended token is the tokenizer's separator token.
input_text = "मैं"
input_ids = tokenizer.encode(input_text, add_special_tokens=True)

generated_ids = list(input_ids)  # work on a copy so the prompt ids stay untouched
with torch.no_grad():
    for _ in range(50):
        # Outer list adds the batch axis: shape (1, current_length)
        input_tensor = torch.tensor([generated_ids])
        outputs = fine_tuned_model(input_tensor)
        predicted_token_id = outputs.logits[0, -1].argmax().item()
        generated_ids.append(predicted_token_id)
        if predicted_token_id == tokenizer.sep_token_id:
            break

# Map ids back to vocabulary tokens and show them space-separated
generated_words = tokenizer.convert_ids_to_tokens(generated_ids)
generated_sequence = " ".join(generated_words)
print("Generated sequence:", generated_sequence)

Generated sequence: [CLS] म ##ैं [SEP] प ##ि ##छ ##ली ##ली तरह तरह न ##ज ##ट से पी ##ड़ ##क एक बार बार क ##सा ##ब ##ंद ##ाज ##ाज ##ा नहीं प ##ि ##छ ##ली ने कहा , ब ##ज ##ट के वि ##धा ##यक ##यक व व व व व व ##ृ ##द ##े हैं
