In [1]:
import warnings

# The filter must be registered BEFORE `transformers` is imported: a filter added
# after the import cannot silence a warning that the import itself emits.
# It is limited to one message so that unrelated warnings (deprecations, training
# issues, ...) stay visible.
# NOTE(review): presumably the warning to hide is tqdm's "IProgress not found"
# notice whose source line shows up in this cell's output - confirm.
warnings.filterwarnings("ignore", message="IProgress not found")

from transformers import AutoTokenizer, AutoModelForMaskedLM

# Checkpoint identifier shared by the tokenizer and the model so both always match.
BASE_CHECKPOINT = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(BASE_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import pandas as pd
import numpy as np
import torch
# Raw spreadsheet with one (user story, description) pair per row.
dataset = pd.read_excel('final.xlsx')

# Empty spreadsheet cells are loaded as NaN. Drop rows where either text is
# missing so that both lists stay the same length and aligned pair by pair.
paired_rows = dataset.dropna(subset=["User Story", "Description"])

# Cast to str so that cells read as numbers still reach the tokenizer as text.
input_texts = paired_rows["User Story"].astype(str).tolist()
output_texts = paired_rows["Description"].astype(str).tolist()

In [3]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Inputs and outputs are encoded with exactly the same settings: every sequence
# is padded/truncated to 512 tokens and returned as PyTorch tensors.
encode_options = dict(
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

tokenized_inputs = tokenizer(input_texts, **encode_options)

tokenized_outputs = tokenizer(output_texts, **encode_options)

In [4]:
# Sanity check: display the encoded user stories (the output below lists
# input_ids, token_type_ids and attention_mask).
tokenized_inputs

{'input_ids': tensor([[ 101, 1045, 2215,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2215,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [5]:
class CustomDataset(torch.utils.data.Dataset):
    """Pairs tokenized inputs with tokenized outputs for supervised training.

    Each item exposes the input encoding (``input_ids``, ``token_type_ids``,
    ``attention_mask``) plus ``labels`` taken from the output encoding.

    Args:
        inputs: Mapping with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``, each indexable by example.
        outputs: Mapping with ``input_ids`` and, optionally,
            ``attention_mask``, each indexable by example.

    Raises:
        ValueError: If ``inputs`` and ``outputs`` hold a different number of
            examples.
    """

    # Label value skipped by the loss (default ``ignore_index`` of
    # ``torch.nn.CrossEntropyLoss``); used so padding does not contribute.
    IGNORE_INDEX = -100

    def __init__(self, inputs, outputs):
        n_inputs = len(inputs['input_ids'])
        n_outputs = len(outputs['input_ids'])
        # Fail early instead of hitting an IndexError (or silently dropping
        # examples) in the middle of training.
        if n_inputs != n_outputs:
            raise ValueError(
                f"inputs and outputs must contain the same number of examples "
                f"(got {n_inputs} and {n_outputs})"
            )
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        """Return the number of examples (equal for inputs and outputs)."""
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """Return the training example at position ``idx`` as a dict."""
        labels = self.outputs['input_ids'][idx]
        if 'attention_mask' in self.outputs:
            # Replace label ids at padded positions with IGNORE_INDEX.
            # masked_fill is out-of-place, so the stored encoding is untouched.
            padding = torch.as_tensor(self.outputs['attention_mask'][idx]) == 0
            labels = torch.as_tensor(labels).masked_fill(padding, self.IGNORE_INDEX)

        return {
            'input_ids': self.inputs['input_ids'][idx],
            'token_type_ids': self.inputs['token_type_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': labels,
        }

# Training set: encoded user stories paired with their encoded descriptions.
dataset = CustomDataset(tokenized_inputs, tokenized_outputs)

In [7]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: 3 epochs with 4 examples per device and step;
# checkpoints go to ./custom-model-bert using save_steps=1000 and
# save_total_limit=2.
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    output_dir="./custom-model-bert",
    save_total_limit=2,
    save_steps=1000,
)

# No evaluation dataset is configured; only the training set is passed.
trainer = Trainer(train_dataset=dataset, args=training_args, model=model)

trainer.train()



In [None]:
# Persist the model so it can be reloaded later with from_pretrained.
model.save_pretrained(save_directory="./custom-model-test-case")

In [None]:
# Store the tokenizer files in their own directory next to the saved model.
tokenizer.save_pretrained(save_directory='./test-case-model-tokenizer')

('./muril-model-tokenizer\\tokenizer_config.json',
 './muril-model-tokenizer\\special_tokens_map.json',
 './muril-model-tokenizer\\vocab.txt',
 './muril-model-tokenizer\\added_tokens.json',
 './muril-model-tokenizer\\tokenizer.json')

In [None]:
from transformers import pipeline

def generate_response(user_question, model, tokenizer, **generate_kwargs):
    """Generate a text response for ``user_question`` with ``model``.

    Args:
        user_question: Prompt text; it is truncated to 256 tokens when encoded.
        model: Object exposing ``generate(input_ids, **kwargs)``. If it has a
            ``device`` attribute, the encoded prompt is moved to that device.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        **generate_kwargs: Optional overrides for the generation settings
            (defaults: ``max_length=50``, ``num_beams=5``,
            ``length_penalty=0.6``, ``no_repeat_ngram_size=2``). Extra keys
            are forwarded to ``model.generate`` unchanged.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    input_ids = tokenizer.encode(user_question, return_tensors="pt", max_length=256, truncation=True)

    # Keep the prompt on the same device as the model; a model that was moved
    # to an accelerator would otherwise fail with a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    generation_options = {
        "max_length": 50,
        "num_beams": 5,
        "length_penalty": 0.6,
        "no_repeat_ngram_size": 2,
    }
    generation_options.update(generate_kwargs)

    # Inference only: no autograd graph is needed while generating.
    with torch.no_grad():
        output_ids = model.generate(input_ids, **generation_options)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Reload the saved tokenizer and model from disk.
# NOTE(review): this loads a masked-LM head and then calls .generate() on it;
# confirm the installed transformers version supports generation for this class.
custom_tokenizer = AutoTokenizer.from_pretrained("test-case-model-tokenizer")
custom_model = AutoModelForMaskedLM.from_pretrained("./custom-model-test-case")

# Smoke test on a single sample user story.
user_question = " I want to move on to round 3 of the Help page edits, so that I can get approvals from leadership."
chatbot_response = generate_response(
    tokenizer=custom_tokenizer,
    model=custom_model,
    user_question=user_question,
)
print(chatbot_response)