In [49]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [50]:
import torch
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline)

In [78]:
import pandas as pd

# Load the Excel file
file_path = '/content/FAQ_list_PAN_TAN_All.xlsx'
df = pd.read_excel(file_path)

# Initialize an empty list to store the cleaned data
cleaned_data = []

# Iterate over the rows in the DataFrame
for index, row in df.iterrows():
    questions = row['Question'].split('\n')  # Split the questions if there are multiple in one cell
    answer = row['Answer'].strip()  # Get the corresponding answer

    for question in questions:
        question = question.strip()  # Clean up any extra whitespace
        if question:  # Ensure the question is not empty
            # Append the cleaned question and answer to the list
            cleaned_data.append({'Question': question, 'Answer': answer})

# Convert the list to a DataFrame
cleaned_df = pd.DataFrame(cleaned_data)

# Save the cleaned dataset to a new file
cleaned_df.to_csv('./cleaned_dataset.csv', index=False)

# Reformat the dataset into the model's required format
with open('./formatted_output.txt', 'w', encoding='utf-8') as f:
    # Iterate through each row and format the text consistently
    for index, row in cleaned_df.iterrows():
        question = row['Question'].strip()
        answer = row['Answer'].strip()
        formatted_text = f"Q: {question}\nA: {answer}\n"
        f.write(formatted_text)

In [79]:
from google.colab import files
files.download('/content/formatted_output.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [80]:
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = "openai-community/gpt2",
                                                   quantization_config = BitsAndBytesConfig(load_in_4bit = True, bnb_4bit_compute_dtype = getattr(torch, "float16"), bnb_4bit_quant_type = "nf4"))
model.config.use_cache = False
model.config.pretraining_tp = 1

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = "openai-community/gpt2", trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [82]:
# Load the dataset from the formatted text file
dataset = load_dataset('text', data_files='./formatted_output.txt')

training_arguments = TrainingArguments(output_dir = "./results", per_device_train_batch_size = 4, max_steps = 10000, save_steps=250, logging_steps=100, learning_rate=5e-5)

Generating train split: 0 examples [00:00, ? examples/s]

In [84]:
sft_trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset['train'],
    tokenizer=tokenizer,
    peft_config=LoraConfig(task_type="CAUSAL_LM", r=8, lora_alpha=16, lora_dropout=0.1),
    dataset_text_field="text"
)

Map:   0%|          | 0/1928 [00:00<?, ? examples/s]

In [85]:
sft_trainer.train()

  new_forward = torch.cuda.amp.autocast(dtype=torch.float16)(model_forward_func)
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
100,4.9554
200,4.6052
300,4.3014
400,4.0625
500,4.0358
600,3.916
700,3.9284
800,3.7407
900,3.7325
1000,3.665


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

TrainOutput(global_step=10000, training_loss=3.180506851196289, metrics={'train_runtime': 1162.1461, 'train_samples_per_second': 34.419, 'train_steps_per_second': 8.605, 'total_flos': 569993846575104.0, 'train_loss': 3.180506851196289, 'epoch': 20.75})

In [86]:
user_prompt = "What is AO code?"  #r-8
text_generation_pipeline = pipeline(task = "text-generation", model = model, tokenizer = tokenizer, max_length = 300)
model_answer = text_generation_pipeline(f" {user_prompt} ")
print(model_answer[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  return fn(*args, **kwargs)


 What is AO code? _______ Request a response here on our AO page. ________________________________________________________________ The AO Code is available on Our website. Please email us/email@proteaningap.in for any questions you have.  (For more details about AO, contact the 'Contact Databank' and ask our e-PAN services to list the Income-Statement Status Codes   or e-SRS for more details of any Formal PAN/Non-Formal PAN Formal status  please click on  then submit the PAN Application.  Thank you for sending your PAN Application to __________________________________________________________ https://www.proteanetan.com/en-tinpan/autopoplay.html  URL: https://www.protean-tinpan.com/pan/downloads/pan_final.html
The AO Code ____________________ is available on T-type of e-PAN type forms.  This Code may be downloaded under the Raats tab at  https://www.unonline.com/rattop-unpan/B2.html  and download the Raats Code  from /Penalty/Ticket/Settlement.    Please do not attempt to register any f

In [None]:
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
    do_sample=True,
)

def chatbot():
    print("Chatbot is running... Type 'exit' to stop.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            print("Chatbot: Goodbye!")
            break
        else:
            # Format the input as required by the model
            formatted_input = f" {user_input} Chatbot: "
            # Generate the response
            response = text_generation_pipeline(formatted_input)
            # Print the generated response
            print("Chatbot:", response[0]['generated_text'].replace(formatted_input, "").strip())

# Run the chatbot
chatbot()


Chatbot is running... Type 'exit' to stop.
You: What is Area Code?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: ⚇ 2:0052 to 3rd pk ett: Facade daf. I.R.C." (No data or documents attached to Facade)  https://www.in-addrinfo.ru/openg.htm to check if you are given T&Cs when registering with Facade  (https://www.opentechonline.com/documents/FinTechOnline1.pdf).   http://www.fax.gov.in/FileSender.html for more details).  -    - -   Identification  https://www.opentech.org/services/PAN_IT_Identity#post_Statement.html  - -   Download the Form     https://www.protean-online/pancpc.html   - -
