In [1]:
# Report the GPU model, driver/CUDA version and current memory usage for this session.
!nvidia-smi

Thu Nov 30 05:56:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install the notebook's dependencies quietly (-qqq) with the progress bar disabled.
# Released packages are pinned to exact versions; transformers, peft and accelerate
# are installed from specific commits of their GitHub repositories.
!pip install -Uqqq pip --progress-bar off
!pip install -qqq bitsandbytes==0.39.0 --progress-bar off
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/transformers.git@e03a9cc --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/peft.git@42a184f --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/accelerate.git@c9fbb71 --progress-bar off
!pip install -qqq datasets==2.12.0 --progress-bar off
!pip install -qqq loralib==0.1.1 --progress-bar off
!pip install -qqq einops==0.6.1 --progress-bar off

In [None]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# Restrict this process to the GPU with index 0.
# NOTE(review): presumably this only takes effect if it runs before any CUDA context
# is created — confirm nothing above initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Prompt for Hugging Face Hub credentials.
# NOTE(review): presumably needed for the push_to_hub call later in the notebook — confirm.
notebook_login()

## Data

Data - https://www.kaggle.com/datasets/saadmakhdoom/ecommerce-faq-chatbot-dataset

In [None]:
# Load the raw FAQ file. JSON text is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's default locale encoding.
with open("/kaggle/input/necdatasetv1/nec_faq.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [None]:
# Preview the raw FAQ entry at index 1 without re-sorting its keys.
pprint(data["questions"][1], sort_dicts=False)

In [None]:
# Preview the raw FAQ entry at index 2 without re-sorting its keys.
pprint(data["questions"][2], sort_dicts=False)

In [None]:
# Preview the raw FAQ entry at index 3 without re-sorting its keys.
pprint(data["questions"][3], sort_dicts=False)

In [None]:
# Write only the list stored under "questions" to dataset.json (relative to the
# current working directory); a later cell re-loads it to build the training dataset.
with open("dataset.json", "w") as f:
    json.dump(data["questions"], f)

In [None]:
# Tabular preview of the first few FAQ entries.
pd.DataFrame(data["questions"]).head()

## Load Falcon Model & Tokenizer

In [None]:
# Hugging Face Hub id of the base checkpoint to fine-tune.
MODEL_NAME = "tiiuae/falcon-7b"

# bitsandbytes quantization settings: 4-bit NF4 weights with double quantization and
# bfloat16 as the compute dtype.
# NOTE(review): the training cell enables fp16 mixed precision while the compute dtype
# here is bfloat16, and the GPU reported at the top of the notebook is a Tesla P100 —
# confirm this dtype combination is intended and supported on that hardware.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model with the quantization config above. device_map="auto" leaves
# device placement to the loader; trust_remote_code=True permits model code shipped
# with the checkpoint to be executed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably because this tokenizer defines no pad token of its own — confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Prints a single line with the trainable parameter count, the total
    parameter count and the trainable percentage. Returns ``None``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # bitsandbytes 4-bit weights (Params4bit) store two quantized values per
        # element, so numel() reports half the logical parameter count. Without this
        # correction the total is undercounted and trainable% is overstated.
        if param.__class__.__name__ == "Params4bit":
            num_params *= 2
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    # Avoid ZeroDivisionError for a model that has no parameters at all.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# Enable gradient checkpointing on the quantized base model, then pass it through
# PEFT's k-bit training preparation; the returned model replaces `model`.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter settings: rank 16, alpha 32, dropout 0.05, bias mode "none", applied to
# the modules named "query_key_value", for a causal-LM task.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared base model with the LoRA adapters and report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

## Build HuggingFace Dataset

In [None]:
from datasets import Dataset, DatasetDict

# Location of the question/answer records written to dataset.json by an earlier cell.
file_path = "/kaggle/working/dataset.json"

# Read the records back, decoding explicitly as UTF-8 rather than with the
# platform's default encoding.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Convert the list of dictionaries into a column-oriented Dataset, keeping only the
# two fields used by the prompt template.
train_dataset = Dataset.from_dict(
    {
        "question": [item["question"] for item in data],
        "answer": [item["answer"] for item in data],
    }
)

# Wrap the Dataset in a DatasetDict with a single 'train' split.
data = DatasetDict({"train": train_dataset})

# `data` is now a DatasetDict (it previously held the raw list of records).
print(data)

In [None]:
# Display the DatasetDict (rich notebook repr of the splits and their columns).
data

In [None]:
# Inspect the first record of the 'train' split.
data["train"][0]

In [None]:
def generate_prompt(data_point):
    """Render one question/answer record as a two-line training prompt.

    Args:
        data_point: Mapping with "question" and "answer" entries.

    Returns:
        The text ``<human>: {question}`` and ``<assistant>: {answer}`` joined by a
        newline, with leading and trailing whitespace stripped from the result.
    """
    human_turn = f'<human>: {data_point["question"]}'
    assistant_turn = f'<assistant>: {data_point["answer"]}'
    return "\n".join((human_turn, assistant_turn)).strip()


def generate_and_tokenize_prompt(data_point):
    """Build the prompt for one record and return its tokenization.

    Args:
        data_point: Mapping with "question" and "answer" entries, as expected by
            ``generate_prompt``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the rendered prompt.
    """
    prompt_text = generate_prompt(data_point)
    # NOTE(review): padding=True is applied to a single string here; presumably that
    # pads only to the string's own length (i.e. no padding) — confirm.
    return tokenizer(prompt_text, padding=True, truncation=True)

In [None]:
# Shuffle the training split with a fixed seed (reproducible order across runs) and
# tokenize every record. Start from `train_dataset` — the same object stored under
# data["train"] — so this cell can be re-run: after the first run `data` is rebound
# to a plain Dataset and data["train"] would no longer resolve to a split.
data = train_dataset.shuffle(seed=42).map(generate_and_tokenize_prompt)

In [None]:
# Display the shuffled, tokenized training Dataset.
data

## Training

In [None]:
# Directory handed to TrainingArguments as output_dir.
OUTPUT_DIR = "experiments"

In [None]:
# Hyperparameters for the LoRA fine-tuning run: batch size 1 with 4 gradient
# accumulation steps, cosine learning-rate schedule, metrics logged every step to
# TensorBoard.
# NOTE(review): both num_train_epochs=1 and max_steps=80 are set; presumably
# max_steps takes precedence — confirm which limit is intended.
# NOTE(review): fp16=True is used here while the quantization config sets a bfloat16
# compute dtype — confirm the combination is intended.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=80,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
)

# Train on the tokenized dataset; the collator is created with mlm=False, i.e. no
# masked-language-modelling objective.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# Turn off use_cache on the model config before training starts.
# NOTE(review): presumably because caching conflicts with gradient checkpointing — confirm.
model.config.use_cache = False
trainer.train()

## Save Trained Model

In [None]:
# Save the fine-tuned PEFT model to the local "trained-model" directory.
model.save_pretrained("trained-model")

In [None]:
# Upload the fine-tuned model to the Hub repository "NachikethD/NECLLM".
# NOTE(review): use_auth_token=True presumably picks up the token stored by
# notebook_login() — confirm.
model.push_to_hub(
    "NachikethD/NECLLM", use_auth_token=True
)

## Load Trained Model

In [None]:
# Hub repository id of the fine-tuned adapter (the same id used by push_to_hub).
PEFT_MODEL = "NachikethD/NECLLM"

# Read the adapter's PEFT config, then reload the base checkpoint recorded in it
# with the same 4-bit quantization settings (bnb_config) used for training.
config = PeftConfig.from_pretrained(PEFT_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer of the base checkpoint, again with EOS reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Attach the fine-tuned adapter from the Hub to the freshly loaded base model.
model = PeftModel.from_pretrained(model, PEFT_MODEL)

## Inference

In [None]:
# Start from the model's own generation defaults and override them in place
# (`generation_config` is the same object as model.generation_config).
generation_config = model.generation_config
generation_config.max_new_tokens = 200
# temperature and top_p are sampling parameters: they only influence decoding when
# sampling is enabled. Unless the checkpoint's defaults already enable it, leaving
# do_sample unset means generate() decodes greedily and ignores both values.
generation_config.do_sample = True
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
# Pad and stop on the tokenizer's end-of-sequence token.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
# Device that tokenized prompts are moved to before generation.
DEVICE = "cuda:0"

In [None]:
def generate_response(question: str) -> str:
    """Generate the assistant's reply to ``question`` with the fine-tuned model.

    The question is wrapped in the same ``<human>: ... / <assistant>:`` template used
    for training, tokenized, moved to ``DEVICE`` and passed to ``model.generate`` with
    the module-level ``generation_config``.

    Args:
        question: The user's question as plain text.

    Returns:
        The decoded text of the newly generated tokens only, with special tokens
        removed and surrounding whitespace stripped.
    """
    prompt = f"""
<human>: {question}
<assistant>:
""".strip()
    encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )

    # The returned sequence starts with the prompt tokens, so drop them by length and
    # decode only the completion. This replaces searching the decoded text for
    # "<assistant>:", which mis-slices the reply when the marker is missing
    # (str.find returns -1) or when the question itself contains that marker.
    prompt_length = encoding.input_ids.shape[-1]
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [None]:
# Sample query 1: ask the fine-tuned model a question and print its reply.
prompt = "I want to change the OTP number for O365?"
print(generate_response(prompt))

In [None]:
# Sample query 2: ask the fine-tuned model a question and print its reply.
prompt = "What is the NEC 2030 vision?"
print(generate_response(prompt))

In [None]:
# Sample query 3: ask the fine-tuned model a question and print its reply.
prompt = "How to genrate electric journal in NEC MSBU?"

print(generate_response(prompt))

***Observations***
* The Falcon 7B model has been trained on data up to the year 2020, so any newer information has to be presented to the LLM by training it on that information.
* The first prompt shows that the Falcon 7B model, even when fine-tuned, is able to retain and present coherent information, even on data it hasn't been trained on.
* The second prompt shows the response on data the model has been trained on. Notice that the model has added to the response, going beyond the data it was trained on.
* The third prompt shows that the model is able to retrieve the current information from the data it has been trained on and to add more relevant information to it.
* The model has problems understanding acronyms like MSBU AD ID.
* The model is unable to hold contextual information even after fine-tuning.