# LLM for Recommendation System - RAG

## TABLE OF CONTENTS
### $~~~$ - 1. Load Data
### $~~~$ - 2. Construct Knowledge Base
### $~~~$ - 3. Chunk Documents
### $~~~$ - 4. Load Tokenizer and Model from HuggingFace
### $~~~$ - 5. Embeddings and Retriever
### $~~~$ - 6. Recommendation System
### $~~~$ - 7. Apply xAI

---
## 1. Load Data

In [None]:
import pandas as pd
import os

In [None]:
# Root directory that the dataset and vector-store paths below are joined onto
base_dir = "../"

In [None]:
# Location of the Amazon products training CSV, relative to base_dir
products_path = os.path.join(base_dir, 'trainData/amazon_products.train.csv')

In [None]:
# Load the products CSV into a DataFrame
products_df = pd.read_csv(products_path)

In [None]:
# Display basic information about the dataset (columns, dtypes, non-null counts)
# NOTE(review): the label says "VTN" while the file loaded above is the Amazon products CSV — confirm the label
print("[*] VTN Products Dataset:")
products_df.info()

In [None]:
# Preview the first rows of the raw data
products_df.head()

### (Optional) Drop rows with missing values

In [None]:
# Discard every row that has at least one missing value and renumber the index from 0
products_df = products_df.dropna().reset_index(drop=True)
products_df.info()

### Construct Text

In [None]:
def construct_text(row):
    """Render one product record as the multi-line text used for retrieval.

    Args:
        row: Mapping-like record (for example a DataFrame row) holding string
            values under 'TITLE', 'DESCRIPTION' and 'DETAILS', plus
            'PRODUCT_ID', 'MAIN_CATEGORY', 'AVERAGE_RATING' and 'PRICE'.

    Returns:
        A single string with one "Label: value" line per field.
    """
    # Backslashes inside f-string expressions are a SyntaxError before
    # Python 3.12, so all string clean-up happens before formatting.
    title = row['TITLE'].replace('\n', ' ')
    description = row['DESCRIPTION'].replace('\n', ' ')
    # Drop the outer braces and the single quotes, then turn the
    # comma-separated pairs into a pipe-separated list.
    details = ' | '.join(row['DETAILS'].strip('{}').replace("'", '').split(', '))
    return (
        f"Product ID: {row['PRODUCT_ID']}\n"
        f"Title: {title}\n"
        f"Description: {description}\n"
        f"Category: {row['MAIN_CATEGORY']}\n"
        f"Average rating: {row['AVERAGE_RATING']}\n"
        f"Price: {row['PRICE']}\n"
        f"Details: {details}"
    )

In [None]:
# Render every product row to text, then show two samples as a sanity check
rendered_rows = products_df.apply(construct_text, axis=1)
product_texts = rendered_rows.tolist()
preview = f"{product_texts[6]}\n\n{product_texts[7]}"
print(f"[*] Text format preview:\n{preview}")

In [None]:
# Output column -> source column of products_df; TEXT comes from the rendered strings
column_sources = {
    'PRODUCT_ID': 'PRODUCT_ID',
    'TITLE': 'TITLE',
    'DESCRIPTION': 'DESCRIPTION',
    'CATEGORY': 'MAIN_CATEGORY',
}
formatted_columns = {
    target: products_df[source].tolist()
    for target, source in column_sources.items()
}
formatted_columns['TEXT'] = product_texts
formatted_df = pd.DataFrame(formatted_columns)
formatted_df

---
## 2. Construct Knowledge Base

In [None]:
from langchain.docstore.document import Document as LangchainDocument
from tqdm import tqdm

In [None]:
# One document per product: the description is the searchable content, while the
# identifying fields and the full rendered text travel along as metadata.
RAW_KNOWLEDGE_BASE = []
for row_label in tqdm(formatted_df.index):
    product_metadata = {
        "id": formatted_df.loc[row_label, "PRODUCT_ID"],
        "title": formatted_df.loc[row_label, "TITLE"],
        "category": formatted_df.loc[row_label, "CATEGORY"],
        "text": formatted_df.loc[row_label, "TEXT"],
    }
    RAW_KNOWLEDGE_BASE.append(
        LangchainDocument(
            page_content=formatted_df.loc[row_label, "DESCRIPTION"],
            metadata=product_metadata,
        )
    )

In [None]:
# Inspect one raw document (content plus metadata)
RAW_KNOWLEDGE_BASE[6]

---
## 3. Chunk Documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Separators handed to the text splitter below, listed from coarse (line break,
# sentence end) to fine (single space, empty string).
customer_SEP = [
    "\n",
    ". ",
    ".",
    " ",
    "",
]

In [None]:
# Splitter that cuts each description into overlapping chunks.
# NOTE(review): no length_function is passed, so chunk_size/chunk_overlap are
# presumably measured in characters rather than tokens — confirm against the
# splitter's documentation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=100,
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=customer_SEP,
)

In [None]:
# Split every raw document and flatten the resulting chunks into one list
docs_processed = [
    chunk
    for raw_doc in RAW_KNOWLEDGE_BASE
    for chunk in text_splitter.split_documents([raw_doc])
]

In [None]:
# Inspect one chunk produced by the splitter
docs_processed[6]

---
## 4. Load Tokenizer and Model from HuggingFace

In [None]:
# Check Python version
!python -V
# Check CUDA version
!nvcc --version

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from getpass import getpass
import torch

In [None]:
# Pick the best available device: CUDA GPU first, then Apple-silicon MPS, then CPU.
# The MPS check has to be *called*: a bare function object is always truthy, which
# made every machine without CUDA select "mps" even when MPS is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
#device = 'cpu' # Set to cpu when debugging
print(f"Using device: {device}")

In [None]:
# Turn off the tokenizers library's own parallelism
# (presumably to avoid its fork-related warning once worker processes are used — confirm)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Prompt for the Hugging Face access token and also expose it through the environment
access_token = getpass()
os.environ['HUGGINGFACEHUB_API_TOKEN'] = access_token

In [None]:
# Hugging Face id of the LLM used for generation (alternative kept below)
model_id = "meta-llama/Llama-3.2-1B-Instruct"
# model_id = "Qwen/Qwen2.5-1.5B-Instruct"

In [None]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): decoder-only models usually want left padding for batched
# generation; this notebook only sends single prompts — confirm before batching.
tokenizer.padding_side = "right"

print("[*] Tokenizer loaded.")

In [None]:
# Load Model
# Optional 4-bit quantization config, currently disabled (see the commented-out
# `quantization_config` argument below).
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# Load the causal LM and move it to the selected device
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    token=access_token,
    # quantization_config=bnb_config,
).to(device)
print("[*] Model loaded.")

---
## 5. Embeddings and Retriever

### Check length

In [None]:
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# The chunks are embedded by the sentence-transformers model, so its input limit
# (not the LLM's) decides how much of each chunk is actually embedded. The original
# cell loaded the LLM id as a SentenceTransformer, which answers a different question.
embedding_model_id = "sentence-transformers/all-MiniLM-L6-v2"
print(f"[*] Embedding model's maximum sequence length: {SentenceTransformer(embedding_model_id).max_seq_length}")

In [None]:
# Token count of every chunk, measured with the tokenizer loaded for the LLM
token_lengths = [
    len(tokenizer.encode(chunk.page_content)) for chunk in tqdm(docs_processed)
]
print(f'[*] Max Token Length: {np.max(token_lengths)}')
# Share of chunks that fit within each token budget
chunk_count = len(token_lengths)
for limit in (512, 1024):
    within_limit = sum(1 for length in token_lengths if length <= limit)
    print(f'[*] Token Length <= {limit}: {round((within_limit/chunk_count)*100, 2)}%')

In [None]:
# Histogram of chunk lengths (in tokens) across the knowledge base
plt.hist(token_lengths, bins=20)
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.xlabel("Number of Tokens")
plt.ylabel("Frequency")
plt.show()

### Building the vector database

In [None]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

In [None]:
# Hugging Face id of the sentence-embedding model used to build the vector index
embedding_model_id = "sentence-transformers/all-MiniLM-L6-v2"

In [None]:
# Embedding wrapper used both to index the chunks and to embed queries.
# NOTE(review): multi_process=True is combined with a single explicit device —
# confirm that multi-process encoding is actually wanted here.
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
    multi_process=True,
    model_kwargs={"device": device},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

In [None]:
# Embed every chunk and build the FAISS index, configured for cosine distance
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

### Save Vector Database

In [None]:
# Persist the index under <base_dir>/Vector_DB so it can be reloaded without re-embedding
vector_db_dir = os.path.join(base_dir, 'Vector_DB')
KNOWLEDGE_VECTOR_DATABASE.save_local(vector_db_dir)

### Load Vector Database

In [None]:
# Reload the persisted index. `embeddings` must be the embedding model object (it is
# used to embed queries at search time); the original passed the directory path here.
vector_db_dir = os.path.join(base_dir, 'Vector_DB')
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
    vector_db_dir,
    embeddings=embedding_model,
    # SECURITY: loading unpickles the stored docstore — only load index
    # directories that this notebook (or another trusted source) produced.
    allow_dangerous_deserialization=True,
    # Keep the same distance strategy the index was built with.
    distance_strategy=DistanceStrategy.COSINE,
)

### Test Query

In [None]:
from time import time 
import random

In [None]:
def retrieve_product_information(df, query_value):
    """Look up one product by ID, print its full text and return its fields.

    Args:
        df: DataFrame with 'PRODUCT_ID', 'DESCRIPTION' and 'TEXT' columns.
        query_value: Product ID to look up.

    Returns:
        A `(description, full_text)` tuple taken from the first matching row.

    Raises:
        IndexError: If no row of `df` has the requested PRODUCT_ID.
    """
    matching_labels = df.index[df['PRODUCT_ID'] == query_value]
    if len(matching_labels) == 0:
        raise IndexError(f"No product with PRODUCT_ID == {query_value!r}")
    product_index = matching_labels[0]
    # Read from the DataFrame that was searched; the original read from the
    # notebook-level `formatted_df` regardless of which `df` was passed in.
    full_text = df.loc[product_index, 'TEXT']
    print(f'[*] Retrieved product full content:\n{full_text}')

    return df.loc[product_index, 'DESCRIPTION'], full_text

In [None]:
# Pick a random product as the test query (seeded from the current time, so runs differ)
random.seed(time())
random_product_id = random.choice(formatted_df['PRODUCT_ID'])
test_description, full_text = retrieve_product_information(formatted_df, random_product_id)

In [None]:
print(f"[*] Starting retrieval for description:\n{test_description=}\n")
# Descriptions are indexed as chunks, so the top hit is not guaranteed to be the
# query product, and several chunks of the same product can be returned. Over-fetch,
# then keep the first 5 distinct products other than the query product.
candidate_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=test_description, k=20)
retrieved_docs = []
seen_product_ids = {random_product_id}
for candidate in candidate_docs:
    candidate_id = candidate.metadata['id']
    if candidate_id in seen_product_ids:
        continue
    seen_product_ids.add(candidate_id)
    retrieved_docs.append(candidate)
    if len(retrieved_docs) == 5:
        break
print("==================================Top document==================================")
print(retrieved_docs[0].page_content)
print("====================================Full Content====================================")
print(retrieved_docs[0].metadata['text'])

---
## 6. Recommendation System

In [None]:
from transformers import pipeline

In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
# Sampling is enabled with a low temperature, and only the newly generated text
# is returned (return_full_text=False).
# NOTE(review): the model was already moved to `device` when it was loaded —
# confirm passing `device` again here is needed.
Rec_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=1000,
    device=device
)

In [None]:
# Smoke test: ask a trivial question and print the prompt followed by the completion
Q = "What is 4+4? Answer:"
A = Rec_LLM(Q)
# Use double quotes for the key: reusing the f-string's own quote character
# inside the expression is a SyntaxError before Python 3.12.
print(f'[*] {Q}{A[0]["generated_text"]}')

### Prompt Template

In [None]:
# Chat-style prompt: a system message with the answering rules, and a user message
# with `{context}` and `{question}` placeholders that are filled in later via
# `RAG_PROMPT_TEMPLATE.format(...)`.
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Response should include product id, title, and reason for recommendation.
Information of recommended products must be correct, do not falsify information.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
# Render the messages with the tokenizer's chat template, as text (not token ids),
# ending with the generation prompt.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
print(RAG_PROMPT_TEMPLATE)

### Recommendation Test

In [None]:
# Pick a random product to recommend for (seeded from the current time, so runs differ)
random.seed(time())
random_product_id = random.choice(formatted_df['PRODUCT_ID'])
test_description, full_text = retrieve_product_information(formatted_df, random_product_id)

In [None]:
# Descriptions are indexed as chunks, so the top hit is not guaranteed to be the
# query product, and several chunks of the same product can be returned. Over-fetch,
# then keep the first 10 distinct products other than the query product.
candidate_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=test_description, k=30)
retrieved_docs = []
seen_product_ids = {random_product_id}
for candidate in candidate_docs:
    candidate_id = candidate.metadata['id']
    if candidate_id in seen_product_ids:
        continue
    seen_product_ids.add(candidate_id)
    retrieved_docs.append(candidate)
    if len(retrieved_docs) == 10:
        break

In [None]:
# Only the full product text kept in each hit's metadata is needed from here on
retrieved_docs_text = [hit.metadata['text'] for hit in retrieved_docs]

In [None]:
# Number each retrieved product and concatenate them into the prompt context
product_sections = [
    "\n\nProduct " + str(position) + ":::\n" + product_text
    for position, product_text in enumerate(retrieved_docs_text)
]
context = "\nExtracted products:" + "".join(product_sections)

In [None]:
# The question refers to "this product", so the query product's text has to be in
# the prompt; the original sent only the retrieved candidates, leaving the model
# with nothing to base the recommendation on.
question = (
    "Based on this product:\n"
    f"{full_text}\n"
    "recommend the 5 best products from Context."
)
final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

In [None]:
# Generate the recommendation text for the final prompt
recommedations = Rec_LLM(final_prompt)[0]["generated_text"]
print(recommedations)

In [None]:
# Show the exact prompt that was sent to the model
print(final_prompt)

---
## 7. Apply xAI

In [None]:
from captum.attr import (
    FeatureAblation, 
    ShapleyValueSampling,
    LayerIntegratedGradients, 
    LLMAttribution, 
    LLMGradientAttribution, 
    TextTokenInput, 
    TextTemplateInput,
    ProductBaselines,
)

In [None]:
# Split the generated answer on blank lines and drop the first piece
# (presumably an introductory sentence — confirm against actual model output).
# NOTE(review): if the answer contains no blank line this list is empty.
targets = recommedations.split('\n\n')[1:]
# targets

In [None]:
# The retrieved product texts serve as the prompts whose tokens get attributed
eval_prompts = retrieved_docs_text
# eval_prompts

### Perturbation-based Attribution

In [None]:
def PBA_eval(tokenizer, eval_prompt, target):
    """Attribute `target` to the tokens of `eval_prompt` using feature ablation.

    Uses the notebook-level `model`. Prints the shapes of the sequence-level and
    token-level attributions and plots the token attribution.

    Args:
        tokenizer: Tokenizer matching the notebook-level `model`.
        eval_prompt: Prompt text whose tokens are the features being ablated.
        target: Output text to explain.

    Returns:
        The attribution result returned by `LLMAttribution.attribute`.
    """
    # Skip the tokenizer's own beginning-of-text token. The previously hard-coded
    # id 1 only matches tokenizers whose `<s>` token has id 1 (e.g. Llama 2); the
    # tokenizer loaded in this notebook uses a different BOS id, and some
    # tokenizers define none at all.
    bos_token_id = getattr(tokenizer, "bos_token_id", None)
    skip_tokens = None if bos_token_id is None else [bos_token_id]
    inp = TextTokenInput(
        eval_prompt, 
        tokenizer,
        skip_tokens=skip_tokens,
    )

    fa = FeatureAblation(model)
    llm_attr = LLMAttribution(fa, tokenizer)

    print('[*] Calculating attribution...')
    attr_res = llm_attr.attribute(
        inp, 
        target=target, 
        skip_tokens=skip_tokens,
    )

    print("[*] Attribution to the output sequence:", attr_res.seq_attr.shape)  # shape(n_input_token)
    print("[*] Attribution to the output tokens:", attr_res.token_attr.shape)  # shape(n_output_token, n_input_token)

    attr_res.plot_token_attr(show=True)

    return attr_res

In [None]:
# Feature-ablation attribution for the first retrieved product and the first target
attr_res = PBA_eval(tokenizer, eval_prompts[0], targets[0])

### Shapley Value Sampling

In [None]:
def SV_PBA_eval(tokenizer, eval_prompt, target):
    """Attribute `target` to the tokens of `eval_prompt` using Shapley value sampling.

    Uses the notebook-level `model`. Prints the shapes of the sequence-level and
    token-level attributions and plots the token attribution.

    Args:
        tokenizer: Tokenizer matching the notebook-level `model`.
        eval_prompt: Prompt text whose tokens are the features being perturbed.
        target: Output text to explain.

    Returns:
        The attribution result returned by `LLMAttribution.attribute`.
    """
    # Skip the tokenizer's own beginning-of-text token. The previously hard-coded
    # id 1 only matches tokenizers whose `<s>` token has id 1 (e.g. Llama 2); the
    # tokenizer loaded in this notebook uses a different BOS id, and some
    # tokenizers define none at all.
    bos_token_id = getattr(tokenizer, "bos_token_id", None)
    skip_tokens = None if bos_token_id is None else [bos_token_id]
    inp = TextTokenInput(
        eval_prompt, 
        tokenizer,
        skip_tokens=skip_tokens,
    )

    sv = ShapleyValueSampling(model)
    sv_llm_attr = LLMAttribution(sv, tokenizer)

    print('[*] Calculating attribution...')
    attr_res = sv_llm_attr.attribute(
        inp, 
        target=target, 
        # Also skip the BOS token on the target side, matching PBA_eval
        skip_tokens=skip_tokens,
        num_trials=3
    )

    print("[*] Attribution to the output sequence:", attr_res.seq_attr.shape)  # shape(n_input_token)
    print("[*] Attribution to the output tokens:", attr_res.token_attr.shape)  # shape(n_output_token, n_input_token)

    attr_res.plot_token_attr(show=True)

    return attr_res

In [None]:
# Shapley-value-sampling attribution for the first retrieved product and the first target
sv_attr_res = SV_PBA_eval(tokenizer, eval_prompts[0], targets[0])