# LLM for Recommendation System - RAG

## TABLE OF CONTENTS
### $~~~$ - 1. Recommendation System
### $~~~$ - 2. Result Interpretation

---
## 1. Recommendation System

In [None]:
# Check Python version
!python -V
# Check CUDA version
!nvcc --version

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline
from getpass import getpass
from time import time
import pandas as pd
import numpy as np
import random
import torch
import os
import re

In [None]:
# Check for GPU Availability: use CUDA when torch reports it, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = 'cpu' # Set to cpu when debugging
print(f"Using device: {device}")

# Turn off parallelism in the tokenizers library
# (presumably to avoid its fork/parallelism warnings in the notebook — confirm)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Prompt for the Hugging Face access token interactively so it is not stored in the notebook
access_token = getpass("Enter access token: ")
# Also expose the token through the environment for libraries that read it from there
os.environ['HUGGINGFACEHUB_API_TOKEN'] = access_token

# Root directory (relative to this notebook) under which the training CSV and the vector DB are looked up
base_dir = "../.."

In [None]:
# Hugging Face model used to generate the recommendations
model_id = "meta-llama/Llama-3.2-1B-Instruct"
# model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
# Reuse the end-of-sequence token as the padding token and pad on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print("[*] Tokenizer loaded.")

# Load Model and move its weights to the selected device
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    token=access_token,
).to(device)
print("[*] Model loaded.")

In [None]:
# Sentence-embedding model used to embed queries against the vector store.
# NOTE(review): it must be the same model the stored index was built with — confirm.
embedding_model_id = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
    multi_process=True,
    model_kwargs={"device": device},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# Load the prebuilt FAISS vector store from <base_dir>/Vector_DB.
# NOTE(security): `allow_dangerous_deserialization=True` opts in to unpickling
# the saved store; only load an index that comes from a trusted source.
vector_db_dir = os.path.join(base_dir, 'Vector_DB')
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
    vector_db_dir,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

In [None]:
# Formatted training products table; later cells read its PRODUCT_ID, TEXT and DESCRIPTION columns
formatted_df = pd.read_csv(os.path.join(base_dir, 'trainData/amazon_products.train.formatted.csv'))

def retrieve_product_information(df, query_value):
    """Look up a product by ID and return its description, full text and ID.

    The first row whose ``PRODUCT_ID`` equals ``query_value`` is used. The
    product's full text is also printed as a side effect.

    Args:
        df: DataFrame with ``PRODUCT_ID``, ``TEXT`` and ``DESCRIPTION`` columns.
        query_value: Product ID to look up.

    Returns:
        A ``(description, full_text, product_id)`` tuple taken from the
        matching row.

    Raises:
        IndexError: If no row has ``PRODUCT_ID == query_value``.
    """
    matching_labels = df.index[df['PRODUCT_ID'] == query_value]
    if len(matching_labels) == 0:
        # Same exception type as the previous `[...][0]` lookup, but with a
        # message that says which product is missing.
        raise IndexError(f"PRODUCT_ID {query_value!r} not found in the product table")

    # Only the first match is needed; index labels are used with `.loc` below.
    product_index = matching_labels[0]
    full_text = df.loc[product_index, 'TEXT']
    product_id = df.loc[product_index, 'PRODUCT_ID']
    print(f'[*] Retrieved product full content:\n{full_text}')

    return df.loc[product_index, 'DESCRIPTION'], full_text, product_id

In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
# Sampling is enabled with a low temperature and a mild repetition penalty;
# only the newly generated text is returned (the prompt is not echoed back),
# capped at 1000 new tokens.
Rec_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=1000,
    device=device
)

In [None]:
# Chat-format messages: a system message with the answering rules, the output
# template and three worked examples, followed by a user message that carries
# the `{context}` and `{question}` placeholders.
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in context, give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Information of recommended products must be correct and matched in context, do not falsify information.
If the answer cannot be deduced from the context, do not give an answer.

Response must include product id, title, and reason for recommendation.
Response must strictly follow the template below:
i. **Product ID: <Product ID>** - <Title>
Reason: <Reason>

Answer examples:
1. **Product ID: B0C3WNM5X7** - Simple Joys by Carter's Toddler Boys' Hooded Sweater Jacket with Sherpa Lining
Reason: This product is highly rated with an average rating of 4.8, offering excellent value for its price.

2. **Product ID: B0C1X12894** - Oversized Wearable Blanket Hoodie for Women Men Comfy Sweatshirt
Reason: This product is highly rated with an average rating of 4.8, making it a great option for those looking for a cozy and warm garment.

3. **Product ID: B0C68CBFKS** - Columbia Women's West Bend Hoodie
Reason: This product is highly rated with an average rating of 4.4, offering a reliable and affordable option for casual wear.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}

---

Now here is the question you need to answer.

Question: {question}""",
    },
]

# Render the messages into a single prompt string (not token ids) with the
# generation prompt appended. The result is filled in later with `str.format`,
# so any other literal braces in the rendered template would break that call.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
print(RAG_PROMPT_TEMPLATE)

In [None]:
# Pick a random target product; seeding from the clock gives a different product on each run.
random.seed(time())
# Sample from a plain list so the choice is positional and does not depend on the Series index labels.
random_product_id = random.choice(formatted_df['PRODUCT_ID'].tolist())
test_description, full_text, target_id = retrieve_product_information(formatted_df, random_product_id)

# Ask for one hit more than needed, then drop the target product by ID.
# The target is usually the top hit, but blindly skipping position 0 would
# discard the best candidate (and keep the target) whenever it is not.
num_candidates = 10
nearest_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=test_description, k=num_candidates + 1)
retrieved_docs = [doc for doc in nearest_docs if doc.metadata['id'] != target_id][:num_candidates]
retrieved_docs_text = [
    doc.metadata['text'] for doc in retrieved_docs
]  # We only need the text of the documents

# Concatenate the candidates into one numbered context section (numbering starts at 0).
context = "\nExtracted products:"
context += "".join(
    [f"\n\nProduct {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)]
)

# NOTE(review): the question refers to "this product", but the target product's
# own text (`full_text`) is not part of the prompt — confirm this is intended.
final_prompt = RAG_PROMPT_TEMPLATE.format(
    question="Base on this product, recommend 5 best products from Context.", context=context
)

In [None]:
# Generate an answer: the pipeline returns a list of results; keep the text of the first one
recommedations = Rec_LLM(final_prompt)[0]["generated_text"]
print(recommedations)

---
## 2. Result Interpretation

### PCA Projection

In [None]:
# Access the FAISS index
faiss_index = KNOWLEDGE_VECTOR_DATABASE.index

# Access metadata as {docstore id: Document}, ordered by FAISS row position.
# Later cells pair the i-th value of this dict with the i-th vector, so the
# order is taken from the store's row -> docstore-id mapping rather than from
# the insertion order of the docstore's private `_dict`.
_row_to_doc_id = KNOWLEDGE_VECTOR_DATABASE.index_to_docstore_id
metadata = {
    _row_to_doc_id[row]: KNOWLEDGE_VECTOR_DATABASE.docstore.search(_row_to_doc_id[row])
    for row in range(faiss_index.ntotal)
}

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

In [None]:
# Reconstruct every stored vector (rows 0 .. ntotal-1) from the FAISS index
vectors = faiss_index.reconstruct_n(0, faiss_index.ntotal)

In [None]:
# Reduce dimensions to 3D: fit PCA on all stored vectors and project them onto
# the first three principal components (row i still corresponds to vector i)
pca = PCA(n_components=3)
reduced_vectors = pca.fit_transform(vectors)

In [None]:
# Draw all reduced vectors as one 3D point cloud
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111, projection='3d')

# Columns 0, 1, 2 of the reduced vectors are the x, y, z coordinates
ax.scatter(*reduced_vectors[:, :3].T, alpha=0.3)

ax.set_title("3D Visualization of Products Vector Database", fontsize=40)
for axis_name in ('x', 'y', 'z'):
    ax.tick_params(axis=axis_name, labelsize=15)

plt.tight_layout()
plt.show()

#### show with categories

In [None]:
# Group the reduced vectors by product category.
# tqdm wraps the iterable itself (not `enumerate`) so the bar knows the total.
vector_categories = {}
for i, v in enumerate(tqdm(metadata.values())):
    vector_categories.setdefault(v.metadata['category'], []).append(reduced_vectors[i])

In [None]:
# Order the categories from the most to the fewest vectors
sorted_vector_categories = {
    category: points
    for category, points in sorted(
        vector_categories.items(), key=lambda pair: len(pair[1]), reverse=True
    )
}

In [None]:
# Report how many vectors each category holds
for category_name, category_vectors in sorted_vector_categories.items():
    print(category_name, len(category_vectors))

In [None]:
# Draw the reduced vectors in 3D, one scatter series (and legend entry) per category
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111, projection='3d')

for subset_name, subset_vectors in sorted_vector_categories.items():
    points = np.array(subset_vectors)
    ax.scatter(
        *points[:, :3].T,
        label=subset_name.lower().title(),  # Title-cased category name for the legend
        alpha=0.3,
    )

ax.set_title("3D Visualization of Products Vector Database", fontsize=40)
for axis_name in ('x', 'y', 'z'):
    ax.tick_params(axis=axis_name, labelsize=15)
ax.legend(fontsize=15)

plt.tight_layout()
plt.show()

#### mark selected products

In [None]:
# Extract the recommended product IDs from the generated answer.
# The pattern is anchored on the "**Product ID: <id>**" marker required by the
# prompt template, so other bold text (or a second bold span on the same line)
# can no longer be mistaken for an ID.
pattern = re.compile(r"\*\*\s*Product ID:\s*(\S+?)\s*\*\*", re.IGNORECASE)
ids = pattern.findall(recommedations)
print(ids)

In [None]:
# Collect the reduced vectors of the target product (first occurrence only)
# and of up to five recommended products, keyed by their legend label.
recommedation_vectors = {}
target_retrieved, recommedations_retrieved = 0, 0
for row, doc in enumerate(metadata.values()):
    doc_id = doc.metadata['id']

    if doc_id == target_id and target_retrieved == 0:
        recommedation_vectors.setdefault('Target Product', []).append(reduced_vectors[row])
        target_retrieved += 1
        continue

    if doc_id in ids and recommedations_retrieved < 5:
        recommedation_vectors.setdefault('Recommended Product', []).append(reduced_vectors[row])
        recommedations_retrieved += 1

In [None]:
# Show how many vectors were collected for each group
for group_name, group_vectors in recommedation_vectors.items():
    print(f'{group_name}:{len(group_vectors)}')

In [None]:
# Plot in 3D: whole vector database (left) next to a thinned-out background
# with the target and recommended products highlighted (right).

def _scatter_categories(ax, categories, thin_background=False):
    """Scatter each category's points on ``ax`` with one legend entry per category.

    When ``thin_background`` is true only a leading slice of every category is
    drawn (the first 500 points for categories with at least 1000 points,
    otherwise the first half), so highlighted markers are not buried.
    """
    for category_name, category_vectors in categories.items():
        points = np.array(category_vectors)
        if thin_background:
            total = len(points)
            keep = 500 if total >= 1000 else int(round(total / 2, 0))
            points = points[:keep]
        ax.scatter(
            points[:, 0],
            points[:, 1],
            points[:, 2],
            label=category_name.lower().title(),  # Title-cased name for the legend
            alpha=0.3,
        )


def _decorate_axes(ax, title):
    """Apply the shared title, tick-label size and legend styling to ``ax``."""
    ax.set_title(title, fontsize=40)
    for axis_name in ('x', 'y', 'z'):
        ax.tick_params(axis=axis_name, labelsize=15)
    ax.legend(fontsize=15)


fig = plt.figure(figsize=(40, 20))

# Left panel: the whole vector database, coloured by category
ax1 = fig.add_subplot(121, projection='3d')
_scatter_categories(ax1, sorted_vector_categories)
_decorate_axes(ax1, "3D Visualization of Products Vector Database (Whole)")

# Right panel: reduced background plus the target and recommended products
ax2 = fig.add_subplot(122, projection='3d')
_scatter_categories(ax2, sorted_vector_categories, thin_background=True)

# Target product is drawn as a circle, recommended products as crosses
for subset_name, subset_vectors in recommedation_vectors.items():
    highlighted = np.array(subset_vectors)
    ax2.scatter(
        highlighted[:, 0],
        highlighted[:, 1],
        highlighted[:, 2],
        label=subset_name,  # Add label for the legend
        alpha=1,
        marker='o' if subset_name == 'Target Product' else 'x',
        color='black',
        linewidths=3,
        s=300,
    )

_decorate_axes(ax2, "3D Visualization of Target Product & Recommended Products")

plt.tight_layout()
# Uncomment to save the figure: plt.savefig('3DVector.png', dpi=300)
plt.show()

### Text

In [None]:
# Extract the recommended product IDs from the generated answer.
# The pattern is anchored on the "**Product ID: <id>**" marker required by the
# prompt template, so other bold text (or a second bold span on the same line)
# can no longer be mistaken for an ID.
pattern = re.compile(r"\*\*\s*Product ID:\s*(\S+?)\s*\*\*", re.IGNORECASE)
ids = pattern.findall(recommedations)
print(ids)

In [None]:
# Map each recommended product ID to the text stored with it in the vector DB
recommendation_infos = {}
for doc in tqdm(metadata.values()):
    doc_meta = doc.metadata
    if doc_meta['id'] not in ids:
        continue
    recommendation_infos[doc_meta['id']] = doc_meta['text']

In [None]:
# Print the stored text of every recommended product, separated by blank lines
for product_text in recommendation_infos.values():
    print(product_text, end='\n\n')

### Captum

In [None]:
# from captum.attr import (
#     FeatureAblation, 
#     ShapleyValues,
#     LayerIntegratedGradients, 
#     LLMAttribution, 
#     LLMGradientAttribution, 
#     TextTokenInput, 
#     TextTemplateInput,
#     ProductBaselines,
# )

In [None]:
# lig = LayerIntegratedGradients(model, model.model.embed_tokens)
# llm_attr = LLMGradientAttribution(lig, tokenizer)

In [None]:
# inp = TextTokenInput(
#     final_prompt,
#     tokenizer,
#     skip_tokens=[1],
# )
# attr_res = llm_attr.attribute(
#     inp, 
#     target=recommedations
# )

In [None]:
# attr_res.plot_seq_attr(show=True)

In [None]:
# attr_res.plot_token_attr(show=True)

In [None]:
# Show the retrieved candidate documents as the cell output for inspection
retrieved_docs