# Team 2 - NUS ISS Assignment - RAG

Context: We want to build an LLM-powered bookstore assistant that uses RAG to recommend books to users according to their interests.

In [15]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, GenerationConfig

from smolagents import tool, Tool

In [2]:
# Set up the LLM used to generate answers.
# We are using the google/flan-t5-small checkpoint.
model_name = "google/flan-t5-small"

# Load the seq2seq model and its matching tokenizer from the same checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Load content from the EPUB file.
# Alternative books referenced by earlier runs (assumed to live under data/;
# point epub_path at one of them to switch books):
#   data/charles-dickens_a-christmas-carol.epub
#   data/the_gift_of_the_magi.epub
#   data/the_happy_prince.epub
epub_path = 'data/the_nightingale_and_the_rose.epub'
epub_loader = UnstructuredEPubLoader(file_path=epub_path)
doc = epub_loader.load()

# Dump the loaded document(s) for a quick visual check
print(doc)

In [None]:
# Define the chunk size and overlap for text splitting.
# These determine how the text is divided into smaller segments for processing;
# adjust them based on your specific requirements.
chunks_size = 1024
chunks_overlap = 50

# Build the splitter that cuts the text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunks_size,
    chunk_overlap=chunks_overlap,
)

# Split the document into chunks.
# This creates the smaller text segments that are later embedded and retrieved.
chunks = text_splitter.split_documents(doc)
print(f"Number of chunks: {len(chunks)}")

# Guard the preview: indexing chunks[0] on an empty result would raise IndexError
if chunks:
    print(f"First chunk: {chunks[0].page_content[:500]}...")  # Display the first 500 characters of the first chunk
else:
    print("No chunks were produced - check that the EPUB file loaded correctly.")

In [None]:
# Create the embedding function used to vectorise the text chunks
embed_model_name = 'BAAI/bge-small-en-v1.5'
embed_model_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

In [6]:
# Prepare the chunks for inserting into Chroma: keep only the raw text of each chunk
texts = [chunk.page_content for chunk in chunks]

# Generate a primary key for each text chunk.
# Keep the full UUID: truncating it to 8 hex characters leaves only 32 bits of
# randomness, which makes duplicate IDs needlessly more likely.
texts_ids = [str(uuid4()) for _ in texts]

In [None]:
# Create an ephemeral Chroma client and reset the target collection
collection_name = 'epub'

# Create a Chroma client
chroma_client = chromadb.Client()

# Create the embedding function.
# This must use the sentence-embedding checkpoint (embed_model_name), not
# model_name: model_name is the flan-t5 generation checkpoint, and passing it here
# would silently replace the intended BGE embedder.
embed_model_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embed_model_name
)

try:
    # Clean up any collection left over from a previous run
    chroma_client.delete_collection(name=collection_name)
except Exception as exc:
    # Best-effort cleanup: the collection usually just does not exist yet.
    # Report the reason instead of hiding it so real failures stay visible.
    print(f"Skipping cleanup of collection '{collection_name}': {exc}")

In [None]:
# Get the collection with the specified name and embedding function, creating it
# if it does not exist yet. Using get_or_create keeps this cell re-runnable:
# create_collection fails when the collection is already present.
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=embed_model_func,
)

# Only load the chunks when the collection is still empty, so re-running the
# cell does not insert the same text a second time.
if collection.count() == 0:
    print("Inserting chunks document into Chroma collection...")
    collection.add(
        documents=texts,
        ids=texts_ids,
    )

print(f"Number of documents in collection '{collection_name}': {collection.count()}")

In [10]:
# Create the GenerationConfig for the model.
# NOTE(review): top_k=1 keeps only the single most likely token at each step, so
# do_sample/temperature presumably have no visible effect - confirm the intent.
generation_settings = {
    "do_sample": True,
    "temperature": 0.7,
    "top_k": 1,
}
config = GenerationConfig(**generation_settings)

In [17]:
# Helper that passes a query to the model and returns its response
def query_model(query, config):
    """Generate a text response for ``query`` with the module-level model.

    Args:
        query: Prompt text; it is tokenized with the module-level ``tokenizer``.
        config: Generation configuration forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded to text with special tokens removed.
    """
    # The tokenizer output is unpacked into generate() as keyword arguments
    encoded = tokenizer(query, return_tensors="pt")
    generated = model.generate(generation_config=config, **encoded)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Create a tool for the agent to use
@tool
def agent_query_tool(prompt: str) -> str:
    """
    Answer a question about the loaded book using retrieval-augmented generation.
    The most relevant text chunks are retrieved from the Chroma collection and
    given to the language model as context for answering the question.

    Args:
        prompt: The user's question about the content of the book.

    Returns:
        str: The answer generated by the language model.

    Example:
        result = agent_query_tool("Who is the author?")
    """

    # Number of top results to return
    top_k = 5

    # Query the collection for relevant documents. The matching texts are
    # requested as part of the result, so no per-ID follow-up lookup is needed.
    collection_query = collection.query(
        query_texts=[prompt],
        n_results=top_k,
        include=["documents"],
    )

    # Build the context from the top results: one retrieved chunk per line.
    # Index 0 selects the results for the single query text passed above.
    retrieved_texts = collection_query['documents'][0]
    context = "".join(f"{text}\n" for text in retrieved_texts)

    # Enrich the prompt with the context
    enriched_prompt = (
        "Answer based on context:\n\n"
        f"{context}\n"
        f"Top {top_k} results from the ChromaDb database based on the question\n"
        f"{prompt}"
    )

    # Pass the enriched prompt to the model and return its answer
    return query_model(enriched_prompt, config)

In [None]:
# Test the tool.
# The question has to match the book loaded into the collection
# (data/the_nightingale_and_the_rose.epub); a question about Scrooge only makes
# sense when A Christmas Carol is the loaded book.
prompt = "Why does the Student need a red rose?"
result = agent_query_tool(prompt)
print(f"Model response: {result}")