# Team 2 - NUS ISS Assignment - RAG & Agentic AI

## Context: 
The goal of the project is to combine RAG (Retrieval-Augmented Generation) techniques with Agentic AI capabilities to create a more interactive and personalized book recommendation experience.

## RAG

In [None]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from datasets import load_dataset

import pprint

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, GenerationConfig

from smolagents import tool, Tool, CodeAgent, OpenAIServerModel
from dotenv import load_dotenv

In [None]:
# Smoke-test the dataset download: pull the "train" split and pretty-print
# the first record so its fields (text and metadata) can be inspected.
dataset = load_dataset("IsmaelMousa/books", split="train")
pp = pprint.PrettyPrinter(indent=2)
first_record = dataset[0]
pp.pprint(first_record)

In [None]:
# Embedding function used by the Chroma collection: the SentenceTransformer
# wrapper shipped with chromadb, configured with the model named below.
embed_model_name = "all-MiniLM-L6-v2"
embed_model = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embed_model_name,
)

In [None]:
# Split every book's text into overlapping chunks and collect, in three
# parallel lists, the chunk text, the owning book's metadata and a unique id.

chunks_size = 1024
chunks_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunks_size, chunk_overlap=chunks_overlap)

all_chunk_texts, all_chunk_metadatas, all_chunk_ids = [], [], []

for book_idx, book in enumerate(dataset):
    # Only the book body ('EN') is chunked; metadata is attached per chunk.
    pieces = text_splitter.split_text(book['EN'])
    all_chunk_texts.extend(pieces)
    all_chunk_metadatas.extend(
        {
            "title": book["title"],
            "author": book["author"],
            "category": book["category"],
        }
        for _ in pieces
    )
    # Id format is "<row index>_<chunk index within the row>".
    all_chunk_ids.extend(f"{book_idx}_{piece_idx}" for piece_idx in range(len(pieces)))

# Sanity check: total number of chunks produced.
print(len(all_chunk_texts))


In [None]:
# Build a fresh 'books' collection and stream the chunked dataset into it.
collection_name = 'books'
client = chromadb.Client()

# Best effort: drop a collection left over from a previous run of this cell.
# Failure (e.g. the collection does not exist yet) is deliberately ignored.
try:
    client.delete_collection(name=collection_name)
except Exception:
    pass

collection = client.create_collection(name=collection_name, embedding_function=embed_model)

batch_size = 128  # Number of rows to process at once
chunk_batch_size = 128  # Number of chunks to add at once

total_rows = len(dataset)
for batch_start in range(0, total_rows, batch_size):
    batch_end = min(batch_start + batch_size, total_rows)
    batch = dataset.select(range(batch_start, batch_end))

    # Pending chunks not yet written to the collection.
    all_chunk_texts, all_chunk_metadatas, all_chunk_ids = [], [], []

    # Row indices continue from batch_start so chunk ids stay globally unique.
    for row_idx, row in enumerate(batch, start=batch_start):
        for chunk_idx, chunk in enumerate(text_splitter.split_text(row['EN'])):
            all_chunk_texts.append(chunk)
            all_chunk_metadatas.append({
                "title": row["title"],
                "author": row["author"],
                "category": row["category"],
            })
            all_chunk_ids.append(f"{row_idx}_{chunk_idx}")

            # Keep accumulating until a full sub-batch is pending.
            if len(all_chunk_texts) < chunk_batch_size:
                continue

            collection.add(
                documents=all_chunk_texts,
                ids=all_chunk_ids,
                metadatas=all_chunk_metadatas,
            )
            all_chunk_texts, all_chunk_metadatas, all_chunk_ids = [], [], []

    # Flush whatever is left over for this batch of rows.
    if all_chunk_texts:
        collection.add(
            documents=all_chunk_texts,
            ids=all_chunk_ids,
            metadatas=all_chunk_metadatas,
        )
    print(f"Processed batch {batch_start} to {batch_end}")

## Tools Setup

In [None]:
@tool
def search_book_metadata(title: str = None, author: str = None, genre: str = None) -> list:
    """
    Search books by metadata such as title, author and genre (exact match; all provided filters must match). Returns a list of dictionaries with the following keys: title, author, genre

    Args:
        title: The title of the book to be searched
        author: The author to be searched
        genre: The genre of the book to be searched

    Returns:
        list: a list of at most 10 dictionaries with the following keys: title, author, genre. If no filter is given or nothing matches, a single-element list holding an explanatory message.

    Example:
        result = search_book_metadata("Lord of the Rings", "Tolkien", "Fantasy")
    """

    top_k = 10000   # upper bound on the number of chunk records fetched
    n_desired = 10  # maximum number of distinct books returned

    # One equality condition per non-empty filter. The genre filter is
    # matched against the "category" metadata field of the stored chunks.
    conditions = [
        {field: {"$eq": value}}
        for field, value in (("title", title), ("author", author), ("category", genre))
        if value
    ]

    if not conditions:
        return ["Please provide at least one of: title, author, or genre."]

    # Chroma accepts exactly one top-level key in a `where` filter, so
    # several conditions have to be wrapped in an explicit "$and".
    where_clause = conditions[0] if len(conditions) == 1 else {"$and": conditions}

    # Only metadata is needed here; skip loading the chunk documents.
    results = collection.get(
        where=where_clause,
        limit=top_k,
        include=["metadatas"],
    )

    if not results["metadatas"]:
        return ["No books found matching that criteria."]

    # Every book is stored as many chunks sharing the same metadata:
    # collapse them to one entry per title, in the order they were returned.
    seen_titles = set()
    unique_results = []

    for meta in results["metadatas"]:
        book_title = meta.get("title")
        if book_title in seen_titles:
            continue
        seen_titles.add(book_title)
        unique_results.append(
            {
                "title": book_title,
                "author": meta.get("author"),
                "genre": meta.get("category"),
            }
        )
        if len(unique_results) == n_desired:
            break

    return unique_results


In [None]:
# Manual check of the metadata tool; alternative sample calls kept below.
#print(search_book_metadata('', '', 'Fantasy'))
#print(search_book_metadata(title="Robinson Crusoe", author="Daniel Defoe", genre="Adventure"))
genre_matches = search_book_metadata(title="", author="", genre="Science Fiction")
print(genre_matches)

In [None]:
@tool
def search_book_embedding(query: str) -> str:
    """
    Perform a chroma query on the book collection. Returns a concatenated string of the results to be used as context.

    Args:
        query: A semantic search on book content

    Returns:
        str: a string of all results for the search, separated by blank lines, to be used as context.

    Example:
        result = search_book_embedding("Who is Gandalf?")
    """

    top_k = 5  # Number of top results to return

    # The query response already carries the matching chunk texts, so there
    # is no need to fetch each hit again by id.
    results = collection.query(
        query_texts=[query],
        n_results=top_k,
        include=["documents"],
    )

    # One query text was sent, so the hits are in the first (only) result list.
    passages = results["documents"][0]

    # Separate chunks with a blank line so the end of one passage does not
    # run straight into the start of the next.
    return "\n\n".join(passage for passage in passages if passage)


In [None]:
# Manual check of the semantic-search tool.
crusoe_context = search_book_embedding('Who is Robinson Crusoe?')
print(crusoe_context)

## Agentic AI Implementation

In [None]:
#Initialize

# Load environment variables from a .env file, if one is present.
result = load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Report only whether the key was found. Printing the key itself would store
# the secret in the notebook output, which is easily saved and shared.
print('OPENAI_API_KEY loaded' if OPENAI_API_KEY else 'OPENAI_API_KEY is not set')
model = 'gpt-4.1-mini'

In [None]:
# LLM backend the agent uses to plan and generate its steps.
agent_model = OpenAIServerModel(api_key=OPENAI_API_KEY, model_id=model)

# The agent is restricted to the two retrieval tools defined in this notebook
# (no smolagents base tools).
tools = [search_book_metadata, search_book_embedding]
agent = CodeAgent(model=agent_model, tools=tools, add_base_tools=False)

In [None]:
# Run the agent on a sample question; other sample prompts kept for reference.

#agent.run('Who is Robinson Crusoe? Do not check the internet, only use Tools', max_steps=5)
#agent.run('Any Mystery novels to recommend?', max_steps=5)
#agent.run('Who wrote The Invisible Man?', max_steps=5)
#agent.run('What else did H.G. Wells write?', max_steps=5)
question = 'What happened at the end of the book Robinson Crusoe? Do not check the internet, only use Tools'
# Left as the last bare expression so the notebook displays the returned value.
agent.run(question, max_steps=5)



In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# UI elements: an output area for the agent's log, a submit button and a
# free-text box for the user's query.
output = widgets.Output()
button = widgets.Button(description="Submit")
text_box = widgets.Text(
    description='Query:',
    placeholder='Enter your book preference...',
    value='',
    disabled=False,
)

# Define callback
def on_button_clicked(b):
    """Run the agent on the text box content and show its output.

    Args:
        b: The button instance passed by the click event (unused).
    """
    query = text_box.value.strip()
    with output:
        clear_output()  # Clears previous output every time you press submit
        if not query:
            # Do not spend an LLM call on an empty request.
            print("Please enter a query before submitting.")
            return
        agent.run(query, max_steps=5)

# Register the click handler on the submit button.
button.on_click(on_button_clicked)

# Render the query box, the button and the output area, in that order.
ui_elements = (text_box, button, output)
display(*ui_elements)
