# Team 2 - NUS ISS Assignment - RAG

Context: We want to build an LLM-based bookstore assistant that uses RAG (retrieval-augmented generation) to recommend books to users according to their interests.

In [38]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, GenerationConfig

In [35]:
# Set up the LLM used to generate the final answer.
# We are using google/flan-t5-small, loaded through the seq2seq model class.
model_name = "google/flan-t5-small"

# Create the tokenizer and the model from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [10]:
# Load the content of one EPUB file into LangChain documents.
# Other paths previously tried here (swap epub_path to use one of them):
#   data/charles-dickens_a-christmas-carol.epub
#   data/the_gift_of_the_magi.epub
#   data/the_happy_prince.epub
epub_path = 'data/the_nightingale_and_the_rose.epub'
epub_loader = UnstructuredEPubLoader(file_path=epub_path)
doc = epub_loader.load()

# Show what was loaded
print(doc)

  data file translations/en.yaml not found



[Document(metadata={'source': 'data/the_nightingale_and_the_rose.epub'}, page_content='The Nightingale and the Rose.\n\nDecorative graphic of young man lying on grass\n\n“She said that she would dance with me if I brought her red roses,” cried the young Student; “but in all my garden there is no red rose.”\n\nFrom her nest in the holm-oak tree the Nightingale heard him, and she looked out through the leaves, and wondered.\n\n“No red rose in all my garden!” he cried, and his beautiful eyes filled with tears. “Ah, on what little things does happiness depend! I have read all that the wise men have written, and all the secrets of philosophy are mine, yet for want of a red rose is my life made wretched.”\n\n“Here at last is a true lover,” said the Nightingale. “Night after night have I sung of him, though I knew him not: night after night have I told his story to the stars, and now I see him. His hair is dark as the hyacinth-blossom, and his lips are red as the rose of his desire; but passi

In [None]:
# Split the loaded document into overlapping chunks small enough to embed
# and retrieve individually.
chunks_size, chunks_overlap = 1024, 50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunks_size, chunk_overlap=chunks_overlap)
chunks = text_splitter.split_documents(doc)

In [36]:
# Name of the model handed to SentenceTransformerEmbeddingFunction to embed text
embed_model_name = 'BAAI/bge-small-en-v1.5'

In [40]:
# Build the embedding function from the configured embedding model name
embed_model_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

In [41]:
# Prepare the chunks for insertion into Chroma: keep only the text of each chunk
texts = [d.page_content for d in chunks]

# Generate a primary key for each text. The full UUID is kept on purpose:
# truncating it to 8 characters leaves only 32 bits of randomness, which makes
# duplicate ids possible on larger corpora, and ids must be unique when added
# to the collection.
texts_ids = [str(uuid4()) for _ in texts]

In [42]:
# Create an ephemeral Chroma client and reset the target collection
collection_name = 'epub'

# Create a Chroma client
chroma_client = chromadb.Client()

# Create the embedding function.
# It must be built from embed_model_name (the sentence-embedding model), not
# model_name: model_name is the flan-t5 checkpoint used for text generation.
embed_model_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

try:
    # Clean up any collection left over from a previous run
    chroma_client.delete_collection(name=collection_name)
except Exception:
    # Best effort: the delete is expected to fail when the collection does not
    # exist yet (e.g. first run on a fresh client), which is fine to ignore.
    pass

In [43]:
# Insert the chunks into the collection.
# get_or_create_collection (instead of create_collection) keeps this cell
# re-runnable: running it a second time reuses the existing collection rather
# than raising because the name is already taken.
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=embed_model_func
)

# Only load the chunks when the collection is still empty, so that re-running
# the cell does not try to insert the same ids again.
if collection.count() == 0:
    print("Inserting chunks document into Chroma collection...")
    collection.add(
        documents=texts,
        ids=texts_ids
    )

print(f"Number of documents in collection '{collection_name}': {collection.count()}")

Inserting chunks document into Chroma collection...
Number of documents in collection 'epub': 14


In [47]:
# Retrieve the chunks most similar to the question and build the LLM prompt
top_k = 5
# NOTE(review): this question is about "A Christmas Carol", while the EPUB
# loaded earlier in this notebook is "The Nightingale and the Rose" — load the
# matching book or change the question, otherwise the retrieved context cannot
# contain the answer.
question = "Who is Scrooge?"

results = collection.query(
    query_texts=[question],
    n_results=top_k
)

# query() already returns the matched documents (one list per query text), so
# read them straight from the result instead of issuing one collection.get()
# per id. This also avoids shadowing the built-in `id` and overwriting `doc`,
# which still has to hold the loaded EPUB documents for the splitter cell.
retrieved_texts = results['documents'][0]
context = "".join(f"{text}\n" for text in retrieved_texts)

# Create the LLM query here
llm_query = f"Answer based on context :\n\n {context}\nTop {top_k} results from the ChromaDb database based on the question\n{question}"


In [48]:
# Pass the query to the LLM to retrieve the results

# NOTE(review): with top_k=1 only the single most likely token survives the
# top-k filter, so sampling is effectively greedy and temperature has no
# practical effect — confirm whether a larger top_k was intended.
config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_k=1
)

# Named input_ids rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
input_ids = tokenizer(llm_query, return_tensors="pt").input_ids

enc_summary = model.generate(input_ids, generation_config=config)

# Decode the first (only) generated sequence back to plain text
summary = tokenizer.decode(enc_summary[0], skip_special_tokens=True)
print(summary)


a student
