In [None]:
# Install packages 
# Run this if your kernel packages is empty.
# Leverage on https://colab.research.google.com/github/GSCCLIVE/nus-iss-rag-assignment/blob/main/test.ipynb#scrollTo=eufh-4yZlCOn
!pip3 install --upgrade pip
!pip install chromadb pypandoc
!pip install transformers datasets evaluate rouge_score loralib peft 
!pip3 install ipykernel ipywidgets 
!pip3 install langchain-community sentence-transformers unstructured
!pip3 install diffusers accelerate scipy safetensors
!pip3 install torch torchdata torchvision
!pip3 install smolagents openai
!pip3 install nbconvert[webpdf]
!pip3 huggingface_hub[hf_xet]

!pip3 install unstructured 
!pip3 install pandas networkx openpyxl
!pip3 install python-magic python-pptx
!pip3 install docx2txt docx
!pip3 install jq nltk
!pip3 install duckduckgo_search

In [2]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from datasets import load_dataset

import pprint

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, GenerationConfig

In [None]:
# Load the books dataset (train split) and preview its first record.
dataset = load_dataset("IsmaelMousa/books", split="train")
pp = pprint.PrettyPrinter(indent=2)

# A row carries the whole book text in its "EN" field (it is what gets chunked later),
# so printing the raw row floods the cell output. Truncate long string fields for the
# preview only; `dataset` itself is left untouched.
PREVIEW_CHARS = 200
first_row_preview = {
    field: (value[:PREVIEW_CHARS] + "..." if isinstance(value, str) and len(value) > PREVIEW_CHARS else value)
    for field, value in dataset[0].items()
}
pp.pprint(first_row_preview)

In [6]:
# Embedding function used by the vector store; it wraps the
# sentence-transformers model identified by `embed_model_name`.
embed_model_name = "all-MiniLM-L6-v2"
embed_model = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embed_model_name,
)

In [None]:
# Split every book's text into overlapping chunks and collect, in three
# parallel lists, the chunk text, its book-level metadata and a unique id.

chunks_size = 1024
chunks_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunks_size, chunk_overlap=chunks_overlap)

all_chunk_texts = []
all_chunk_metadatas = []
all_chunk_ids = []

for book_idx, book in enumerate(dataset):
    # Only the book text is chunked; title/author/category travel as metadata.
    pieces = text_splitter.split_text(book['EN'])
    for piece_idx, piece in enumerate(pieces):
        all_chunk_texts.append(piece)
        all_chunk_metadatas.append(
            {field: book[field] for field in ("title", "author", "category")}
        )
        # Id format is "<row index>_<chunk index within the row>".
        all_chunk_ids.append(f"{book_idx}_{piece_idx}")

# Report how many chunks were produced in total.
print(len(all_chunk_texts))


In [None]:
# Create the ChromaDB collection and add the chunks to it in bounded batches.
collection_name = 'books'
client = chromadb.Client()

# Drop any collection left over from an earlier run of this cell before recreating it.
try:
    client.delete_collection(name=collection_name)
except Exception as exc:
    # Best-effort: the collection may simply not exist yet. Report the reason instead of
    # discarding it silently, so a genuine failure is visible in the cell output.
    print(f"Could not delete collection '{collection_name}' (continuing): {exc}")

collection = client.create_collection(
    name=collection_name,
    embedding_function=embed_model,
)

batch_size = 128  # Number of dataset rows to process at once
chunk_batch_size = 128  # Number of chunks to add to the collection at once


def _add_chunks(target_collection, texts, metadatas, ids):
    """Add one sub-batch of chunks to ``target_collection``; does nothing when ``texts`` is empty."""
    if texts:
        target_collection.add(documents=texts, ids=ids, metadatas=metadatas)


for batch_start in range(0, len(dataset), batch_size):
    batch_end = min(batch_start + batch_size, len(dataset))
    batch = dataset.select(range(batch_start, batch_end))
    all_chunk_texts = []
    all_chunk_metadatas = []
    all_chunk_ids = []
    # `start=batch_start` keeps `i` equal to the row's index in the full dataset,
    # so ids stay unique across row batches.
    for i, row in enumerate(batch, start=batch_start):
        chunks = text_splitter.split_text(row['EN'])
        row_metadata = {
            "title": row["title"],
            "author": row["author"],
            "category": row["category"]
        }
        for j, chunk in enumerate(chunks):
            all_chunk_texts.append(chunk)
            # Each chunk gets its own copy of the book-level metadata.
            all_chunk_metadatas.append(dict(row_metadata))
            all_chunk_ids.append(f"{i}_{j}")
            # Flush as soon as a full sub-batch has accumulated.
            if len(all_chunk_texts) >= chunk_batch_size:
                _add_chunks(collection, all_chunk_texts, all_chunk_metadatas, all_chunk_ids)
                all_chunk_texts = []
                all_chunk_metadatas = []
                all_chunk_ids = []
    # Add any remaining chunks from this row batch.
    _add_chunks(collection, all_chunk_texts, all_chunk_metadatas, all_chunk_ids)
    print(f"Processed batch {batch_start} to {batch_end}")

In [22]:
# LLM setup: the checkpoint named below is loaded twice from the same id,
# once as a tokenizer and once as a sequence-to-sequence model.
model_name = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
)

In [29]:
# Ask the collection for the 5 stored chunks that best match the question.
query = "What is the genre of the book 'The Picture of Dorian Gray'?"

results = collection.query(query_texts=[query], n_results=5)

In [None]:
# Pretty-print the raw response of the collection query using the
# PrettyPrinter instance `pp` created when the dataset was loaded.
pp.pprint(results)