# RAG Implementation

## Import Libraries

In [1]:
import textwrap
import chromadb
import numpy as np
import pandas as pd
import re

from pypdf import PdfReader

import vertexai
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part,
)
from vertexai.language_models import TextEmbeddingModel
from langchain.llms import VertexAI

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain_community.embeddings import VertexAIEmbeddings

import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

from IPython.display import Markdown
from utils.helper_utils import word_wrap

## Data Preprocessing

In [2]:
# Open the whitepaper PDF and pull the raw text out of each page.
reader = PdfReader("data_similar/indonesia_personal_data_protection_googlecloud_whitepaper.pdf")

# Strip surrounding whitespace from every page and drop the pages that end up
# empty, in a single pass.
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]

In [3]:
# Stage 1: character-based chunking of the whole document (pages joined by a
# blank line), targeting chunks of 1000 characters with no overlap.
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)
character_split_texts = character_splitter.split_text("\n\n".join(pdf_texts))

# Stage 2: re-split every character chunk with the token-based splitter
# (tokens_per_chunk=256, no overlap) and flatten the results into one list.
token_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=256, chunk_overlap=0)

token_split_texts = [
    token_chunk
    for character_chunk in character_split_texts
    for token_chunk in token_splitter.split_text(character_chunk)
]

In [4]:
# Embed every chunk with the SentenceTransformer embedding function and store
# the chunks in a Chroma collection.
embedding_function = SentenceTransformerEmbeddingFunction()

chroma_client = chromadb.Client()

# get_or_create_collection (instead of create_collection) keeps this cell
# re-runnable: create_collection raises when the collection already exists in
# the current kernel.
chroma_collection = chroma_client.get_or_create_collection(
    "indonesia_personal_data_protection_googlecloud_whitepaper",
    embedding_function=embedding_function,
)

# Chunk ids are simply the chunk positions rendered as strings.
ids = [str(i) for i in range(len(token_split_texts))]

# upsert (instead of add) overwrites entries that already exist under the same
# id, so re-running the cell refreshes the collection rather than failing or
# keeping outdated text.
# NOTE: if a re-run produces fewer chunks than before, entries with higher ids
# from the previous run remain; restart the kernel to rebuild from scratch.
chroma_collection.upsert(ids=ids, documents=token_split_texts)

In [5]:
# Cross-encoder model used to score (query, document) pairs for re-ranking.
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

## Prompt Setting and Augmentation

In [6]:
def make_prompt(query, relevant_passage):
    """
    Build the question-answering prompt that is sent to the LLM.

    Args:
        query: The user's question that needs to be answered.
        relevant_passage: Reference text for answering the query. Either a
            single string, or a list/tuple of passages (for example the
            re-ranked chunks returned by the retriever); a list/tuple is
            joined with blank lines before being inserted into the prompt.
    Returns:
        The formatted prompt string that can be used with an LLM to generate an answer.
    """
    # A list would otherwise be formatted as its Python repr (brackets, quotes
    # and escaped newlines), which pollutes the prompt. Join it into plain
    # text instead; plain strings are passed through unchanged.
    if isinstance(relevant_passage, (list, tuple)):
        relevant_passage = "\n\n".join(str(passage) for passage in relevant_passage)

    prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike a friendly and converstional tone. \
    If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'
    
    ANSWER:
    """).format(query=query, relevant_passage=relevant_passage)

    return prompt

def remove_hyphens(text):
    """
    Delete every hyphen character ('-') from a string.
    Args:
        text: The string to clean.
    Returns:
        A copy of `text` with all hyphens removed; every other character is left as is.
    """
    hyphen_pattern = re.compile(r'-')
    return hyphen_pattern.sub('', text)

def augment_multiple_query(query, model, topic):
    """
    Ask the LLM for additional questions related to the original query.

    Args:
        query: The original user query.
        model: Name of the generative model to use (e.g. 'gemini-pro'), or an
            already constructed model object exposing `generate_content`.
        topic: The topic associated with the original query.
    Returns:
        A list of non-empty strings, one per line of the model's answer, with
        hyphens removed and surrounding whitespace stripped.
    """
    prompt = ("""Suggest up to five additional short, related questions to help them find the information they need, covering different aspects of the topic. Output one question per line. Do not hyphen or number the questions.
    QUESTION: '{query}'
    TOPIC: '{topic}'
    
    ANSWER:
    """).format(query=query, topic=topic)

    # Callers in this notebook pass a model name; only build a GenerativeModel
    # in that case so a ready-made model object can be reused as well.
    llm = GenerativeModel(model) if isinstance(model, str) else model
    response_text = llm.generate_content(prompt).text

    clean_text = remove_hyphens(response_text)

    # Skip blank lines: an empty string is not a useful retrieval query.
    return [line.strip() for line in clean_text.splitlines() if line.strip()]

def get_augment_multiple_rerank_retrived(
    original_query,
    model,
    topic,
    chroma_collection,
    n_results=5,
    reranker=None,
):
    """
    Retrieve documents for a query plus LLM-generated related queries, then
    re-rank the unique documents against the original query.

    Args:
        original_query: The user's initial question.
        model: Model argument forwarded to `augment_multiple_query`
            (the notebook passes a model name such as 'gemini-pro').
        topic: The topic associated with the original query.
        chroma_collection: Collection object whose `query` method is used to
            retrieve documents for every query.
        n_results: Number of documents requested per query (default 5).
        reranker: Optional object with a `predict(pairs)` method that scores
            [query, document] pairs. Defaults to the notebook-level
            `cross_encoder`, so the model is not reloaded on every call.
    Returns:
        A list of the unique retrieved documents, ordered from highest to
        lowest re-ranking score. Empty if nothing was retrieved.
    """
    augmented_queries = augment_multiple_query(original_query, model, topic)

    queries = [original_query] + augmented_queries
    results = chroma_collection.query(query_texts=queries, n_results=n_results)

    # results['documents'] holds one list of documents per query. dict.fromkeys
    # removes duplicates while keeping first-seen order, so the candidate order
    # is the same on every run (a set would not guarantee that).
    unique_documents = list(dict.fromkeys(
        document
        for documents in results['documents']
        for document in documents
    ))
    if not unique_documents:
        return []

    if reranker is None:
        reranker = cross_encoder

    # Score every candidate against the *original* query only.
    pairs = [[original_query, document] for document in unique_documents]
    scores = reranker.predict(pairs)

    # argsort is ascending; reverse it to get the best-scoring document first.
    ranked_indices = np.argsort(scores)[::-1]
    return [unique_documents[index] for index in ranked_indices]

## Query Test

In [7]:
# First test question: retrieve candidate chunks and re-rank them against it.
original_query = "what is indonesia pdp law?"
passage = get_augment_multiple_rerank_retrived(
    original_query=original_query,
    model='gemini-pro',
    topic='Process',
    chroma_collection=chroma_collection,
)

In [8]:
# Build the prompt from the retrieved passages, ask the model, and render the
# reply as Markdown (the last expression is the cell output).
prompt = make_prompt(query=original_query, relevant_passage=passage)
model = GenerativeModel("gemini-pro")
answer = model.generate_content(prompt)
Markdown(answer.text)

The Indonesia Personal Data Protection Law (PDPL) was enacted on October 17, 2022. It regulates the collection, use, disclosure, storage, and disposal of personal data by organizations operating in Indonesia. The law aims to protect the privacy rights of individuals and ensure the responsible handling of personal data. The PDPL applies to organizations that process personal data of Indonesian citizens or residents, regardless of the location of the organization. It establishes obligations for organizations to obtain consent from individuals before collecting their personal data, to protect the data from unauthorized access or use, and to dispose of it securely when it is no longer needed. The PDPL also provides rights to individuals to access, correct, or delete their personal data, and to complain to the Indonesian Personal Data Protection Commission if they believe their rights have been violated.

In [11]:
# Second test question: retrieve + re-rank, then generate and render the answer.
original_query = "what is controller and processor?"
passage = get_augment_multiple_rerank_retrived(
    original_query=original_query,
    model='gemini-pro',
    topic='Process',
    chroma_collection=chroma_collection,
)

prompt = make_prompt(query=original_query, relevant_passage=passage)
model = GenerativeModel("gemini-pro")
answer = model.generate_content(prompt)
Markdown(answer.text)

A 'controller' is a party that processes data on its own or together with others, in order to fulfill purposes it has designated. A 'processor' is a natural or legal person who processes personal data on behalf of the controller. The controller is legally responsible for compliance with the law, determining the purpose and means of processing, and ensuring that the processor acts in accordance with the controller's instructions.

In [10]:
# Third test question: retrieve + re-rank, then generate and render the answer.
original_query = "Who enacted the pdp law?"
passage = get_augment_multiple_rerank_retrived(
    original_query=original_query,
    model='gemini-pro',
    topic='Process',
    chroma_collection=chroma_collection,
)

prompt = make_prompt(query=original_query, relevant_passage=passage)
model = GenerativeModel("gemini-pro")
answer = model.generate_content(prompt)
Markdown(answer.text)

I apologize, but I cannot provide an answer to your question based on the information provided in the passage. There is no specific mention of who enacted the pdp law.