In [None]:
# Recommended to be updated on daily basis because of the fast development
!pip install --upgrade langchain

In [None]:
from langchain.llms import VertexAI
import vertexai
from code_secrets import cridentials

# Replace with your project ID and region
PROJECT_NAME = cridentials.get('PROJECT_NAME')
LOCATION = cridentials.get('LOCATION')

vertexai.init(project=PROJECT_NAME, location=LOCATION)

In [None]:
from langchain.llms import VertexAI

# temperature range from 0 to 1 and lower values mean more conservative
# top_p range from 0 to 1 and lower values mean more conservative
# top_k range from 0 to 40 and lower values mean more conservative

llm = VertexAI(
    model_name="gemini-pro",
    max_output_tokens=200,
    temperature=0.3,
    top_p=0.5,
    top_k=8
)

### Chroma Versions Approach (Works Partially)

In [197]:
from helper_utils import word_wrap

In [3]:
from pypdf import PdfReader

reader = PdfReader("indonesia_personal_data_protection_googlecloud_whitepaper.pdf")
pdf_texts = [p.extract_text().strip() for p in reader.pages]

# Filter the empty strings
pdf_texts = [text for text in pdf_texts if text]

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

In [5]:
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0
)
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

In [6]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

In [7]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction()

In [8]:
# # Use to remove collection
# chroma_client.delete_collection("Data_Governance_Data_Extermination_BAPD")

In [9]:
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("indonesia_personal_data_protection_googlecloud_whitepaper", embedding_function=embedding_function)

ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)

In [None]:
query = "What are the first process of BAPD?"

results = chroma_collection.query(query_texts=[query], n_results=5)
retrieved_documents = results['documents'][0]

for document in retrieved_documents:
    print(document)

In [None]:
from langchain.llms import VertexAI
import vertexai
from code_secrets import cridentials

# Replace with your project ID and region
PROJECT_NAME = cridentials.get('PROJECT_NAME')
LOCATION = cridentials.get('LOCATION')

vertexai.init(project=PROJECT_NAME, location=LOCATION)

# temperature range from 0 to 1 and lower values mean more conservative
# top_p range from 0 to 1 and lower values mean more conservative
# top_k range from 0 to 40 and lower values mean more conservative

llm = VertexAI(
    model_name="gemini-pro",
    max_output_tokens=200,
    temperature=0.3,
    top_p=0.5,
    top_k=8
)

In [None]:
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

In [None]:
from vertexai.preview.generative_models import GenerativeModel, ChatSession

model = GenerativeModel("gemini-pro")
chat = model.start_chat()

def get_chat_response(chat: ChatSession, prompt: str, retrieved_documents=retrieved_documents) -> str:

    parameters = {
        "temperature": 0.3,  # Temperature controls the degree of randomness in token selection.
        "max_output_tokens": 200,  # Token limit determines the maximum amount of text output.
        "top_p": 0.5,  # Tokens are selected from most probable to least until the sum of their probabilities equals the top_p value.
        "top_k": 8,  # A top_k of 1 means the selected token is the most probable among all tokens.
    }
    
    information = "\n\n".join(retrieved_documents)
    response = chat.send_message(prompt, generation_config={"temperature": 0})
    return response.text

prompt = "Hello."
print(get_chat_response(chat, prompt))

In [None]:
output = get_chat_response(chat, prompt=query, retrieved_documents=retrieved_documents)

print(word_wrap(output))

### Use Gemini Approach

Reference

https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/intro_multimodal_rag.ipynb

https://github.com/karndeepsingh/ApplicationsBuildWithLLMs/blob/main/Langchain_With_Gemini_And_Build_RAG.ipynb

https://cloud.google.com/blog/products/ai-machine-learning/generative-ai-applications-with-vertex-ai-palm-2-models-and-langchain

https://ai.google.dev/examples/vectordb_with_chroma

https://ai.google.dev/examples/doc_search_emb

https://colab.research.google.com/drive/1xdosZ6bScn5oHnFzGeReMCLXQWK7Inpf?usp=sharing

https://python.langchain.com/docs/use_cases/question_answering/

In [33]:
import textwrap
import chromadb
import numpy as np
import pandas as pd
import re

from pypdf import PdfReader

import vertexai
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part,
)
from vertexai.language_models import TextEmbeddingModel
from langchain.llms import VertexAI
from code_secrets import cridentials

# Replace with your project ID and region
PROJECT_NAME = cridentials.get('PROJECT_NAME')
LOCATION = cridentials.get('LOCATION')

vertexai.init(project=PROJECT_NAME, location=LOCATION)

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain_community.embeddings import VertexAIEmbeddings

import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

from IPython.display import Markdown
from helper_utils import word_wrap

In [34]:
reader = PdfReader("indonesia_personal_data_protection_googlecloud_whitepaper.pdf")
pdf_texts = [p.extract_text().strip() for p in reader.pages]

# Filter the empty strings
pdf_texts = [text for text in pdf_texts if text]

In [35]:
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0
)
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

In [36]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

In [37]:
embedding_function = SentenceTransformerEmbeddingFunction()

In [38]:
# Use to remove collection
chroma_client.delete_collection("indonesia_personal_data_protection_googlecloud_whitepaper")

In [39]:
def create_chroma_db(documents, name):
    chroma_client = chromadb.Client()
    db = chroma_client.create_collection(name=name, embedding_function=embedding_function())
    for i, d in enumerate(documents):
        db.add(
            documents=d,
            ids=str(i)
        )
    return db

In [40]:
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("Data_Governance_Data_Extermination_BAPD", embedding_function=embedding_function)

ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)

In [41]:
pd.DataFrame(chroma_collection.peek(3))

Unnamed: 0,ids,embeddings,metadatas,documents,uris,data
0,0,"[-0.08090899884700775, -0.02253359742462635, 0...",,google cloud whitepaper mar ch 2023 i n d o n ...,,
1,1,"[-0.02525135688483715, -0.03832373768091202, 0...",,g o o g l e c l o u d t a b l e o f c o n t e ...,,
2,10,"[-0.04223982244729996, -0.04441278055310249, 0...",,p r o c e s s o r s ” ( “ p r o c e s s o r s ...,,


In [42]:
def get_relevant_passage(query, db):
    passage = db.query(query_texts=[query], n_results=5)['documents'][0]
    return passage

In [43]:
# Perform embedding search
passage = get_relevant_passage("Data Governance", chroma_collection)
print(passage)

['g o o g l e c l o u d e n u m e r a t e d s e t o f p r o c e s s i n g p r i n c i p l e s, i n c l u d i n g t h a t o r g a n i z a t i o n s m u s t n o t i f y d a t a s u b j e c t s o f t h e p u r p o s e s f o r w h i c h t h e y p r o c e s s p e r s o n a l d a t a, m u s t p r o c e s s p e r s o n a l d a t a i n a l i m i t e d, s p e c i ﬁ c, t r a n s p a r e n t, a n d l a w f u l m a n n e r, a n d m u s t pr otect the security of personal data fr om unauthoriz ed access, unauthoriz ed disclosur e, unauthoriz ed alter ation, misuse,', "o y o u a b o u t y o u r d a t a y o u r d a t a i s c r i t i c a l t o y o u r b u s i n e s s, a n d y o u t a k e g r e a t c a r e t o k e e p i t s a f e a n d u n d e r y o u r c o n t r o l. w e w a n t y o u t o f e e l c o n ﬁ d e n t t h a t t a k i n g a d v a n t a g e o f g o o g l e w o r k s p a c e a n d g o o g l e c l o u d s e r v i c e s d o e sn't r e q u i r e y o u t o c o m p r o m i s e o n s e c u r i t y o

In [44]:
def make_prompt(query, relevant_passage):
    # escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
    # escaped = "\n\n".join(relevant_passage)
    prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike a friendly and converstional tone. \
    If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'
    
    ANSWER:
    """).format(query=query, relevant_passage=relevant_passage)
    
    return prompt

In [47]:
query = "What is indonesia pdp law?"
prompt = make_prompt(query, passage)
Markdown(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below.     Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.     However, you are talking to a non-technical audience, so be sure to break down complicated concepts and     strike a friendly and converstional tone.     If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: 'What is indonesia pdp law?'
    PASSAGE: '['g o o g l e c l o u d e n u m e r a t e d s e t o f p r o c e s s i n g p r i n c i p l e s, i n c l u d i n g t h a t o r g a n i z a t i o n s m u s t n o t i f y d a t a s u b j e c t s o f t h e p u r p o s e s f o r w h i c h t h e y p r o c e s s p e r s o n a l d a t a, m u s t p r o c e s s p e r s o n a l d a t a i n a l i m i t e d, s p e c i ﬁ c, t r a n s p a r e n t, a n d l a w f u l m a n n e r, a n d m u s t pr otect the security of personal data fr om unauthoriz ed access, unauthoriz ed disclosur e, unauthoriz ed alter ation, misuse,', "o y o u a b o u t y o u r d a t a y o u r d a t a i s c r i t i c a l t o y o u r b u s i n e s s, a n d y o u t a k e g r e a t c a r e t o k e e p i t s a f e a n d u n d e r y o u r c o n t r o l. w e w a n t y o u t o f e e l c o n ﬁ d e n t t h a t t a k i n g a d v a n t a g e o f g o o g l e w o r k s p a c e a n d g o o g l e c l o u d s e r v i c e s d o e sn't r e q u i r e y o u t o c o m p r o m i s e o n s e c u r i t y o r c o n t r o l o f y o u r", 'g o o g l e c l o u d c o n c l u s i o n a t g o o g l e, w e r e c o g n i z e t h a t y o u r d a t a i s y o u r s o n l y a n d g u a r a n t e e i n g t h e p r i v a c y o f y o u r d a t a i s k e y. t h e p r o t e c t i o n o f y o u r d a t a i s a p r i m a r y d e s i g n c o n s i d e r a t i o n f o r a l l o u r i n f r a s t r u c t u r e, p r o d u c t s a n d p e r s o n n e l o p e r a t i o n s. w e b e l i e v e t h a t g o o g l e c a n o f f e r a l e v e l o f p r o t e c t i o', 'd a t a w e p r o c e s s a c c o r d i n g t o y o u r g o o g l e c l o u d a g r e e m e n t ( s ). 1 i n t h i s w h i t e p a p e r, “ y o u / y o u r ” r e f e r s t o g o o g l e c l o u d a n d g o o g l e w o r k s p a c e c u s t o m e r s a s w e l l a s g o o g l e c l o u d p a r t n e r s. u n l e s s i n d i c a t e d o t h e r w i s e, r e f e r e n c e s t o “ c u s t o m e r s ” w i l l i n c l u d e g o o g l e c l o u d p a r t n e r s a n d r e f e r e n c e s t o “ c u s t o m e r', 'g o o g l e c l o u d c o n t r o l o f r e s o u r c e p e r m i s s i o n s. f o r e x a m p l e, u s i n g c l o u d i d e n t i t y a n d a c c e s s m a n a g e m e n t, c u s t o m e r s c a n m a p j o b f u n c t i o n s t o g r o u p s a n d r o l e s s o u s e r s o n l y a c c e s s t h e d a t a t h e y n e e d t o g e t t h e j o b d o n e. f u r t h e r m o r e, c u s t o m e r s m a y d e l e t e c u s t o m e r d a t a f r o m o u r s y s t e m s o r t a k e i t w i t h t h e m i f t h']'
    
    ANSWER:
    

In [48]:
model = GenerativeModel('gemini-pro')
answer = model.generate_content(prompt)
Markdown(answer.text)

I apologize, but I can't answer that question. The text provided does not contain any information about 'Indonesia PDP law'.

### Combination with Chromadb

#### Expansion with generated answers

In [200]:
embeddings = chroma_collection.get(include=['embeddings'])['embeddings']

In [243]:
def remove_hyphens(text):
    return re.sub(r'-', '', text)

def augment_multiple_query(query, model, topic):
    prompt = ("""Suggest up to five additional short, related questions to help them find the information they need, covering different aspects of the topic. Output one question per line. Do not hyphen or number the questions.
    QUESTION: '{query}'
    TOPIC: '{topic}'
    
    ANSWER:
    """).format(query=query, topic=topic)
    
    model = GenerativeModel(model)
    answer = model.generate_content(prompt)
    
    answer = answer.text
    answer.split("\n")

    clean_text = remove_hyphens(answer)
    sentences = [line for line in clean_text.splitlines()]
    
    return sentences

def get_augment_multiple_retrived(original_query, model, topic, chroma_collection):
    
    augmented_queries = augment_multiple_query(original_query, model, topic)
    
    queries = [original_query] + augmented_queries
    results = chroma_collection.query(query_texts=queries, n_results=5)
    
    retrieved_documents = results['documents']
    
    unique_documents = set()
    for documents in retrieved_documents:
        for document in documents:
            unique_documents.add(document)
    
    unique_documents = list(unique_documents)
    
    return unique_documents

In [201]:
original_query = "After i got the signing of Business Users?, what are the next process"
augmented_queries = augment_multiple_query(original_query, 'gemini-pro', 'Process')

augmented_queries

[' What is the signoff process for nonBusiness Users?',
 ' What are the different types of documents that need to be signed off on?',
 ' Who is responsible for obtaining signoffs?',
 ' What are the consequences of not obtaining a signoff?',
 ' What is the best way to track the status of signoffs?']

In [245]:
queries = [original_query] + augmented_queries
results = chroma_collection.query(query_texts=queries, n_results=5, include=['documents', 'embeddings'])

In [246]:
retrieved_documents = results['documents']

# Deduplicate the retrieved documents
unique_documents = set()
for documents in retrieved_documents:
    for document in documents:
        unique_documents.add(document)

unique_documents

{'.................................................................. 3 1. environment and data assessment & bapd document drafting process................................... 4 2. data extermination & bapd document finalization process....................................................... 5 3. bapd document approval process................................................................',
 '................................................................................. iii general overview of data governance & policy sop................................................................................... 1 data extermination and bapd document creation sop.....................................................................',
 "1 general overview of data governance & policy sop data governance sop ensure s that data is managed effectively throughout the project's lifecycle. this sop should include clear guidelines and responsibilities for managing data, starting from data collection unt

In [247]:
original_query = "Siapakah yang akan menang pemilu 2024"
passage = get_augment_multiple_retrived(original_query, 'gemini-pro', 'Process', chroma_collection)

In [248]:
prompt = make_prompt(original_query, passage)

In [249]:
model = GenerativeModel('gemini-pro')
answer = model.generate_content(prompt)
Markdown(answer.text)

I'm sorry, but the document you shared does not contain any information about the winner of the 2024 election. Therefore, I can't answer your question.

#### Re-ranking with Query Expansion

In [252]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [253]:
original_query = "Siapakah yang akan menang pemilu 2024"
passage = get_augment_multiple_retrived(original_query, 'gemini-pro', 'Process', chroma_collection)

In [254]:
pairs = []
for doc in passage:
    pairs.append([original_query, doc])

In [255]:
scores = cross_encoder.predict(pairs)

In [257]:
pairs = [[query, doc] for doc in passage]
scores = cross_encoder.predict(pairs)
print("Scores:")
for score in scores:
    print(score)

Scores:
-10.023334
-11.504522
-10.65059
-11.47018
-11.017712
-11.109238
-11.392984
-10.988883
-11.515486


In [258]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o+1)

New Ordering:
1
3
8
5
6
7
4
2
9


In [281]:
def remove_hyphens(text):
    return re.sub(r'-', '', text)

def augment_multiple_query(query, model, topic):
    prompt = ("""Suggest up to five additional short, related questions to help them find the information they need, covering different aspects of the topic. Output one question per line. Do not hyphen or number the questions.
    QUESTION: '{query}'
    TOPIC: '{topic}'
    
    ANSWER:
    """).format(query=query, topic=topic)
    
    model = GenerativeModel(model)
    answer = model.generate_content(prompt)
    
    answer = answer.text
    answer.split("\n")

    clean_text = remove_hyphens(answer)
    sentences = [line for line in clean_text.splitlines()]
    
    return sentences

def get_augment_multiple_rerank_retrived(original_query, model, topic, chroma_collection):
    
    augmented_queries = augment_multiple_query(original_query, model, topic)
    
    queries = [original_query] + augmented_queries
    results = chroma_collection.query(query_texts=queries, n_results=5)
    
    retrieved_documents = results['documents']
    
    unique_documents = set()
    for documents in retrieved_documents:
        for document in documents:
            unique_documents.add(document)
    
    unique_documents = list(unique_documents)
    
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    
    pairs = []
    for doc in unique_documents:
        pairs.append([original_query, doc])
        
    scores = cross_encoder.predict(pairs)
    
    pairs = [[query, doc] for doc in unique_documents]
    scores = cross_encoder.predict(pairs)
    
    sorted_indices = np.argsort(scores)[::-1].tolist()
    reordered_list = sorted(unique_documents, key=lambda x: sorted_indices.index(unique_documents.index(x)))

    return reordered_list

In [282]:
original_query = "After i got the signing of Business Users?, what are the next process"
passage = get_augment_multiple_rerank_retrived(original_query, 'gemini-pro', 'Process', chroma_collection)

In [283]:
prompt = make_prompt(original_query, passage)
model = GenerativeModel('gemini-pro')
answer = model.generate_content(prompt)
Markdown(answer.text)

Following the signing of the Business Users, the next stage is to request the Project Manager to forward the Business Activity Processing Document (BAPD) document to the Department Head for signature. Once the Department Head signs the BAPD, it should be uploaded to both the Project SharePoint and DGE SharePoint. Finally, an email along with a BAPD attachment should be sent to SME ADI, requesting their signature.

### End Result

In [1]:
import textwrap
import chromadb
import numpy as np
import pandas as pd
import re

from pypdf import PdfReader

import vertexai
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part,
)
from vertexai.language_models import TextEmbeddingModel
from langchain.llms import VertexAI
from code_secrets import cridentials

# Replace with your project ID and region
PROJECT_NAME = cridentials.get('PROJECT_NAME')
LOCATION = cridentials.get('LOCATION')

vertexai.init(project=PROJECT_NAME, location=LOCATION)

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain_community.embeddings import VertexAIEmbeddings

import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

from IPython.display import Markdown
from helper_utils import word_wrap

In [2]:
reader = PdfReader("indonesia_personal_data_protection_googlecloud_whitepaper.pdf")
pdf_texts = [p.extract_text().strip() for p in reader.pages]

# Filter the empty strings
pdf_texts = [text for text in pdf_texts if text]

In [3]:
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0
)
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

In [4]:
embedding_function = SentenceTransformerEmbeddingFunction()

chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("indonesia_personal_data_protection_googlecloud_whitepaper", embedding_function=embedding_function)

ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)

In [5]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [8]:
def make_prompt(query, relevant_passage):
    # escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
    # escaped = "\n\n".join(relevant_passage)
    prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike a friendly and converstional tone. \
    If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'
    
    ANSWER:
    """).format(query=query, relevant_passage=relevant_passage)
    
    return prompt

def remove_hyphens(text):
    return re.sub(r'-', '', text)

def augment_multiple_query(query, model, topic):
    prompt = ("""Suggest up to five additional short, related questions to help them find the information they need, covering different aspects of the topic. Output one question per line. Do not hyphen or number the questions.
    QUESTION: '{query}'
    TOPIC: '{topic}'
    
    ANSWER:
    """).format(query=query, topic=topic)
    
    model = GenerativeModel(model)
    answer = model.generate_content(prompt)
    
    answer = answer.text
    answer.split("\n")

    clean_text = remove_hyphens(answer)
    sentences = [line for line in clean_text.splitlines()]
    
    return sentences

def get_augment_multiple_rerank_retrived(original_query, model, topic, chroma_collection):
    
    augmented_queries = augment_multiple_query(original_query, model, topic)
    
    queries = [original_query] + augmented_queries
    results = chroma_collection.query(query_texts=queries, n_results=5)
    
    retrieved_documents = results['documents']
    
    unique_documents = set()
    for documents in retrieved_documents:
        for document in documents:
            unique_documents.add(document)
    
    unique_documents = list(unique_documents)
    
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    
    pairs = []
    for doc in unique_documents:
        pairs.append([original_query, doc])
        
    scores = cross_encoder.predict(pairs)
    
    pairs = [[original_query, doc] for doc in unique_documents]
    scores = cross_encoder.predict(pairs)
    
    sorted_indices = np.argsort(scores)[::-1].tolist()
    reordered_list = sorted(unique_documents, key=lambda x: sorted_indices.index(unique_documents.index(x)))

    return reordered_list

In [9]:
original_query = "what is indonesia pdp law?"
passage = get_augment_multiple_rerank_retrived(original_query, 'gemini-pro', 'Process', chroma_collection)

In [10]:
prompt = make_prompt(original_query, passage)
model = GenerativeModel('gemini-pro')
answer = model.generate_content(prompt)
Markdown(answer.text)

The Personal Data Protection Law of Indonesia (PDP Law), enforced on October 17, 2022, regulates the collection, processing, and responsible use of personal data within Indonesia. This law aims to protect the privacy of individuals and ensure that their personal information is handled in a lawful, fair, and accountable manner.

In [11]:
original_query = "what is controller and processor?"
passage = get_augment_multiple_rerank_retrived(original_query, 'gemini-pro', 'Process', chroma_collection)

In [12]:
prompt = make_prompt(original_query, passage)
model = GenerativeModel('gemini-pro')
answer = model.generate_content(prompt)
Markdown(answer.text)

I apologize, but the passage does not contain any information about controllers and processors. Therefore, I cannot answer your question.