In [1]:
# Import necessary libraries
import pinecone
import vertexai
from vertexai.language_models import TextGenerationModel, TextEmbeddingModel
from langchain.chat_models import ChatVertexAI
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    MessagesPlaceholder
)

In [5]:
import os

# Publish the credentials file path through GOOGLE_APPLICATION_CREDENTIALS so
# that code running later in this kernel can pick it up from the environment.
credential_path = "YOUR CREDENTIAL.json"
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=credential_path)

In [6]:
import vertexai

# Project and region passed to `vertexai.init` below. Both values are
# placeholders and must be replaced before this cell is run.
# NOTE(review): the trailing `# @param` markers look like notebook form-field
# annotations (presumably Colab) — keep them on the assignment lines.
PROJECT_ID = "YOUR_ID"  # @param {type:"string"}
REGION = "YOUR_REGION"  # @param {type:"string"}

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=REGION)

In [9]:
# Module-level text-embedding model; `text_embedding` calls
# `model.get_embeddings` on it. The outputs recorded in this notebook show
# 768-element vectors.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")

In [8]:
import os
import time

from pinecone import Pinecone
from pinecone import ServerlessSpec

# Prefer the PINECONE_API_KEY environment variable so the key never has to be
# pasted into the notebook; the literal placeholder is kept only as a fallback.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "PINECONE API KEY"))
spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)
index_name = 'textbison-rag'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# Create the index only when it is missing, so re-running this cell does not
# attempt to create it a second time.
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        # Must equal the length of the vectors stored in the index, i.e. the
        # output size of the embedding model (textembedding-gecko@003 -> 768).
        # It is unrelated to the text-bison generation model.
        dimension=768,
        metric='cosine',
        spec=spec
    )
    # Poll once per second until Pinecone reports the new index as ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats (last expression, so the notebook displays it)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 67}},
 'total_vector_count': 67}

In [None]:
import pandas as pd

# Read the source CSV and convert every value of the "Id" column with `str`.
dataset = pd.read_csv("prudential.csv").assign(
    Id=lambda frame: frame["Id"].map(str)
)

In [None]:
from tqdm.auto import tqdm  # for progress bar

data = dataset  # this makes it easier to iterate over the dataset

batch_size = 100  # rows sent to Pinecone per upsert call
# Texts sent per embedding request. Kept small because the Vertex AI embedding
# endpoint limits how many inputs one call may carry — TODO confirm the limit
# for your region and raise this value if it allows more.
embed_batch_size = 5

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i + batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # ids, texts and titles for this batch, in row order
    ids = batch['Id'].tolist()
    texts = batch['Content'].tolist()
    titles = batch['Title'].tolist()
    # Embed with the module-level `model`, the same one `text_embedding` uses
    # for queries, so stored vectors and query vectors come from one model.
    # (This cell used to call `embed_model`, a name never defined in this
    # notebook, and failed with NameError.)
    embeds = []
    for j in range(0, len(texts), embed_batch_size):
        chunk = texts[j:j + embed_batch_size]
        embeds.extend(e.values for e in model.get_embeddings(chunk))
    # metadata stored next to each vector; `find_match` reads the 'text' field
    metadata = [
        {'text': text, 'title': title} for text, title in zip(texts, titles)
    ]
    # add to Pinecone as (id, vector, metadata) tuples
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

In [None]:
# Show index statistics after the upsert (the name was misspelled as `ndex`,
# which raised NameError).
index.describe_index_stats()

In [18]:
# Define utility functions
def text_embedding(input_text) -> list:
    """Embed one piece of text with the module-level embedding ``model``.

    Args:
        input_text: The text to embed.

    Returns:
        The ``values`` of the embedding returned for ``input_text``.

    Raises:
        ValueError: If the model returns no embedding at all.
    """
    embeddings = model.get_embeddings([input_text])
    if not embeddings:
        # Without this guard the function failed later with an
        # UnboundLocalError on `vector`, which hid the real cause.
        raise ValueError("Embedding model returned no embeddings for the input text")
    # Only one text is sent, so the first embedding is the one for it.
    vector = embeddings[0].values
    print(f"Length of Embedding Vector: {len(vector)}")
    return vector

def find_match(input_text, top_k=2):
    """Return the stored text of the best index matches for ``input_text``.

    Args:
        input_text: Query text; it is embedded with ``text_embedding``.
        top_k: How many matches to request and join. Defaults to 2, the
            number of passages this function has always returned.

    Returns:
        The ``metadata['text']`` of up to ``top_k`` matches joined with
        newlines, or an empty string when the query yields no matches.
    """
    input_em = text_embedding(input_text)
    # Ask only for the matches that are actually used, and pass the
    # snake_case `include_metadata` keyword of the Python client.
    result = index.query(vector=input_em, top_k=top_k, include_metadata=True)
    # Slicing tolerates fewer than `top_k` matches, where fixed [0]/[1]
    # indexing raised IndexError.
    matches = result['matches'][:top_k]
    return "\n".join(match['metadata']['text'] for match in matches)

def query_refiner(conversation, query):
    """Rewrite ``query`` into a stand-alone question using the chat log.

    Args:
        conversation: Conversation log text inserted into the prompt.
        query: The user's latest query.

    Returns:
        The model's refined query with surrounding whitespace removed.
    """
    # Named `refiner_model` so it cannot be confused with the module-level
    # embedding `model` that `text_embedding` relies on.
    refiner_model = TextGenerationModel.from_pretrained("text-bison@002")
    response = refiner_model.predict(
        prompt=f"Given the following user query and conversation log, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\n\nCONVERSATION LOG: \n{conversation}\n\nQuery: {query}\n\nRefined Query:",
        temperature=0.2,
        max_output_tokens=256,
        top_p=0.5,
        top_k=20
    )
    # The completion continues right after "Refined Query:" and can start
    # with a space; strip it so the embedded and printed query is clean.
    return response.text.strip()

In [19]:
# Initialize conversation components
# Prompt layout: system instruction, then the stored history, then the new
# human turn.
system_msg_template = SystemMessagePromptTemplate.from_template(
    template=(
        "Answer the question as truthfully as possible using the provided context, \n"
        "    and if the answer is not contained within the text below, say 'Sorry! I don't know'"
    )
)
human_msg_template = HumanMessagePromptTemplate.from_template(template="{input}")
prompt_template = ChatPromptTemplate.from_messages(
    [
        system_msg_template,
        MessagesPlaceholder(variable_name="history"),
        human_msg_template,
    ]
)

# Windowed memory (k=3) that returns message objects, and the chat model.
buffer_memory = ConversationBufferWindowMemory(k=3, return_messages=True)
llm = ChatVertexAI(model_name="chat-bison")

# Chain that ties the model, the prompt and the memory together.
conversation = ConversationChain(
    llm=llm,
    prompt=prompt_template,
    memory=buffer_memory,
    verbose=True,
)


In [20]:
# Initialize conversation history
requests = []
responses = ["How can I help you today?"]


def get_conversation_history():
    """Render the chat so far as alternating "Human:" / "Bot:" lines.

    The greeting held at ``responses[0]`` is skipped: the i-th entry of
    ``requests`` is paired with the reply stored at ``responses[i + 1]``.

    Returns:
        One string with a newline-terminated line per message, or an empty
        string when only the greeting is present.
    """
    lines = []
    for turn, bot_reply in enumerate(responses[1:]):
        lines.append("Human: " + requests[turn] + "\n")
        lines.append("Bot: " + bot_reply + "\n")
    return "".join(lines)


In [21]:
# Interactive chat loop: refine the query, retrieve context, ask the chain,
# then record the turn. Type "exit" or "quit" to stop.
while True:
    try:
        # Strip so that input such as "exit " or " quit" is still recognised.
        query = input("Query: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input or an interrupt at the prompt ends the chat quietly
        # instead of leaving a traceback in the notebook.
        break
    if query.lower() in ["exit", "quit"]:
        break
    if not query:
        # Nothing was typed; skip the model and index calls for this turn.
        continue

    # Process the query
    conversation_history = get_conversation_history()
    refined_query = query_refiner(conversation_history, query)
    print("\nRefined Query:")
    print(refined_query)

    # Retrieval uses the refined query; the chain still sees the user's own wording.
    context = find_match(refined_query)
    response = conversation.predict(input=f"Context:\n{context} \n\n Query:\n{query}")

    # Update conversation history
    requests.append(query)
    responses.append(response)

    # Display response
    print("\nBot:")
    print(response)


Refined Query:
 Apa itu Prudential Indonesia?
Length of Embedding Vector: 768


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Answer the question as truthfully as possible using the provided context, 
    and if the answer is not contained within the text below, say 'Sorry! I don't know'
Human: Context:
Beberapa produk yang ditawarkan oleh Prudential Indonesia yaitu: Asuransi Kesehatan, Asuransi Jiwa, Pendidikan, Perlindungan Bebas Premi, Produk Asuransi Yang Dikaitkan Investasi, Produk Syariah, Bancassurance, Perlindungan Karyawan.
PT Prudential Life Assurance (Prudential Indonesia) didirikan pada 1995 dan merupakan bagian dari Prudential PLC, yang menyediakan asuransi jiwa dan kesehatan serta manajemen aset, dengan berfokus di Asia dan Afrika. Dengan menggabungkan pengalaman internasional Prudential di bidang asuransi jiwa dengan pengetahuan tata cara bisnis lokal, Prudential Indonesia memiliki komitmen untuk mengembangkan bisnisnya 