In [1]:
# Run in Jupyter Notebook or Google Colab for smoothest experience
# Install required libraries
!pip install langchain duckdb faiss-cpu langchain_experimental time 
!pip install transformers pandas langchain-huggingface python-dotenv dotenv
!pip install langchain_community langchain_groq langchain_core sentence-transformers

zsh:1: /Users/abraham/PycharmProjects/Guan's Notebook/.venv/bin/./pip: bad interpreter: /Users/abraham/PycharmProjects/llm-mvp-dataRetrieval/.venv/bin/python: no such file or directory
zsh:1: /Users/abraham/PycharmProjects/Guan's Notebook/.venv/bin/./pip: bad interpreter: /Users/abraham/PycharmProjects/llm-mvp-dataRetrieval/.venv/bin/python: no such file or directory
zsh:1: /Users/abraham/PycharmProjects/Guan's Notebook/.venv/bin/./pip: bad interpreter: /Users/abraham/PycharmProjects/llm-mvp-dataRetrieval/.venv/bin/python: no such file or directory


### 1. Import CSV file data

In [2]:
# Load the FAQ export into a DataFrame.
# Point the path below at your own CSV file to analyze different text data.

import pandas as pd

df = pd.read_csv(filepath_or_buffer='StockbitFAQ.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,id,slug,title,description,content,published_at,created_at,updated_at
0,0027af6a-4f47-4812-97aa-97971af835f0,cara-cancel-dan-amend-order-saham,Cara Cancel dan Amend Order Saham,,"Sebelum melakukan cancel atau amend, pastikan ...",09/27/2023 08:42:27,09/25/2023 08:13:45,09/27/2023 08:42:27
1,00d087ff-31a4-4098-ab11-c15f91b0de88,transfer-pembelian-reksa-dana-pakai-aplikasi-dana,Transfer Pembelian Reksa Dana Pakai Aplikasi Dana,Panduan Pembelian Reksa Dana Menggunakan Aplik...,### Panduan Pembelian Reksa Dana dan Transfer ...,07/22/2019 07:36:28,07/22/2019 07:00:35,09/16/2022 07:06:11
2,00fa91b9-4b6a-4936-a824-40461faac0be,kenapa-saldo-investasi-saya-di-bibit-dan-akses...,Kenapa Saldo Investasi Saya di Bibit dan AKSes...,Penjelasan mengenai perbedaan saldo Innvestasi...,Saldo portofolio reksa dana kamu baru akan diu...,02/10/2021 05:07:13,02/10/2021 04:53:28,07/13/2022 02:59:30
3,017b51a4-b964-43ca-9d3d-4c24a54f2097,kapan-penjualan-selesai-dan-dapat-harga-jual-h...,Kapan Penjualan Selesai dan Dapat Harga Jual H...,Penjelasan mengenai waktu penjualan selesai.,${youtube}[Video Penjelasan Waktu Penyelesaian...,12/19/2018 09:17:26,12/12/2018 07:48:58,10/09/2023 02:50:07
4,01b3c6c0-b061-45e1-b16a-aadf72175264,panduan-set-pembelian-rutin-nabung-rutin,Panduan Set Pembelian Rutin (Nabung Rutin),,### Apa itu Fitur Nabung Rutin?\nFitur nabung ...,02/07/2024 08:25:33,02/07/2024 07:53:11,03/08/2024 06:11:30


### 2. Chunking the Data

In [3]:
import pandas as pd
from langchain.schema import Document 

# Column names used by the chunking loop in this cell; `df` is assumed to be the
# DataFrame loaded from the CSV in the previous section.
# - text_column: column whose text is split into chunks
# - title_column: column stored with every chunk and prepended to the first chunk of each row
text_column = 'content'
title_column = 'title'

# Custom fixed-size character splitter with overlap between neighbouring chunks
def custom_text_splitter(text, chunk_size, chunk_overlap):
    """Split ``text`` into consecutive character windows that overlap.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters each chunk shares with the
            previous one; must be smaller than ``chunk_size``.

    Returns:
        The chunks in reading order. Empty text yields an empty list. The
        last chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if
            ``chunk_overlap >= chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once the window has reached the end of the text; stepping back by
        # the overlap here would only emit a tail already contained in this chunk.
        if end >= len(text):
            break
        start += step
    return chunks

split_data = []

for _, faq_row in df.iterrows():
    title, raw_content = faq_row[title_column], faq_row[text_column]
    # Anything that is not already a string is converted with str()
    content = raw_content if isinstance(raw_content, str) else str(raw_content)
    # 500-character chunks with 50 characters of overlap
    pieces = custom_text_splitter(content, 500, 50)
    if pieces:
        # Prefix the title on the first chunk only; this is meant to
        # improve data retrieval accuracy
        pieces[0] = title + "\n\n" + pieces[0]
    split_data.extend({'title': title, 'content': piece} for piece in pieces)

# One row per chunk
split_df = pd.DataFrame(split_data)

# Show the first rows of the chunked DataFrame
print(split_df.head())

# Wrap every chunk in a Document, keeping the title as metadata
docs = [
    Document(page_content=rec.content, metadata={'title': rec.title})
    for rec in split_df.itertuples(index=False)
]

                               title  \
0  Cara Cancel dan Amend Order Saham   
1  Cara Cancel dan Amend Order Saham   
2  Cara Cancel dan Amend Order Saham   
3  Cara Cancel dan Amend Order Saham   
4  Cara Cancel dan Amend Order Saham   

                                             content  
0  Cara Cancel dan Amend Order Saham\n\nSebelum m...  
1  (**Jual**) sahamnya.\n![](https://storage.cris...  
2  [](https://storage.crisp.chat/users/helpdesk/w...  
3  at/users/helpdesk/website/4538da5be8a77000/gam...  
4  kb.jpg)\n\n4. Kamu bisa ubah nilai harga atau ...  


### 3. Initialize LLM (Llama3-70b)

In [4]:
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import time 
import os

# Read variables from a .env file into the process environment
load_dotenv()

# Look the Groq key up eagerly: a missing GROQ_API_KEY raises KeyError right here.
# Note that api_key itself is not passed to ChatGroq below.
api_key = os.environ["GROQ_API_KEY"]

# Baseline prompt without retrieved context; its only placeholder is {question}
string_prompt = PromptTemplate(
    input_variables=["question"],
    template="""
    You are financial professional AI. Your objective is to answer frequently asked questions just like how a customer service person would. 
    Use casual and daily conversational language. Please use Bahasa Indonesia when responding to user's questions.
    
    \n\n {question} \n\n
""",
)

# Chat model handle (model name "llama3-70b-8192", temperature 0)
llama3_70b_groq = ChatGroq(model_name="llama3-70b-8192", temperature=0)

# prompt -> model -> plain string output
data_generator = string_prompt | llama3_70b_groq | StrOutputParser()

In [6]:
# Smoke-test the plain chain (no retrieval) and time the call
question = "Apa itu saham? Apa perbedaan berinvestasi saham dan berinvestasi di Reksa Dana Saham?"
print("Llama 70B")
start_time = time.time()
answer = data_generator.invoke({"question": question})
print(answer)
print(f"inference took {time.time() - start_time} seconds")

Llama 70B
Halo!

Saham adalah salah satu instrumen investasi yang memungkinkan Anda memiliki bagian kepemilikan dari suatu perusahaan. Ketika Anda membeli saham, Anda menjadi salah satu pemilik perusahaan tersebut dan berhak atas sebagian keuntungan yang dihasilkan oleh perusahaan.

Sekarang, mari kita bahas perbedaan antara berinvestasi saham dan berinvestasi di Reksa Dana Saham.

Berinvestasi saham berarti Anda membeli saham langsung dari perusahaan atau melalui bursa efek. Anda memiliki kontrol penuh atas portofolio Anda dan dapat memilih saham mana yang ingin Anda beli atau jual. Namun, Anda juga harus memiliki pengetahuan dan waktu untuk memantau kinerja saham Anda.

Di sisi lain, berinvestasi di Reksa Dana Saham berarti Anda mempercayakan uang Anda kepada manajer investasi yang akan mengelola portofolio saham untuk Anda. Reksa Dana Saham adalah sebuah wadah yang menghimpun dana dari banyak investor untuk diinvestasikan dalam berbagai saham. Manajer investasi akan memilih saham ya

### 4. Embedding the Data and RAG

In [8]:
import duckdb
from langchain_community.vectorstores import DuckDB
from langchain_huggingface import HuggingFaceEmbeddings

db_name = "mvp_guan"
model_name = "intfloat/multilingual-e5-base"

# db_log/<name it however you like>
# The resulting path is the same as before: the fixed prefix followed by db_name.
db_path = 'db_log/demo_07162024_Guan_RAG_Testing' + str(db_name)

# Make sure the db_log/ folder exists before connecting, so a fresh checkout
# without that folder does not fail when the database file is opened.
os.makedirs(os.path.dirname(db_path), exist_ok=True)
conn = duckdb.connect(db_path)

# Should print "name varchar" and "embeddings"
# NOTE(review): on a brand-new database file the listing may still be empty here;
# presumably the table is created by the DuckDB vector store below — confirm.
conn.sql("SHOW TABLES").show()
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Time only the construction of the vector store wrapper
start_time = time.time()
docsearch = DuckDB(connection=conn,embedding=embeddings)
print("DuckDB initiation took", time.time() - start_time, "seconds to run")

┌────────────┐
│    name    │
│  varchar   │
├────────────┤
│ embeddings │
└────────────┘

DuckDB initiation took 0.0005710124969482422 seconds to run


In [9]:
# Time the (re)seeding of the vector store
start_time = time.time()
# Empty the embeddings table first so leftovers from an earlier run are not duplicated
conn.sql("TRUNCATE embeddings")
docsearch.add_documents(docs)
seeding_seconds = time.time() - start_time

# Seeding can take a while, depending on the size of the data
print("Seeding documents took", seeding_seconds, "seconds to run")

Seeding documents took 43.44543695449829 seconds to run


In [10]:
# Retriever settings:
#   'k'               - how many chunks to retrieve from the database
#   'score_threshold' - minimum score for a result to be returned
#                       (-1 to 1, where 1 means an exact match)
# NOTE(review): confirm that this vector store actually applies 'score_threshold'
# when it is passed through search_kwargs.
retriever_settings = {'k': 5, 'score_threshold': 0.7}
duckdb_retriever = docsearch.as_retriever(search_kwargs=retriever_settings)

question = "Apa itu saham? Apa perbedaan berinvestasi saham dan berinvestasi di Reksa Dana Saham?"

# Expected to show the top 5 most similar chunks from the database,
# ranked by their similarity score
duckdb_retriever.invoke(question)

[Document(metadata={'title': 'Apa itu Saham?', '_similarity_score': 0.9069331575727212}, page_content='am dan berinvestasi di Reksa Dana Saham?\n\nPerbedaan utama antara instrumen investasi saham dan reksadana saham terletak pada pihak yang mengambil keputusan investasi.\nPada investasi saham, keputusan untuk membeli dan menjual saham diputuskan secara mandiri oleh investor itu sendiri. Sedangkan pada reksadana saham, keputusan tersebut berada di tangan Manajer Investasi selaku pihak yang menerbitkan dan mengelola reksadana.\nBerinvestasi di saham langsung mengharuskan kamu untuk melakukan analisa da'),
 Document(metadata={'title': 'Apa itu Saham?', '_similarity_score': 0.8998171653470749}, page_content='emilik perusahaan sesuai dengan porsi berapa persen atau berapa banyak penyertaan yang ditanamkan di perusahaan tersebut. Jika perusahaan tersebut sehat, maka saham akan punya nilai jual yang tinggi karena bisa menghasilkan laba yang besar. Di sinilah daya tarik investasi dalam bentuk 

### 5. Giving Tones/Personality to the LLM

In [11]:
# Keyword lists for the three response tones: comforting, detailed, and cautious.
# - Comforting: the user's question is about worry, stress, or sadness
# - Detailed:   the user's question is about how to do something
# - Cautious:   the user's question is about potential risks or dangers
# The keywords are in Bahasa Indonesia; change them to your own language if needed.
# Every keyword appears once per list, and every entry is separated by a comma
# (a missing comma makes Python silently join two adjacent string literals).
# NOTE(review): no code visible in this notebook reads these lists; the tone
# classification prompt carries its own copy of the keywords as plain text.
comforting_keywords = [
    "khawatir", "kekhawatiran", "hutang", "pengembalian", "hilang",
    "bantuan", "ketakutan", "stres", "masalah", "isu",
    "kesulitan", "bingung", "tidak bahagia", "kecewa",
    "frustrasi", "cemas", "sedih", "takut",
    "gugup",
]

detailed_keywords = [
    "bagaimana", "mengubah", "dimana", "instruksi", "langkah",
    "prosedur", "metode", "proses", "panduan", "detail",
    "menjelaskan", "klarifikasi", "info", "informasi",
    "manual", "tutorial", "arah", "pengaturan",
    "konfigurasi", "menyesuaikan",
]

cautious_keywords = [
    "berbahaya", "resiko", "risiko", "maksimum", "batas", "siswa sma",
    "peringatan", "kehati-hatian", "ancaman", "bahaya", "tidak aman",
    "problematik", "kewajiban", "paparan", "maximum",
    "konsekuensi", "keprihatinan", "alarm",
    "waspada", "potensial",
]

In [12]:
# Define prompt templates for different tones
# Comforting tone: asks for an empathetic, casual answer in Bahasa Indonesia.
# Placeholders: {question} (the user's question) and {information} (the context
# handed to the model alongside the question).
# NOTE(review): the template tells the model to "use websearch" and cite sources
# when the supplied document is irrelevant; nothing visible in this notebook gives
# the model a search tool — confirm, otherwise cited sources cannot come from a real search.
comforting_prompt = PromptTemplate(
    template= """
    You are an Indonesian financial professional AI. Use Bahasa Indonesia.
    Your objective is to answer the user's question in a comforting and empathetic manner. 
    Use casual and daily conversational language to comfort the user.
    Answer the question like you already know the information, not the reading from a source. 
    
    This is the user's question: 
    {question}
    
    These are the document and information available:
    {information}
    
    If the document is relevant to answer the user's question, 
    don't cite or look for sources online. 
    Stick to the information provided in the document.
    Don't cite or say anything about article provided in the document.
    
    If you feel the document is irrelevant to answer user question, 
    say something similar like "Kami tidak menemukan informasi tersebut dari database kami."
    Then use websearch to answer user question and still answer in Bahasa Indonesia.
    Don't forget to cite your sources (prefereably sources originating from Indonesia
    so the user knows that your response is credible using Bahasa Indonesia.
""",
    input_variables = ["question", "information"]
)

# Detailed tone: asks for an informative answer with clear, step-by-step
# instructions in Bahasa Indonesia.
# Placeholders: {question} (the user's question) and {information} (the context
# handed to the model alongside the question).
# NOTE(review): the template tells the model to "use websearch" and cite sources
# when the supplied document is irrelevant; nothing visible in this notebook gives
# the model a search tool — confirm, otherwise cited sources cannot come from a real search.
detailed_prompt = PromptTemplate(
    template= """
    You are an Indonesian financial professional AI. Use Bahasa Indonesia.
    Your objective is to answer the user's question in a detailed and informative manner. 
    Provide clear, step-by-step instructions to help the user solve their problem.
    Answer the question like you already know the information, not the reading from a source. 

    This is the user's question: 
    {question}
    
    These are the document and information available:
    {information}
    
    If the document is relevant to answer the user's question, 
    don't cite or look for sources online. 
    Stick to the information provided in the document.
    Don't cite or say anything about article provided in the document.
    
    If you feel the document is irrelevant to answer user question, 
    say something similar like "Kami tidak menemukan informasi tersebut dari database kami."
    Then use websearch to answer user question and still answer in Bahasa Indonesia.
    Don't forget to cite your sources (prefereably sources originating from Indonesia
    so the user knows that your response is credible using Bahasa Indonesia.
""",
    input_variables = ["question", "information"]
)

# Cautious tone: asks for an answer in Bahasa Indonesia that warns the user about
# potential risks and gives advice for avoiding dangerous situations.
# Placeholders: {question} (the user's question) and {information} (the context
# handed to the model alongside the question).
# NOTE(review): the template tells the model to "use websearch" and cite sources
# when the supplied document is irrelevant; nothing visible in this notebook gives
# the model a search tool — confirm, otherwise cited sources cannot come from a real search.
cautious_prompt = PromptTemplate(
    template= """
    You are an Indonesian financial professional AI. Use Bahasa Indonesia.
    Your objective is to answer the user's question in a cautious and warning manner. 
    Alert the user to any potential risks and provide advice to help them 
    avoid dangerous situations.
    Answer the question like you already know the information, not the reading from a source. 

    
    This is the user's question: 
    {question}
    
    These are the document and information available:
    {information}
    
    If the document is relevant to answer the user's question, 
    don't cite or look for sources online. 
    Stick to the information provided in the document.
    Don't cite or say anything about article provided in the document.
    
    If you feel the document is irrelevant to answer user question, 
    say something similar like "Kami tidak menemukan informasi tersebut dari database kami."
    Then use websearch to answer user question and still answer in Bahasa Indonesia.
    Don't forget to cite your sources (prefereably sources originating from Indonesia
    so the user knows that your response is credible using Bahasa Indonesia.
""",
    input_variables = ["question", "information"]
)

In [13]:
# Prompt to categorize which tone to use
# The model is asked to reply with a single digit and nothing else:
#   0 = detailed tone, 1 = comforting tone, 2 = cautious tone.
# The only placeholder is {question}. The keyword lists further down are plain
# text inside the prompt, not references to Python variables.
# NOTE(review): inside the prompt text, "maximum" and "konsekuensi" are written
# without a comma between them, and the text contains typos ("purposeis",
# "solve the a technical issue"). They are left as-is here because editing them
# changes what is sent to the model.
tone_prompt = PromptTemplate(
    template= """
    Please select whether this user's prompt, {question}, should be answered in a detailed,
    comforting, or cautious tone. Please only respond 0 for detailed tone, 1 for comforting tone, 
    and 2 for cautious tone without any other words or characters or white spaces. 
    
    A comforting tone should be used if user's question feels 
    like the user is worried or emotionally stressed about a certain concern. 
    This can be like "What happened if I have a debt on my account?" 
    or "How can I return a stock order". The purpose is to comfort the user. 
    
    A detailed tone should be used if the user is asking about a technical problem 
    such as "How change my email address or phone number?", 
    "Where to see my available stocks?". The purpose is to give a neat, 
    detailed response of instructions to help the user solve the a technical issue.
    
    A cautious tone should be used if the user is asking about a problem that
    might be dangerous or pose a threat to the user. 
    Such as "Can a highschooler use bibit?", 
    "What is the maximum amount of stocks i can buy from this company." 
    The purposeis to warn user that their question and actions might 
    lead to debt because of taking high risks. 
    
    Watch out for this key words if present in the question to identify which tone to use.
    
    comforting_keywords = [
        "khawatir", "kekhawatiran", "hutang", "pengembalian", "hilang",
        "bantuan", "ketakutan", "stres", "masalah", "isu", 
        "kesulitan", "bingung", "tidak bahagia", "kecewa", 
        "frustrasi", "cemas", "sedih", "takut", 
        "takut", "gugup"
    ]

    detailed_keywords = [
        "bagaimana", "mengubah", "dimana", "instruksi", "langkah",
        "prosedur", "metode", "proses", "panduan", "detail",
        "menjelaskan", "klarifikasi", "info", "informasi", 
        "manual", "tutorial", "arah", "pengaturan", 
        "konfigurasi", "menyesuaikan"
    ]

    cautious_keywords = [
        "berbahaya", "resiko", "risiko", "maksimum", "batas", "siswa sma",
        "peringatan", "kehati-hatian", "ancaman", "bahaya", "tidak aman",
        "problematik", "bahaya", "kewajiban", "paparan", "maximum"
        "konsekuensi", "keprihatinan", "bahaya", "alarm", 
        "waspada", "potensial"
    ]
    
""",
    input_variables = ["question"]
)

In [15]:
# Build one prompt -> model -> string-output chain per prompt.
# tone_chain is expected to answer with 0, 1, or 2, which selects
# the tone used for the actual response.
tone_chain, detailed_chain, comforting_chain, cautious_chain = (
    prompt | llama3_70b_groq | StrOutputParser()
    for prompt in (tone_prompt, detailed_prompt, comforting_prompt, cautious_prompt)
)

# Function to generate response based on tone
def generate_response(question, information):
    """Answer ``question`` with the tone-specific chain chosen by ``tone_chain``.

    ``tone_chain`` is asked to classify the question as 0 (detailed),
    1 (comforting) or 2 (cautious); the matching chain is then invoked with
    the question and the supplied information. If the classifier reply does
    not contain a standalone 0, 1 or 2 (for example because the model added
    extra words), the detailed tone is used instead of raising.

    Args:
        question: The user's question.
        information: Context filled into the ``{information}`` placeholder of
            the answering prompt.

    Returns:
        The output of the selected chain.
    """
    import re  # local import so this cell does not rely on another cell importing it

    start_time = time.time()

    tone_result = tone_chain.invoke({"question": question})
    # Accept replies such as " 1\n", "2." or "Tone: 2". A digit glued to other
    # word characters, a dot or a minus sign (e.g. "12", "-1") is not treated
    # as a tone id.
    tone_match = re.search(r"(?<![\w.-])[0-2](?!\w)", str(tone_result))
    tone = int(tone_match.group()) if tone_match else 0  # default tone: detailed

    chains_by_tone = {0: detailed_chain, 1: comforting_chain, 2: cautious_chain}
    response = chains_by_tone[tone].invoke({"question": question, "information": information})

    end_time = time.time()
    print(f"Generating response took {end_time - start_time:.2f} seconds \n")

    return response

In [16]:
# Try the Stockbit FAQ bot end to end; edit the question to test other cases
question = "Bagaimana Jika Bibit Ditutup?"
information = duckdb_retriever.invoke(question)

answer = generate_response(question, information)
print(answer)

Generating response took 1.90 seconds 

Bagaimana jika Bibit ditutup? 

Jika Bibit ditutup, investor tidak perlu khawatir mengenai uang yang sudah diinvestasikan pada produk reksa dana. Seluruh dana investor tidak disimpan di Bibit, melainkan tersimpan aman di Bank Kustodian. Artinya, jika Bibit sampai tutup sekalipun, uang dan reksa dana kamu tetap tersimpan aman di Bank Kustodian.

Untuk mengambil dana, kamu dapat melakukan proses pencairan dana melalui Manajer Investasi. Manajer Investasi akan memandu proses pencairan dana dengan menunjukkan KTP serta bukti kepemilikan reksa dana.

Jadi, investor tidak perlu khawatir jika Bibit ditutup, karena dana kamu tetap aman dan dapat dicairkan kapan saja.
