In [13]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import (
    CharacterTextSplitter,
    HTMLHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
    RecursiveJsonSplitter,
)

# Read the PDF into LangChain Document objects.
pdf_loader = PyPDFLoader(r'docs/key-fact-statement.pdf')
docs = pdf_loader.load()

# Split the documents into chunks of at most 500 characters,
# with 50 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
final_docs = text_splitter.split_documents(docs)

In [14]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [15]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Fail fast with a readable message when the key is absent: os.getenv() returns
# None in that case, and assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType".
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to the .env file read by load_dotenv()."
    )
os.environ['GOOGLE_API_KEY'] = google_api_key

# Google Generative AI embedding model used to vectorise the document chunks.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [16]:
# Swap the standard-library sqlite3 module for pysqlite3 before Chroma is imported:
# after these lines, any `import sqlite3` resolves to the pysqlite3 module object.
# NOTE(review): presumably needed because the system SQLite is older than Chroma
# requires — confirm, and drop the shim on machines where it is not needed.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

# Use the maintained langchain_chroma integration (the same package the later
# Chroma cells of this notebook import) instead of the deprecated
# langchain_community.vectorstores.Chroma class.
from langchain_chroma import Chroma

# Embed every chunk and index it in a Chroma collection; no persist_directory is
# passed here, so this store is not explicitly saved to disk.
db = Chroma.from_documents(final_docs, embeddings)

In [17]:
query = "APR for Visa Gold"
retrieved_result = db.similarity_search(query)

In [18]:
retrieved_result

[Document(metadata={'author': 'jason.c.f.chow@hsbc.com.hk', 'creationdate': '2024-10-18T14:40:05+08:00', 'creator': 'Microsoft® Word for Microsoft 365', 'keywords': 'RESTRICTED -', 'moddate': '2024-10-18T14:40:05+08:00', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_actionid': '8c75c534-6cc7-4fbb-8718-2dcd08b8899b', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_contentbits': '2', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_enabled': 'true', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_method': 'Privileged', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_name': 'CLAPUBLIC', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_setdate': '2024-10-08T15:56:53Z', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_siteid': 'e0fd434d-ba64-497b-90d2-859c472e1a92', 'page': 3, 'page_label': '4', 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'docs/key-fact-statement.pdf', 'subject': 'Key Fact Statement', 'title': 'Key Fact Statement', 'total_pages': 9}, page_content='Pract

In [19]:
# FAISS DB
# FAISS (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, including sets that do not fit in RAM. It also ships supporting code for evaluation and parameter tuning.

In [20]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [21]:
loader = PyPDFLoader(file_path = r'docs/key-fact-statement.pdf')
docs = loader.load()

In [22]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 50)

In [23]:
final_docs = splitter.split_documents(docs)

In [24]:
# Embedding client that talks to the Ollama server on the loopback address.
embeddings = OllamaEmbeddings(
    base_url="http://127.0.0.1:11434",
    model='gemma:2b',
)

# Embed every chunk and build the FAISS index from the resulting vectors.
db = FAISS.from_documents(documents=final_docs, embedding=embeddings)

In [25]:
db

<langchain_community.vectorstores.faiss.FAISS at 0x70064c0ba8a0>

In [26]:
query = "APR"

In [27]:
results = db.similarity_search(query)

In [28]:
print(results[0].page_content)

PUBLIC 
The Hongkong and Shanghai Banking Corporation Limited (“we”, “us” or “our”)  
KEY FACTS STATEMENT FOR CREDIT CARDS 
You are advised to refer to the “Bank tariff guide for HSBC Wealth and Personal Banking Customers” and the Credit Card Terms for 
your credit card for more details.  
Credit Cards 
November 2024 
 
Interest Rates and Finance Charges  
Annualised   
Percentage Rate   
(APR) for  
Purchase1  
35.42% when you open your account and it will be reviewed from time to time.


In [29]:
# Retrievers --> A vector store can be converted to a Retriever with as_retriever(). This lets it be used with other LangChain components that expect a retriever.

In [30]:
retriever = db.as_retriever()

In [31]:
retriever.invoke(query)

[Document(id='bbcfc3d9-e8f3-4529-9e9b-34b7666e18f3', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-10-18T14:40:05+08:00', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_enabled': 'true', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_setdate': '2024-10-08T15:56:53Z', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_method': 'Privileged', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_name': 'CLAPUBLIC', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_siteid': 'e0fd434d-ba64-497b-90d2-859c472e1a92', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_actionid': '8c75c534-6cc7-4fbb-8718-2dcd08b8899b', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_contentbits': '2', 'title': 'Key Fact Statement', 'author': 'jason.c.f.chow@hsbc.com.hk', 'subject': 'Key Fact Statement', 'keywords': 'RESTRICTED -', 'moddate': '2024-10-18T14:40:05+08:00', 'source': 'docs/key-fact-statement.pdf', 'total_pages': 9, 'page':

In [32]:
# FAISS similarity search with score
# Returns each document together with its L2 (Euclidean) distance to the query
# A lower score means a closer match

In [33]:
docs_w_score = db.similarity_search_with_score(query)

In [34]:
docs_w_score

[(Document(id='bbcfc3d9-e8f3-4529-9e9b-34b7666e18f3', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-10-18T14:40:05+08:00', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_enabled': 'true', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_setdate': '2024-10-08T15:56:53Z', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_method': 'Privileged', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_name': 'CLAPUBLIC', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_siteid': 'e0fd434d-ba64-497b-90d2-859c472e1a92', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_actionid': '8c75c534-6cc7-4fbb-8718-2dcd08b8899b', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_contentbits': '2', 'title': 'Key Fact Statement', 'author': 'jason.c.f.chow@hsbc.com.hk', 'subject': 'Key Fact Statement', 'keywords': 'RESTRICTED -', 'moddate': '2024-10-18T14:40:05+08:00', 'source': 'docs/key-fact-statement.pdf', 'total_pages': 9, 'page'

In [35]:
# Search with a precomputed query embedding (vector) instead of the query text
embed_query = embeddings.embed_query(query)

In [36]:
db.similarity_search_by_vector(embed_query)

[Document(id='bbcfc3d9-e8f3-4529-9e9b-34b7666e18f3', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-10-18T14:40:05+08:00', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_enabled': 'true', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_setdate': '2024-10-08T15:56:53Z', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_method': 'Privileged', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_name': 'CLAPUBLIC', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_siteid': 'e0fd434d-ba64-497b-90d2-859c472e1a92', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_actionid': '8c75c534-6cc7-4fbb-8718-2dcd08b8899b', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_contentbits': '2', 'title': 'Key Fact Statement', 'author': 'jason.c.f.chow@hsbc.com.hk', 'subject': 'Key Fact Statement', 'keywords': 'RESTRICTED -', 'moddate': '2024-10-18T14:40:05+08:00', 'source': 'docs/key-fact-statement.pdf', 'total_pages': 9, 'page':

In [37]:
#Save Vector store DB
db.save_local("FIASS_Index")

In [38]:
# Load Vector DB
# Reopen the index saved under "FIASS_Index" (the spelling must match the
# save_local call), passing the same embeddings object used to build it.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling part
# of the saved index (per the LangChain FAISS docs); unpickling can execute
# arbitrary code, so only load index directories this notebook wrote itself.
new_db = FAISS.load_local("FIASS_Index", embeddings, allow_dangerous_deserialization=True)

In [40]:
new_db.similarity_search(query)

[Document(id='bbcfc3d9-e8f3-4529-9e9b-34b7666e18f3', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-10-18T14:40:05+08:00', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_enabled': 'true', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_setdate': '2024-10-08T15:56:53Z', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_method': 'Privileged', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_name': 'CLAPUBLIC', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_siteid': 'e0fd434d-ba64-497b-90d2-859c472e1a92', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_actionid': '8c75c534-6cc7-4fbb-8718-2dcd08b8899b', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_contentbits': '2', 'title': 'Key Fact Statement', 'author': 'jason.c.f.chow@hsbc.com.hk', 'subject': 'Key Fact Statement', 'keywords': 'RESTRICTED -', 'moddate': '2024-10-18T14:40:05+08:00', 'source': 'docs/key-fact-statement.pdf', 'total_pages': 9, 'page':

In [41]:
# Chroma DB is an AI-native vector store database

In [1]:
# Swap the standard-library sqlite3 module for pysqlite3: pysqlite3 is imported,
# removed from sys.modules under its own name and re-registered as 'sqlite3', so
# any later `import sqlite3` resolves to it. This must run before Chroma is imported.
# NOTE(review): presumably needed because the system SQLite is older than Chroma
# requires — confirm, and drop the shim on machines where it is not needed.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
pdf_loader = PyPDFLoader(file_path='docs/key-fact-statement.pdf')

In [3]:
docs = pdf_loader.load()

In [4]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 50)

In [5]:
final_docs = splitter.split_documents(docs)

In [6]:
embeddings = OllamaEmbeddings(model='gemma:2b', base_url="http://127.0.0.1:11434")


In [7]:
db = Chroma.from_documents(final_docs, embeddings)

In [9]:
db.similarity_search("Gold card APR")

[Document(id='f79dbc8b-4908-4950-b05d-ec08bc540e9b', metadata={'author': 'jason.c.f.chow@hsbc.com.hk', 'creationdate': '2024-10-18T14:40:05+08:00', 'creator': 'Microsoft® Word for Microsoft 365', 'keywords': 'RESTRICTED -', 'moddate': '2024-10-18T14:40:05+08:00', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_actionid': '8c75c534-6cc7-4fbb-8718-2dcd08b8899b', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_contentbits': '2', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_enabled': 'true', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_method': 'Privileged', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_name': 'CLAPUBLIC', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_setdate': '2024-10-08T15:56:53Z', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_siteid': 'e0fd434d-ba64-497b-90d2-859c472e1a92', 'page': 1, 'page_label': '2', 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'docs/key-fact-statement.pdf', 'subject': 'Key Fact Statement', 'title': 'Key Fact Stateme

In [10]:
import uuid

# Give every chunk a deterministic id derived from its source path and position.
# Without explicit ids, each run of this cell stores the chunks under fresh random
# ids, so re-running it keeps appending duplicate copies to ./chroma_db and
# similarity_search starts returning the same passage several times. With stable
# ids a re-run overwrites the existing entries instead.
# NOTE(review): entries written by earlier runs (random ids) stay in ./chroma_db;
# delete the directory once to start from a clean collection.
chunk_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{doc.metadata.get('source', '')}#chunk-{position}"))
    for position, doc in enumerate(final_docs)
]

# Embed the chunks and persist the collection under ./chroma_db so it can be
# reopened later without re-embedding.
vectordb = Chroma.from_documents(
    documents=final_docs,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory='./chroma_db',
)

In [13]:
new_db = Chroma(persist_directory='./chroma_db', embedding_function = embeddings)

In [14]:
new_db.similarity_search("Gold card APR")

[Document(id='1b241775-951a-4ba2-aa70-07817898dae1', metadata={'author': 'jason.c.f.chow@hsbc.com.hk', 'creationdate': '2024-10-18T14:40:05+08:00', 'creator': 'Microsoft® Word for Microsoft 365', 'keywords': 'RESTRICTED -', 'moddate': '2024-10-18T14:40:05+08:00', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_actionid': '8c75c534-6cc7-4fbb-8718-2dcd08b8899b', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_contentbits': '2', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_enabled': 'true', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_method': 'Privileged', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_name': 'CLAPUBLIC', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_setdate': '2024-10-08T15:56:53Z', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_siteid': 'e0fd434d-ba64-497b-90d2-859c472e1a92', 'page': 1, 'page_label': '2', 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'docs/key-fact-statement.pdf', 'subject': 'Key Fact Statement', 'title': 'Key Fact Stateme

In [15]:
# Retriever
retriever = vectordb.as_retriever()

In [16]:
retriever.invoke("Gold Card APR")

[Document(id='1b241775-951a-4ba2-aa70-07817898dae1', metadata={'author': 'jason.c.f.chow@hsbc.com.hk', 'creationdate': '2024-10-18T14:40:05+08:00', 'creator': 'Microsoft® Word for Microsoft 365', 'keywords': 'RESTRICTED -', 'moddate': '2024-10-18T14:40:05+08:00', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_actionid': '8c75c534-6cc7-4fbb-8718-2dcd08b8899b', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_contentbits': '2', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_enabled': 'true', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_method': 'Privileged', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_name': 'CLAPUBLIC', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_setdate': '2024-10-08T15:56:53Z', 'msip_label_3486a02c-2dfb-4efe-823f-aa2d1f0e6ab7_siteid': 'e0fd434d-ba64-497b-90d2-859c472e1a92', 'page': 1, 'page_label': '2', 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'docs/key-fact-statement.pdf', 'subject': 'Key Fact Statement', 'title': 'Key Fact Stateme