In [1]:
# import os, json
# from langchain.chains import RetrievalQA
# from langchain_openai import ChatOpenAI
# from langchain_community.document_loaders import UnstructuredPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# from langchain_openai import OpenAIEmbeddings

In [2]:
# !pip install langchain langchain-openai langchain-community faiss-cpu unstructured

In [3]:
# !pip install langchain

In [4]:
import os, json
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import faiss
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [5]:
# Load runtime settings from config.json (kept out of the notebook so the API
# key is never hardcoded in a cell).
# The encoding is given explicitly: open() otherwise uses the locale's default
# encoding, which is not UTF-8 on every platform and can mis-decode or reject
# non-ASCII characters (e.g. in directory paths).
with open('config.json', encoding='utf-8') as f:
    config = json.load(f)

# Required keys; a missing key raises KeyError here rather than failing later.
OPENAI_API_KEY = config['OPENAI_API_KEY']
DATA_DIR = config['DATA_DIR']  # directory scanned for PDF files
VECTOR_STORE_PATH = config['VECTORS_DIR']  # folder the FAISS index is saved to / loaded from

In [6]:
def load_and_split_pdfs(data_dir, chunk_size=1000, chunk_overlap=200):
    """Load every PDF directly inside ``data_dir`` and split it into chunks.

    Args:
        data_dir: Directory to scan for PDF files (not recursive).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The list returned by ``splitter.split_documents`` for all loaded
        documents, in sorted-filename order.

    Raises:
        ValueError: If no documents were loaded from ``data_dir``.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the chunk
    # order (and therefore indices such as chunks[4]) reproducible across runs.
    pdf_paths = [
        os.path.join(data_dir, filename)
        for filename in sorted(os.listdir(data_dir))
        # Case-insensitive match so files named "*.PDF" are not silently skipped.
        if filename.lower().endswith(".pdf")
    ]

    all_docs = []
    for pdf_path in pdf_paths:
        # A directory that merely happens to be named "*.pdf" is not a document.
        if not os.path.isfile(pdf_path):
            continue
        all_docs.extend(UnstructuredPDFLoader(pdf_path).load())

    if not all_docs:
        raise ValueError("No documents found in the directory.")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(all_docs)

# Build the chunk list once; the vectorstore cell below embeds it.
# Note: the number printed is the count of chunks, not of source PDF files.
chunks = load_and_split_pdfs(data_dir=DATA_DIR)
print(f"Loaded and splitted {len(chunks)} documents.")

  from .autonotebook import tqdm as notebook_tqdm


Loaded and splitted 93 documents.


In [27]:
# One-off installs, left commented out so a re-run does not reinstall them.
# NOTE(review): presumably extra dependencies required for PDF parsing — confirm.
# !pip install pdfminer.six
# !pip install pi-heif

# Spot-check a single chunk to see what the extracted text looks like.
# NOTE(review): the rendered output of this cell contains real statement data
# (merchants, amounts, an account number) — clear it before sharing the notebook.
chunks[4]


Document(page_content='67.20\n\nWWW.AMAZON.CA ON\n\n006\n\nAug 27 Aug 28 AMZN Mktp CA*R40RY19Y0\n\n6.69\n\nWWW.AMAZON.CA ON\n\n007\n\nAug 27 Aug 28 AMZN Mktp CA*R42UX4IE0\n\n11.14\n\nWWW.AMAZON.CA ON\n\n008\n\nAug 27 Aug 28 AMZN Mktp CA*R45253IH0\n\n26.87\n\n009\n\n010\n\n011\n\n012\n\n013\n\nSep 1\n\nSep 3\n\nSep 5\n\nSep 5\n\nSep 5\n\nSep 3\n\nSep 4\n\nSep 6\n\nSep 6\n\nSep 6\n\nWWW.AMAZON.CA ON IC* INSTACART HALIFAX MID-H NS PAY) AMZN Mktp CA*RK0O06YH0 WWW.AMAZON.CA ON STAPLES STORE #239 VANCOUVER BC (APPLE PAY) STAPLES STORE #239 VANCOUVER BC (APPLE PAY) STAPLES STORE #239 VANCOUVER BC (APPLE PAY)\n\n(APPLE\n\n35.47\n\n48.15\n\n0.63\n\n1.26\n\n16.90\n\nSUB-TOTAL CREDITS - 4537 001 770 517 011 SUB-TOTAL DEBITS - 4537 001 770 517 011\n\n$0.00 $398.73\n\nInterest charges posted on statement date\n\nCash advances/cheques\n\n$0.00\n\nSpecial rate offers\n\n$0.00\n\nPurchases\n\n$0.00\n\nStatement Period Statement date Account # Page\n\n- Sep 6, 2024 Sep 6, 2024 4537 001 770 517 011 1 of

In [8]:
def create_save_vectorstore(chunks, vectorestore_dir):
    """Embed ``chunks`` into a FAISS vectorstore and save it to disk.

    Args:
        chunks: Non-empty sequence of documents to embed (as produced by
            ``load_and_split_pdfs``).
        vectorestore_dir: Folder passed to ``save_local``.

    Returns:
        None. Prints a confirmation line once the store has been saved.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail fast, before any embedding client is created: there is nothing to
    # index, and a clear error here is easier to act on than a failure raised
    # from inside the vectorstore construction.
    if not chunks:
        raise ValueError("No chunks to embed; cannot build a vectorstore.")

    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    store = FAISS.from_documents(documents=chunks, embedding=embeddings)
    store.save_local(vectorestore_dir)
    print(f"Vectorstore saved to {vectorestore_dir}")

# Embed the chunks and persist the index so later cells can reload it from disk.
create_save_vectorstore(chunks=chunks, vectorestore_dir=VECTOR_STORE_PATH)

Vectorstore saved to vectorstore


In [9]:
def load_vectorstore(vectorstore_dir):
    """Reload a FAISS vectorstore previously saved under ``vectorstore_dir``.

    Args:
        vectorstore_dir: Folder passed to ``FAISS.load_local`` as ``folder_path``.

    Returns:
        Whatever ``FAISS.load_local`` returns for that folder.
    """
    embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    # SECURITY: allow_dangerous_deserialization=True explicitly opts in to
    # loading serialized data the library flags as dangerous. Only point this
    # at a folder written by this notebook, never at files from an untrusted
    # source.
    store = FAISS.load_local(
        folder_path=vectorstore_dir,
        embeddings=embedder,
        allow_dangerous_deserialization=True,
    )
    return store

In [10]:
def ask_rag(question):
    """Answer ``question`` with a RetrievalQA chain over the saved vectorstore.

    The vectorstore is reloaded from ``VECTOR_STORE_PATH`` and a fresh chain is
    built on every call.

    Args:
        question: The query handed to the chain's ``invoke``.

    Returns:
        The value returned by ``invoke`` (in this notebook's recorded output, a
        dict with ``'query'`` and ``'result'`` keys).
    """
    # Keyword arguments are evaluated left to right, so the store is loaded
    # (and the retriever created) before the chat model is constructed.
    chain = RetrievalQA.from_chain_type(
        retriever=load_vectorstore(VECTOR_STORE_PATH).as_retriever(),
        llm=ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
    )
    return chain.invoke(question)

In [32]:
# Example query; requires the vectorstore to have been saved by the earlier cell.
ask_rag(question="Can you tell me how much I spent in total on groceries last week? plus the date I made this transaction?")

{'query': 'Can you tell me how much I spent in total on groceries last week? plus the date I made this transaction?',
 'result': "I'm sorry, but I don't have the specific information about your grocery transactions from last week in the provided context. The transactions listed do not specify grocery purchases. You may need to review your statement or transaction history to find the total amount spent on groceries and the date of the transaction."}

In [12]:
# !pip uninstall faiss-cpu faiss-gpu