In [23]:
import os
from dotenv import load_dotenv
import zipfile

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
import chromadb
from dotenv import load_dotenv
import pickle

load_dotenv()

True

In [12]:
def load_chunk_persist_pdf() -> Chroma:
    pdf_folder_path = "./pdfs/"
    documents = []
    for file in os.listdir(pdf_folder_path):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunked_documents = text_splitter.split_documents(documents)
    client = chromadb.Client()
    if client.list_collections():
        consent_collection = client.create_collection("consent_collection")
    else:
        print("Collection already exists")
    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=OpenAIEmbeddings(),
        persist_directory="./chroma_store"
    )
    vectordb.persist()
    return vectordb

In [8]:
def create_agent_chain():
    model_name = "gpt-3.5-turbo"
    llm = ChatOpenAI(model_name=model_name)
    chain = load_qa_chain(llm, chain_type="stuff")
    return chain

In [18]:
def get_llm_response(query, vectordb):
    chain = create_agent_chain()
    matching_docs = vectordb.similarity_search(query)
    answer = chain.run(input_documents=matching_docs, question=query)
    return answer

In [24]:
def zip_folder(folder_path, output_path):
    """
    Compresses a folder into a ZIP file.

    Parameters:
    - folder_path: The path to the folder that should be compressed.
    - output_path: The path where the output ZIP file should be saved.
    """
    # Create a ZIP file for writing compressed data
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the directory
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                # Create a relative path for files to maintain the directory structure
                rel_path = os.path.relpath(os.path.join(root, file), os.path.dirname(folder_path))
                zipf.write(os.path.join(root, file), arcname=rel_path)

In [25]:
zip_folder("./chroma_store", "chroma_store.zip")

In [16]:
res = load_chunk_persist_pdf() 

Collection already exists


In [21]:
with open('vector_db.pkl', 'wb') as f:
    pickle.dump(res, f)

TypeError: cannot pickle 'sqlite3.Connection' object

In [19]:
get_llm_response("What is the purpose of this study?", res)


'The purpose of the study is to promote and strengthen measures to prevent and combat corruption more efficiently and effectively, promote international cooperation and technical assistance in the prevention of and fight against corruption, including asset recovery, and to promote integrity, accountability, and proper management of public affairs and public property.'