- The user provides (uploads) a PDF file.
- The model then answers questions about the PDF's contents in a Q&A format.

In [17]:
# module to load pdf
from langchain.document_loaders import PyPDFLoader
# word embedding and then save as vectorDB
from langchain.indexes.vectorstore import VectorstoreIndexCreator, VectorStoreIndexWrapper
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

from dotenv import load_dotenv
import os

# conda install fastapi
# pip install "uvicorn[standard]"

# Read variables from a local .env file into the process environment
# (presumably this is where the OpenAI API key is kept — TODO confirm).
load_dotenv()

# PDF to index, given relative to the notebook's working directory.
file_path = "./JobShopSchedullingArticle.pdf"
# Root directory; get_index_path() joins an index name onto it, so every
# named index gets its own sub-directory here.
local_persist_path = "./vector_store"

def get_index_path(index_name):
    """Return the directory in which the index called ``index_name`` is stored.

    The result is ``index_name`` joined onto the module-level
    ``local_persist_path`` root.
    """
    index_dir = os.path.join(local_persist_path, index_name)
    return index_dir

def load_pdf_and_save_to_index(file_path, index_name):
    """Build a vector index from a PDF and persist it under ``index_name``.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        index_name: Name of the index; its storage directory is resolved
            with ``get_index_path``.

    Returns:
        None. The index is only written to disk; use ``load_index`` to read
        it back.
    """
    pdf_loader = PyPDFLoader(file_path)
    persist_dir = get_index_path(index_name)

    # No embedding is passed explicitly; by default VectorstoreIndexCreator
    # uses the OpenAI API.
    creator = VectorstoreIndexCreator(
        vectorstore_kwargs={'persist_directory': persist_dir}
    )
    built_index = creator.from_loaders([pdf_loader])

    # Write the vector DB to disk so the vectors need not be regenerated
    # next time.
    built_index.vectorstore.persist()

# load the exported vectorDB
def load_index(index_name):
    """Open a previously persisted index and wrap it for querying.

    Args:
        index_name: Name of the index; its storage directory is resolved
            with ``get_index_path``.

    Returns:
        A ``VectorStoreIndexWrapper`` around a ``Chroma`` store opened on
        that directory.
    """
    # The embedding must be the same API that VectorstoreIndexCreator used
    # when the index was built.
    store = Chroma(
        persist_directory=get_index_path(index_name),
        embedding_function=OpenAIEmbeddings(),
    )
    return VectorStoreIndexWrapper(vectorstore=store)


def query_index(index, query, chain_type="map_reduce"):
    """Ask ``index`` a question and return only the answer text.

    Args:
        index: Object exposing ``query_with_sources(query, chain_type=...)``
            that returns a mapping containing an ``'answer'`` key (e.g. the
            wrapper returned by ``load_index``).
        query: The question to ask.
        chain_type: Forwarded unchanged to ``query_with_sources``. Defaults
            to ``"map_reduce"``, the value this function previously
            hard-coded, so existing callers behave exactly as before.

    Returns:
        The value stored under ``'answer'`` in the result, unmodified.

    Raises:
        KeyError: If the result has no ``'answer'`` entry.
    """
    result = index.query_with_sources(query, chain_type=chain_type)
    return result['answer']

In [10]:
# export to local
# Build an index only when its directory does not exist yet. Re-running this
# cell would otherwise re-embed the PDF (paid API calls) and is expected to
# append the same chunks to the already-persisted Chroma collection again.
# Delete the index directory to force a rebuild, e.g. after the PDF changes
# or after an interrupted run left a partial directory behind.
for _index_name in ('test1', 'test2'):
    if not os.path.isdir(get_index_path(_index_name)):
        load_pdf_and_save_to_index(file_path=file_path, index_name=_index_name)

In [14]:
# Re-open the persisted 'test2' index from disk instead of rebuilding it.
index_test2 = load_index(index_name='test2')

In [15]:
# Plain query: no chain_type is passed, so the wrapper's default is used.
index_test2.query("who is the first author?")

' C. Özgüven'

In [16]:
# Note: the plain `query` call above does not work when the text input is too large.
# With chain_type="map_reduce", only a small segment is passed at a time and the
# partial results are combined at a later stage.
index_test2.query_with_sources(
    "what the article is about?", 
    chain_type="map_reduce"
    )

{'question': 'what the article is about?',
 'answer': ' The article is about solving the Flexible Job Shop Scheduling Problem (FJSP) using different approaches such as integer programming, heuristics, and simulated annealing. It also discusses extensions to the JSP, including routing flexibility and process plan flexibility. The article was published in the journal Applied Mathematical Modelling in 2010.\n',
 'sources': './JobShopSchedullingArticle.pdf'}

In [18]:
# Same question through the query_index helper, which returns only the
# 'answer' field of the map_reduce result.
query_index(index=index_test2, query='what the article is about?')

" The article is about applied mathematical modelling in the context of solving the Flexible Job Shop Scheduling Problem (FJSP) using different heuristics and mathematical models. It also discusses two extensions to the JSP: routing flexibility and process plan flexibility. The article is based on a dissertation from Erciyes University's Social Sciences Institute in Kayseri, Turkey in 2004, written by C. Özgüven et al. and published in the journal Applied Mathematical Modelling in 2010.\n"