<img src='https://raw.githubusercontent.com/Evogelpohl/linkArtifacts/main/pdf_openai.png'>

<img src='https://raw.githubusercontent.com/Evogelpohl/linkArtifacts/main/pdf_openai_1.png'>



## Install packages

In [None]:
pip install -q pdf2image pytesseract reportlab pinecone-client

## Import libraries

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Load our data (the text produced by OCR)

In [None]:
# Point one of LangChain's content loaders at the text file that holds our OCR'd text
ocr_text_path = "./text_output/cleaned_file.txt"
loader = TextLoader(ocr_text_path)

In [None]:
# Load the input text and report a few basic facts about it
data = loader.load()

document_count = len(data)
print(f'You have {document_count} document(s) in your data')

character_count = len(data[0].page_content)
print(f'There are {character_count} characters in your document')

In [None]:
# The original doc has **far** too many characters to send to our LLM in one go,
# so we break it down into multiple smaller documents (chunks).
# Experiment with chunk_size to see how it affects the results.
splitter_settings = {"chunk_size": 2000, "chunk_overlap": 0}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
texts = text_splitter.split_documents(data)

chunk_count = len(texts)
print(
    f'Now you have {chunk_count} documents that will be sent to the \n'
    'LLM when needed to fulfill the answer to a question'
)

## Create the embeddings of our documents

In [None]:
import os
from getpass import getpass

# The OpenAI (or Azure OpenAI) API key is how we use, and get charged for, the LLM.
# Use the key from the environment when it is there; otherwise prompt for it
# (without echoing) and store it in this process's environment so re-running
# the cell does not prompt again.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None:
    OPENAI_API_KEY = getpass("Enter your OpenAI API Key: ")
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Pinecone is a hosted vector store: it keeps the embedding vectors of the
# documents you split so that chunks similar to a question can be looked up later.
# Currently, Pinecone is free for use cases like this. Other vector stores exist; FAISS, ChromaDB, etc.

# Pinecone environment of the project; can be overridden through the
# PINECONE_API_ENV environment variable without editing the notebook.
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "us-east4-gcp")

# Resolve the API key in this order:
#   1. a PINECONE_API_KEY variable already defined earlier in this session,
#   2. the PINECONE_API_KEY environment variable (same approach as the OpenAI key),
#   3. an interactive prompt that does not echo the key.
try:
    PINECONE_API_KEY
except NameError:
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass(
        "Enter your Pinecone API Key: "
    )

In [None]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

# Create the embeddings helper (vector representations of our docs) backed by
# OpenAI's embeddings model
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize the Pinecone client with the key and environment resolved earlier
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_API_ENV
)
index_name = "ds-after-action01"

# Embed every chunk and send it to the Pinecone index for storage and later search.
# from_documents is used (rather than from_texts on the bare page_content) so that
# each chunk's metadata is stored alongside its vector; the searches below request
# metadata, and it lets a matching chunk be traced back to where it came from.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [None]:
# A first similarity search against the vector store, to see how many of the
# chunks come back for a question. ("SEABEE" was previously misspelled in this
# query, which feeds the embedding used for the search.)
query = "Summarize the issue with the SEABEE uniform color?"
docs = docsearch.similarity_search(query, include_metadata=True)

num_docs = len(docs)
print(f'There are {num_docs} documents out of the {len(texts)} produced, or split \n'
      f'from the original doc that are relevant (similar) to your search term')

## Let's set up our connection to the LLM so we can ask it questions.

In [None]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

# Parameters that control the behavior of the OpenAI LLM
llm_settings = {
    "temperature": 0,  # how creative vs. literal the model is; 0 is the least creative
    "openai_api_key": OPENAI_API_KEY,  # our key to OpenAI or Azure's OpenAI LLM
    "max_tokens": -1,  # the number of tokens to return, -1 == max
}

# Set up the connection to the LLM
llm = OpenAI(**llm_settings)

# Set up the 'chain': the matching documents are "stuffed" (literally) into the
# prompt sent to the LLM at question-time.
# There are other values for chain_type; experiment with map_reduce.
chain = load_qa_chain(llm, chain_type="stuff")

## Let's use our connection to the LLM and ask it questions.

In [None]:
# The question we want answered
query = "Brief me on the issues related to SEABEE uniform color? Cite the doc sources."

# Pull the chunks of the original text that are most similar to the question
# out of the embeddings store
docs_to_search = docsearch.similarity_search(query, include_metadata=True)

# Hand the question and the matching chunks to the LLM; the answer is shown
# as the output of this cell.
answer = chain.run(question=query, input_documents=docs_to_search)
answer


In [None]:
# The question we want answered
query = "Extract the key individual's names and ranks from the doc. Include LTG, VADM, CAPT, FOCM ranks"

# Pull the chunks of the original text that are most similar to the question
# out of the embeddings store
docs_to_search = docsearch.similarity_search(query, include_metadata=True)

# Hand the question and the matching chunks to the LLM; the answer is shown
# as the output of this cell.
answer = chain.run(question=query, input_documents=docs_to_search)
answer

In [None]:
# The question we want answered
query = "How many Seabee reservists were serving?"

# Pull the chunks of the original text that are most similar to the question
# out of the embeddings store
docs_to_search = docsearch.similarity_search(query, include_metadata=True)

# Hand the question and the matching chunks to the LLM; the answer is shown
# as the output of this cell.
answer = chain.run(question=query, input_documents=docs_to_search)
answer

In [None]:
# The question we want answered
query = "Regarding NAVAL CONSTRUCTION FORCE (NCF) CAPABILITY category, summarize each problem/issues/lesson. Cite your source"

# Pull the chunks of the original text that are most similar to the question
# out of the embeddings store
docs_to_search = docsearch.similarity_search(query, include_metadata=True)

# Hand the question and the matching chunks to the LLM; the answer is shown
# as the output of this cell.
answer = chain.run(question=query, input_documents=docs_to_search)
answer