<img src='https://raw.githubusercontent.com/Evogelpohl/linkArtifacts/main/pdf_openai.png'>

<img src='https://raw.githubusercontent.com/Evogelpohl/linkArtifacts/main/pdf_openai_2.png'>



## Install packages

In [None]:
# %pip (rather than a bare `pip`) installs into the environment of the running kernel
%pip install -q pdf2image pytesseract reportlab pinecone-client Pillow

## Import libraries

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os
import platform
from pathlib import Path

import pytesseract
from PIL import Image, ImageOps
from pdf2image import convert_from_path

In [11]:
# Key paths and settings for the PDF OCR process.

# Tesseract and Poppler are external programs that must be installed separately.
# These two locations are only applied when the notebook runs on Windows.
TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
POPPLER_PATH = r"C:\Program Files\poppler-23.01.0\Library\bin"

# Working directory that holds every input and output of this notebook
LLM_DIRECTORY = Path(r"C:\Temp\pdf_to_openai_chat")
LLM_DIRECTORY.mkdir(parents=True, exist_ok=True)

# Place the PDFs to be processed here. The folder is created up front so it
# already exists when the OCR cell searches it for "*.pdf" files.
SRC_PDFS_DIRECTORY = LLM_DIRECTORY / "src_pdfs"
SRC_PDFS_DIRECTORY.mkdir(parents=True, exist_ok=True)

# One sub-folder of page images per PDF is written below this directory
PAGES_DIRECTORY = LLM_DIRECTORY / "pages"
PAGES_DIRECTORY.mkdir(parents=True, exist_ok=True)

# One "<pdf name>_text.txt" file per PDF is written here
TEXT_OUTPUT_DIRECTORY = LLM_DIRECTORY / "text_output"
TEXT_OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

## PDF image OCR process, saving the results to a text file

In [None]:
# Point pytesseract at the configured Tesseract executable when running on Windows;
# on any other platform pytesseract's own default command is left untouched.
if platform.system() == "Windows":
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

def create_page_images_pdf2image(pdf_path, output_directory, dpi=300):
    """
    Convert a PDF file into JPEG images, one file per page.

    Pages are rendered with pdf2image's ``convert_from_path`` and saved as
    ``page_001.jpg``, ``page_002.jpg``, ... inside ``output_directory``.

    :param pdf_path: Path to the input PDF file.
    :param output_directory: Directory to save the generated JPEG images
        (``str`` or ``Path``). It is created if it does not exist yet.
    :param dpi: DPI for the generated images.
    :return: A list of ``Path`` objects for the generated images, in page order.
    """
    # Accept plain strings as well as Path objects, and make sure the target exists
    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    convert_args = {"pdf_path": pdf_path, "dpi": dpi}

    # On Windows, pass the configured Poppler location explicitly
    if platform.system() == "Windows":
        convert_args["poppler_path"] = POPPLER_PATH

    pdf_pages = convert_from_path(**convert_args)

    image_file_list = []
    for page_number, page in enumerate(pdf_pages, start=1):
        # Zero-padded page numbers keep the files in page order when sorted by name
        output_file = output_directory / f"page_{page_number:03}.jpg"
        page.save(output_file, "JPEG")
        image_file_list.append(output_file)
        print(f"Image created: {output_file}")

    return image_file_list


def convert_to_bw(image_list, threshold=128):
    """
    Convert a list of images to black and white, overwriting each file in place.

    Each image is converted to grayscale and then thresholded to a 1-bit image.

    :param image_list: List of input image file paths.
    :param threshold: Grayscale cut-off; pixels with a value below it become
        black (0) and all others white (255). Defaults to 128.
    """
    for image_file in image_list:
        # The context manager closes the source file before it is overwritten below
        with Image.open(image_file) as image:
            gray_image = ImageOps.grayscale(image)
        bw_image = gray_image.point(lambda x: 0 if x < threshold else 255, '1')
        bw_image.save(image_file)


def ocr_images(image_list, text_output_path, encoding=None):
    """
    Perform OCR on a list of images and append the extracted text to a file.

    The output file is opened in append mode, so text already in the file is
    kept and the new text is added after it.

    :param image_list: List of input image file paths.
    :param text_output_path: File path to save the extracted text.
    :param encoding: Text encoding for the output file. ``None`` (the default)
        uses the platform's default encoding; pass e.g. ``"utf-8"`` when the OCR
        output may contain characters the platform default cannot represent.
    """
    with open(text_output_path, "a", encoding=encoding) as output_file:
        for image_file in image_list:
            # Close each page image as soon as its text has been extracted
            with Image.open(image_file) as image:
                text = pytesseract.image_to_string(image)
            # Drop "hyphen + newline" pairs, presumably to re-join words that were
            # hyphenated across a line break
            text = text.replace("-\n", "")
            output_file.write(text)
            print(f"OCR processed: {image_file}")


# Drive the whole OCR pipeline: every PDF in the source folder is rendered to page
# images, thresholded to black and white, and OCR'd into "<pdf name>_text.txt".
if __name__ == "__main__":
    # Sorted so the PDFs are processed in a stable, predictable order
    pdf_files = sorted(SRC_PDFS_DIRECTORY.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {SRC_PDFS_DIRECTORY}")

    for pdf_file in pdf_files:
        pdf_name = pdf_file.stem

        # Each PDF gets its own sub-folder of page images
        pages_subdir = PAGES_DIRECTORY / pdf_name
        pages_subdir.mkdir(parents=True, exist_ok=True)
        image_file_list = create_page_images_pdf2image(pdf_file, pages_subdir, dpi=300)
        convert_to_bw(image_file_list)

        text_output_file = TEXT_OUTPUT_DIRECTORY / f"{pdf_name}_text.txt"
        # ocr_images appends to its output file, so remove the result of an earlier
        # run first; otherwise re-running this cell would duplicate the text.
        if text_output_file.exists():
            text_output_file.unlink()

        ocr_images(image_file_list, text_output_file)


## Load our data (the output of the OCR process)

In [12]:
# Let's use one of the content loaders from LangChain to read our OCR'd text file.
# The path is built from TEXT_OUTPUT_DIRECTORY (the folder the OCR step writes to)
# rather than repeating the absolute path here.
loader = TextLoader(str(TEXT_OUTPUT_DIRECTORY / "Troubled_Passage_text.txt"))

In [13]:
# Load the text file through the loader and report how much content came back:
# the number of documents returned and the character count of the first one.
data = loader.load()
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 1 document(s) in your data
There are 1467780 characters in your document


In [14]:
# The original doc has **far** too many characters to send to our LLM in one request,
# so we break it down into multiple smaller documents (chunks).
# Experiment with CHUNK_SIZE and CHUNK_OVERLAP accordingly.
CHUNK_SIZE = 2500    # upper bound on the size of each chunk
CHUNK_OVERLAP = 0    # amount of text shared between consecutive chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
texts = text_splitter.split_documents(data)
print(f'Based on our chunk_size, we now have {len(texts)} documents that will be sent to the \n'
      f'LLM when needed to fulfill the answer to a question')

Based on our chunk_size, we now have 650 documents that will be sent to the 
LLM when needed to fulfill the answer to a question


## Create the embeddings of our documents

In [15]:
import os
from getpass import getpass

# An OpenAI (or Azure OpenAI) API key is required; LLM usage is charged against it.
# Use the key from the environment when one is set; otherwise prompt for it without
# echoing the input and store it in os.environ as well.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None:
    OPENAI_API_KEY = getpass("Enter your OpenAI API Key: ")
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Pinecone is a service that stores the embedding vectors of the documents we split
# (numeric representations of the text that let similar passages be found by
# comparing vectors). Other vector stores exist: FAISS, ChromaDB, etc.

PINECONE_API_ENV = "us-east4-gcp"

# Reuse a key already defined in this kernel session; otherwise take it from the
# PINECONE_API_KEY environment variable, and only prompt when neither is available.
if "PINECONE_API_KEY" not in globals():
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Enter your Pinecone API Key: ")

In [18]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

# Create the embeddings client (it turns text into vectors) using OpenAI's embeddings model
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_API_ENV
)

# Create an index (a database) for our embeddings, unless it already exists.
# The guard lets this cell be re-run without create_index failing on a duplicate name.
index_name = 'faa-story-002'
if index_name not in pinecone.list_indexes():
    # dimension must match the length of the embedding vectors; 1536 is presumably
    # the size produced by the default OpenAI embeddings model - TODO confirm if the
    # model is ever changed
    pinecone.create_index(name=index_name, dimension=1536, metric="cosine")

# Embed every chunk and send the vectors to Pinecone for temporary storage and usage
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [19]:
# We'll create a sample query and use similarity_search to find which chunks are
# relevant (most similar) to our query
query = "Summarize the Author’s Preface"
docs = docsearch.similarity_search(query, include_metadata=True)

num_docs = len(docs)
# The two adjacent f-strings are concatenated as-is, so the first one has to end with
# a space; without it the message printed "splitfrom".
print(f'There are {num_docs} documents out of the {len(texts)} produced, or split '
      f'from the original doc that are relevant (similar) to your search term')

There are 4 documents out of the 650 produced, or splitfrom the original doc that are relevant (similar) to your search term


## Let's set up our connection to the LLM so we can ask it questions.

In [20]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

# Configure the OpenAI LLM that will answer our questions
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,  # our key to OpenAI or Azure's OpenAI LLM
    temperature=0,                  # 0-1: how creative (higher) or literal (lower) the model is
    max_tokens=-1,                  # the number of tokens to return, -1 == max
)

# Build the question-answering chain. With chain_type="stuff" the supplied documents are
# "stuffed" (literally) into the prompt at question time. Other chain types exist;
# experiment with map_reduce.
chain = load_qa_chain(llm, chain_type="stuff")

## Let's use our connection to the LLM and ask it questions.

In [21]:
# The question we want answered from the book
query = "Summarize the author's preface."

# Retrieve from the vector store the chunks of the original text that are most
# similar to the question
docs_to_search = docsearch.similarity_search(query, include_metadata=True)

# Send the retrieved chunks plus the question to the LLM. As the last expression of
# the cell, the returned answer is displayed below.
chain.run(input_documents=docs_to_search, question=query)


' The author expresses his appreciation for the assistance he received from many individuals in researching and writing the book, including Nick A. Komons, Glyndon P. Bennett, Gerald E. Lavey, John G. Leyden, Billy E. Bays, Claude S. Brinegar, William T. Coleman, Jr., John H. Shaffer, Alexander P. Butterfield, John L. McLucas, James E. Dow, Charles O. Cary, John F. Leyden, Richard P. Hallion, William M. Leary, and Richard K. Smith. He also expresses his appreciation for the help of librarians and archivists at various institutions.'

In [22]:
# The question: pull the names of key individuals out of the matching text
query = "Extract key individual's names from this text"

# Retrieve from the vector store the chunks of the original text that are most
# similar to the question
docs_to_search = docsearch.similarity_search(query, include_metadata=True)

# Send the retrieved chunks plus the question to the LLM. As the last expression of
# the cell, the returned answer is displayed below.
chain.run(input_documents=docs_to_search, question=query)

' Nick A. Komons, Glyndon P. Bennett, Gerald E. Lavey, John G. Leyden, Billy E. Bays, Claude S. Brinegar, William T. Coleman, Jr., John H. Shaffer, Alexander P. Butterfield, John L. McLucas, James E. Dow, and Oscar Bakke.'

In [23]:
# The question: a factual lookup about one specific event described in the book
query = "Tell me about Flight 514 to Midwest including how many passengers were killed."

# Retrieve from the vector store the chunks of the original text that are most
# similar to the question
docs_to_search = docsearch.similarity_search(query, include_metadata=True)

# Send the retrieved chunks plus the question to the LLM. As the last expression of
# the cell, the returned answer is displayed below.
chain.run(input_documents=docs_to_search, question=query)

' Flight 514 was a Boeing 727 en route to the Capital from the Midwest. After consulting his dispatcher, the captain decided to change his destination to Dulles International. The aircraft struck the crest of a ridge near the town of Berryville, killing all aboard.'

In [24]:
# The question: a summary across the passages that discuss hijackings
query = "Summarize the hijacking incidents"

# Retrieve from the vector store the chunks of the original text that are most
# similar to the question
docs_to_search = docsearch.similarity_search(query, include_metadata=True)

# Send the retrieved chunks plus the question to the LLM. As the last expression of
# the cell, the returned answer is displayed below.
chain.run(input_documents=docs_to_search, question=query)

' In the autumn of 1972, two violent hijacking incidents occurred. In October, four men wanted for murder and bank robbery killed a ticket agent and made their way to Havana aboard an Eastern 727. In November, three fugitives seized a Southern Airways DC-9 and embarked on a harrowing journey, making eight landings, collecting ransom, and threatening to crash the aircraft into a nuclear facility. The FBI eventually shot the tires of the DC-9, and the pilot managed to take off and land safely in Havana. These incidents broke the diplomatic impasse and led to a bilateral agreement between the United States and Cuba in February 1973, which included a commitment to return or prosecute hijackers.'

In [25]:
# Remove our Pinecone index now that we are done with it.
# Guarded so that re-running this cell after the index is already gone does nothing
# instead of raising.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(name=index_name)