# Chat With Anything - From PDF Files to Image Documents
Author: Zoumana KEITA   
https://medium.com/@zoumanakeita

In [None]:
import warnings
# Install a global "ignore" filter so no warnings are displayed from here on.
warnings.filterwarnings('ignore')

: 

### Install the requirements

In [None]:
%%bash

pip -q install langchain faiss-cpu unstructured
pip -q install openai tiktoken
pip -q install pytesseract pypdf

# Chat & Query your PDF files

## Detect Document Type

In [None]:

from filetype import guess

def detect_document_type(document_path):
    """Classify a file as a PDF, an image, or something else.

    The classification is based on the extension reported by
    ``filetype.guess`` for ``document_path``.

    Args:
        document_path: Path of the file to inspect (passed straight to
            ``filetype.guess``).

    Returns:
        ``"pdf"`` for PDF files, ``"image"`` for jpg/jpeg/png/gif files and
        ``"unkown"`` for everything else. The misspelling of the last value
        is kept on purpose so existing comparisons keep working.
    """
    image_types = ('jpg', 'jpeg', 'png', 'gif')

    guess_file = guess(document_path)

    # ``guess`` yields None when the file type cannot be identified; treat
    # that as an unknown type instead of failing on ``None.extension``.
    if guess_file is None:
        return "unkown"

    extension = guess_file.extension.lower()

    if extension == "pdf":
        return "pdf"

    if extension in image_types:
        return "image"

    return "unkown"


In [None]:
# Ad-hoc check of the extension detected for the article document.
# NOTE(review): the extensions handled elsewhere in this notebook are values
# such as "pdf" or "png", so comparing against "image" is presumably always
# False - confirm what this cell was meant to verify.
guess("1148_Notas_2023.pdf").extension.lower() == "image"

In [None]:
# Paths of the two input documents used throughout the notebook.
research_paper_path = "Banco BPI RC 2023port.pdf"
article_information_path = "1148_Notas_2023.pdf"

# Show which category ("pdf", "image" or "unkown") each document falls into.
print(f"Research Paper Type: {detect_document_type(research_paper_path)}")
print(f"Article Information Document Type: {detect_document_type(article_information_path)}")

## Extract Documents Content

In [None]:
!pip install unstructured rank_bm25 pdf2image pdfminer-six pikepdf pypdf unstructured_inference fastapi kaleido uvicorn "pillow<10.1.0" pillow_heif -q

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.image import partition_image
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.document_loaders import UnstructuredFileLoader

"""
YOU CAN UNCOMMENT THE CODE BELOW TO UNDERSTAND THE LOGIC OF THE FUNCTIONS
"""
"""

def extract_text_from_pdf(pdf_file):

    loader = UnstructuredFileLoader(pdf_file)
    documents = loader.load()
    pdf_pages_content = '\n'.join(doc.page_content for doc in documents)

    return pdf_pages_content

def extract_text_from_image(image_file):

    loader = UnstructuredImageLoader(image_file)
    documents = loader.load()

    image_content = '\n'.join(doc.page_content for doc in documents)

    return image_content
"""

def extract_file_content(file_path):
    """Load a PDF or image file and return its text as a single string.

    The document type is determined with ``detect_document_type`` and the
    matching Unstructured loader is used to read the file.

    Args:
        file_path: Path of the PDF or image file to read.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines.

    Raises:
        ValueError: If the file is neither a PDF nor a supported image.
    """
    file_type = detect_document_type(file_path)

    if file_type == "pdf":
        loader = UnstructuredFileLoader(file_path, mode="elements")
    elif file_type == "image":
        loader = UnstructuredImageLoader(file_path)
    else:
        # Without this branch ``loader`` would be unbound below and the call
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(
            f"Unsupported document type for {file_path!r}: "
            "expected a PDF or an image (jpg, jpeg, png, gif)."
        )

    documents = loader.load()

    return '\n'.join(doc.page_content for doc in documents)


In [None]:
# Install other dependencies
# https://github.com/Unstructured-IO/unstructured/blob/main/docs/source/installing.rst
!brew install libmagic
!brew install poppler
!brew install tesseract
# If parsing xml / html documents:
!brew install libxml2
!brew install libxslt

import nltk
nltk.download('punkt')

In [None]:
# NOTE: this rebinds ``UnstructuredFileLoader`` to the ``langchain_community``
# import for the rest of the notebook.
from langchain_community.document_loaders import UnstructuredFileLoader

# Load the article document in "elements" mode to inspect the raw result.
loader = UnstructuredFileLoader("1148_Notas_2023.pdf",  mode="elements")

docs = loader.load()

# Display the loaded documents as the cell output.
docs

In [None]:
#research_paper_content = extract_text_from_pdf(research_paper_path)
#article_information_content = extract_text_from_image(article_information_path)


# Extract the raw text of both documents. ``research_paper_content`` has to be
# defined here: the chunking cell further down splits it, and would otherwise
# fail with a NameError.
research_paper_content = extract_file_content(research_paper_path)
article_information_content = extract_file_content(article_information_path)

In [None]:
# Number of leading characters shown in the preview below.
nb_characters = 3000

#print(f"First {nb_characters} Characters of the Paper: \n{research_paper_content[:nb_characters]}...")
#print("---"*5)
# Print only the first ``nb_characters`` characters, as the label and the
# trailing "..." announce, instead of dumping the whole document.
print(f"First Characters of Article Information Document :\n {article_information_content[:nb_characters]}...")
#article_information_content


## Chat Implementation

### Create Chunks

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter that cuts extracted text into overlapping chunks for embedding.
# The separator is a single newline because ``extract_file_content`` joins the
# loaded elements with "\n"; splitting on "\n\n" would rarely match and could
# leave chunks far larger than ``chunk_size``.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,      # target maximum chunk length, measured by ``len``
    chunk_overlap=200,    # characters shared between consecutive chunks
    length_function=len,
)

In [None]:
# Split the extracted text of each document into chunks.
research_paper_chunks = text_splitter.split_text(research_paper_content)
article_information_chunks = text_splitter.split_text(article_information_content)

# Report how many chunks each document produced.
print(f"# Chunks in Research Paper: {len(research_paper_chunks)}")
print(f"# Chunks in Article Document: {len(article_information_chunks)}")

### Create Embeddings

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
import os

# Placeholder: replace "<YOUR KEY>" with a real OpenAI API key before running.
# This assignment overwrites any OPENAI_API_KEY already set in the environment;
# avoid committing a real key to version control.
os.environ["OPENAI_API_KEY"] = "<YOUR KEY>"

# Embedding model shared by the vector-index helper below.
embeddings = OpenAIEmbeddings()

### Create Vector Index

In [None]:
from langchain.vectorstores import FAISS

def get_doc_search(text_splitter):
    """Build a FAISS vector index from text chunks.

    Despite its name, ``text_splitter`` is handed to ``FAISS.from_texts`` as
    the texts to index; the callers in this notebook pass the list of chunks
    produced by ``split_text``. The module-level ``embeddings`` object is
    used to embed them.

    Returns:
        The object returned by ``FAISS.from_texts``.
    """
    vector_index = FAISS.from_texts(text_splitter, embeddings)
    return vector_index

In [None]:
# Build the vector index for the research paper chunks and show the object.
doc_search_paper = get_doc_search(research_paper_chunks)
print(doc_search_paper)

### Start chatting with your document

In [None]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
# Question-answering chain of type "map_rerank". Intermediate steps are
# requested because ``chat_with_file`` reads ``results['intermediate_steps']``.
chain = load_qa_chain(OpenAI(), chain_type = "map_rerank",
                      return_intermediate_steps=True)

def chat_with_file(file_path, query):
    """Answer ``query`` using the content of the file at ``file_path``.

    The file is extracted, split into chunks, indexed, and the chunks most
    similar to ``query`` are passed to the module-level ``chain``.

    Args:
        file_path: Path of the PDF or image file to question.
        query: Question to ask about the document.

    Returns:
        One entry of the chain's ``intermediate_steps``; callers in this
        notebook read its ``"answer"`` and ``"score"`` keys. The entry with
        the highest score is returned.
    """
    file_content = extract_file_content(file_path)
    file_chunks = text_splitter.split_text(file_content)

    document_search = get_doc_search(file_chunks)
    documents = document_search.similarity_search(query)

    results = chain(
        {
            "input_documents": documents,
            "question": query,
        },
        return_only_outputs=True,
    )

    # The intermediate steps hold one answer/score pair per retrieved chunk,
    # in retrieval order. Taking the first one would report the answer for
    # the most similar chunk, not the best-scored answer, so pick the highest
    # score (the first one wins on ties).
    return max(
        results['intermediate_steps'],
        key=lambda step: int(step["score"]),
    )

##### Chat with the image file

In [None]:
# Question asked about the article-information document.
query = "What is the document about"

results = chat_with_file(article_information_path, query)

# The result is read through its "answer" and "score" keys.
answer = results["answer"]
confidence_score = results["score"]

print(f"Answer: {answer}\n\nConfidence Score: {confidence_score}")

##### Chat with the PDF file

In [None]:
# Question asked about the document at ``research_paper_path``.
query = "Why is the self-attention approach used in this document?"

results = chat_with_file(research_paper_path, query)

# The result is read through its "answer" and "score" keys.
answer = results["answer"]
confidence_score = results["score"]

print(f"Answer: {answer}\n\nConfidence Score: {confidence_score}")

# Congratulations!  

Made with ❤️ by Zoumana KEITA