In [37]:
import os
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

In [38]:
# Load environment variables (including any .env file) and configure the Google API key.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key)
# Never print the key itself: cell outputs are saved inside the notebook file
# and leak the secret to anyone the notebook is shared with.
print("GOOGLE_API_KEY loaded." if api_key else "WARNING: GOOGLE_API_KEY is not set.")

[REDACTED: an API key was printed here — revoke and rotate that key]


In [39]:
# Configure the Tesseract command to the path where Tesseract-OCR is installed.
# Set the TESSERACT_CMD environment variable to override the default Windows
# install location on machines where Tesseract lives elsewhere.
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

In [40]:
# Path to the PDF file. Set the PDF_FILE_PATH environment variable to point the
# notebook at a different document without editing this cell.
PDF_FILE_PATH = os.getenv(
    "PDF_FILE_PATH", "C:/Users/abdul/Desktop/app CD Code/Wiki Document.pdf"
)

In [41]:
def get_pdf_text(pdf_path):
    """Extract the text of a PDF page by page, using OCR for pages without text.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The text of all pages concatenated in page order. Pages whose
        extracted text is empty or whitespace-only contribute their OCR
        result instead.
    """
    doc = fitz.open(pdf_path)
    page_texts = []
    try:
        for page in doc:
            page_text = page.get_text()
            if not page_text.strip():
                # No usable text layer on this page: render it to an image
                # and run Tesseract OCR on it.
                page_text = pytesseract.image_to_string(convert_page_to_image(page))
            page_texts.append(page_text)
    finally:
        # Close the document even when extraction or OCR raises, so the file
        # handle is not leaked.
        doc.close()
    return "".join(page_texts)


In [42]:
def convert_page_to_image(page, zoom=2):
    """Render a PDF page to a PIL image.

    Args:
        page: A PyMuPDF page object (must provide ``get_pixmap``).
        zoom: Scale factor applied to both axes when rendering; larger values
            give a higher-resolution image. Defaults to 2.

    Returns:
        PIL.Image.Image: The rendered page, decoded from PPM bytes.
    """
    scale_matrix = fitz.Matrix(zoom, zoom)
    pixmap = page.get_pixmap(matrix=scale_matrix)
    # Go through an in-memory PPM buffer so PIL can decode the pixmap.
    return Image.open(io.BytesIO(pixmap.tobytes('ppm')))

In [43]:
def get_text_chunks(text, chunk_size=10000, chunk_overlap=1000):
    """Split a large text into overlapping chunks.

    Args:
        text: The full text to split.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 10000.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to 1000.

    Returns:
        The list of chunks produced by ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [44]:
def get_vector_store(text_chunks, index_path="faiss_index", model="models/embedding-001"):
    """Embed text chunks into a FAISS vector store and save it to disk.

    Args:
        text_chunks: Texts to embed and index.
        index_path: Folder the index is written to with ``save_local``.
            Defaults to ``"faiss_index"``.
        model: Embedding model name passed to ``GoogleGenerativeAIEmbeddings``.
            Defaults to ``"models/embedding-001"``.

    Returns:
        The FAISS vector store that was built and saved, so callers can query
        it directly without reloading it from disk.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model=model)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local(index_path)
    return vector_store

In [45]:
def get_conversational_chain():
    """Build a "stuff" question-answering chain around the gemini-pro chat model.

    The prompt tells the model to answer in detail when the answer is in the
    supplied context and to reply 'I don't know.' otherwise.

    Returns:
        The chain created by ``load_qa_chain``; the prompt expects the
        ``context`` and ``question`` variables.
    """
    # The template previously had a stray '?' right after {context}, which was
    # appended to the retrieved context on every call.
    prompt_template = """
    If the answer is available in the context, provide a detailed response. If not, respond with 'I don't know.'\n\n
    Context:\n{context}\n
    Question: \n{question}\n
    Answer:
    """
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain

In [46]:
def main():
    """Index the configured PDF and answer questions about it in a loop.

    Steps: extract text from ``PDF_FILE_PATH``, split it into chunks, build and
    save the FAISS index, load it back, then repeatedly prompt for a question,
    retrieve similar chunks and print the chain's answer. The loop ends when
    the user types 'exit' (case-insensitive) or when input ends (EOF / Ctrl-C).
    """
    print("Loading PDF content...")
    raw_text = get_pdf_text(PDF_FILE_PATH)
    if not raw_text:
        print("No text could be extracted from the PDF. Please check the PDF content.")
        return

    print("Processing text into chunks...")
    text_chunks = get_text_chunks(raw_text)
    if not text_chunks:
        print("No usable text chunks were created from the PDF.")
        return

    print("Creating and storing vector data...")
    get_vector_store(text_chunks)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # NOTE(review): load_local unpickles the stored index. The flag below is
    # acceptable only because the index was written by this same run; never
    # point this at an index file from an untrusted source.
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    chain = get_conversational_chain()

    while True:
        try:
            # Strip so that inputs such as "exit " or " exit" are still recognised.
            user_question = input("Ask a question (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt at the prompt ends the session
            # cleanly instead of surfacing a traceback.
            print("\nExiting...")
            break

        if user_question.lower() == 'exit':
            print("Exiting...")
            break
        if not user_question:
            # Do not spend an embedding/LLM call on an empty question.
            continue

        print("Searching for relevant documents...")
        docs = new_db.similarity_search(user_question)
        if not docs:
            print("Reply: I don't know.")
            continue

        print("Generating response...")
        response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
        print("Reply:", response["output_text"])


In [47]:
# Entry point: start the interactive question-answering session when this code
# is executed as the main module (which is the case when the cell is run).
if __name__ == "__main__":
    main()

Loading PDF content...
Processing text into chunks...
Creating and storing vector data...
Searching for relevant documents...
Generating response...
Reply: Norma McCorvey was the pseudonym of Jane Roe, the plaintiff in the Roe v. Wade case.
Searching for relevant documents...
Generating response...
Reply: This question cannot be answered from the given context.
Searching for relevant documents...
Generating response...
Reply: This context does not mention anything about Linda Coffee, so I cannot answer this question from the provided context.
Searching for relevant documents...
Generating response...
Reply: Yes, Sarah Weddington was Norma McCorvey's lawyer.
Searching for relevant documents...
Generating response...
Reply: Abortion was a fairly common practice in the history of the United States, and was not always a public controversy. At a time when society was more concerned with the serious consequences of women becoming pregnant out of wedlock, family affairs were handled out of pu