In [4]:
# !pip uninstall -y langchain langchain_openai langchain_community langchain_core numpy tensorflow
# !pip install --quiet -U openai pypdf chromadb
# !pip install --quiet  -U langchain-community langchain-openai langchain numpy numba

In [6]:
# Import required libraries after ensuring they're installed properly
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.memory import ConversationBufferMemory

In [8]:
# Colab only: uncomment the lines below to load the OpenAI API key from Colab secrets.
# import os
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [9]:
def load_and_process_document(pdf_path):
    """
    Read a PDF and break it into overlapping text chunks.

    Every loaded document has its metadata replaced by a fixed record
    (source path, 1-based position in the loader output, document type)
    before splitting.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        list: Chunks produced by the text splitter
    """
    pages = PyPDFLoader(pdf_path).load()

    # Overwrite (not merge) whatever metadata the loader attached; numbering
    # starts at 1 and follows the order of the loader's output.
    for page_number, page in enumerate(pages, start=1):
        page.metadata = dict(
            source=pdf_path,
            page=page_number,
            document_type="pdf",
        )

    # 1000-character chunks with 200 characters of overlap; the splitter is
    # also asked to record each chunk's start index.
    splitter_options = dict(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    return splitter.split_documents(pages)

In [10]:
def create_qa_system(chunks, model="gpt-3.5-turbo", temperature=0, k=3):
    """
    Create a QA system using OpenAI models

    Embeds the chunks into a Chroma vector store, exposes it as a top-``k``
    retriever, and wires that retriever to a chat model through a
    "stuff documents" chain that answers only from the retrieved context.

    Args:
        chunks: Document chunks to index (must be non-empty)
        model (str): Chat model name passed to ChatOpenAI
        temperature (float): Sampling temperature passed to ChatOpenAI
        k (int): Number of chunks the retriever returns per question

    Returns:
        retrieval_chain: Chain for answering questions; invoke it with
            ``{"input": question}``
        memory: Conversation memory (a fresh ConversationBufferMemory; it is
            not attached to the chain, callers record turns in it themselves)

    Raises:
        ValueError: If ``chunks`` is empty
    """
    # Fail early with a clear message instead of making embedding and
    # vector-store calls on no data (e.g. a PDF with no extractable text).
    if not chunks:
        raise ValueError(
            "No document chunks to index; the document appears to contain no extractable text."
        )

    # Initialize embeddings
    embeddings = OpenAIEmbeddings()

    # Create vector store
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings
    )

    # Create retriever
    retriever = vector_store.as_retriever(search_kwargs={"k": k})

    # Initialize language model
    llm = ChatOpenAI(model=model, temperature=temperature)

    # Create prompt template; {context} receives the retrieved chunks and
    # {input} the user's question.
    prompt = ChatPromptTemplate.from_template("""
    You are a helpful assistant that answers questions based on provided documents.

    Context information from documents:
    {context}

    Answer the following question based only on the provided context:
    {input}

    If the answer cannot be found in the context, say "I don't have enough information to answer this question."
    """)

    # Create document chain
    document_chain = create_stuff_documents_chain(llm, prompt)

    # Create retrieval chain
    retrieval_chain = create_retrieval_chain(retriever, document_chain)

    # Create memory for tracking conversation history
    memory = ConversationBufferMemory()

    return retrieval_chain, memory


In [11]:
def main():
    """Main function to orchestrate the PDF QA system

    Obtains a PDF (through the Colab upload widget when running in Colab,
    otherwise by asking for a file path), splits it into chunks, builds the
    QA chain, and then answers questions interactively until the user types
    'quit', 'exit' or 'bye'.
    """
    print("=" * 50)
    print("PDF Question Answering System using OpenAI")
    print("=" * 50)

    # `files` is not imported anywhere in the notebook's import cell, so import
    # it here; the guard keeps the function usable outside Colab.
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is not None:
        # Upload PDF in Colab
        print("\nPlease upload a PDF file:")
        uploaded = files.upload()

        if not uploaded:
            print("No file uploaded. Exiting.")
            return

        pdf_path = next(iter(uploaded))  # Get the (first) uploaded PDF's filename
    else:
        pdf_path = input("\nPath to a PDF file: ").strip()
        if not pdf_path:
            print("No file provided. Exiting.")
            return

    # Process document
    print("\nProcessing document...")
    chunks = load_and_process_document(pdf_path)
    print(f"Document processed into {len(chunks)} chunks.")

    # Nothing to index (e.g. no extractable text): stop before setting up the
    # QA system rather than failing inside it.
    if not chunks:
        print("No text could be extracted from the document. Exiting.")
        return

    # Create QA system
    print("\nSetting up QA system...")
    qa_chain, memory = create_qa_system(chunks)
    print("QA system ready!")

    # Example interaction loop
    print("\n" + "=" * 50)
    print("You can now ask questions about the document.")
    print("Type 'quit' to exit.")
    print("=" * 50)

    exit_words = {'quit', 'exit', 'bye'}

    while True:
        # Strip so that " quit " still exits and blank lines are detected
        question = input("\nYour question: ").strip()
        if question.lower() in exit_words:
            print("\nThank you for using the PDF QA system. Goodbye!")
            break

        # Don't spend a retrieval + model call on an empty question
        if not question:
            continue

        try:
            # Process question and get response
            response = qa_chain.invoke({"input": question})
            answer = response["answer"]
            print("\nAnswer:", answer)

            # Record the turn; the chain is invoked with the question only, so
            # this memory serves purely as a transcript of the session.
            memory.save_context({"input": question}, {"output": answer})

        except Exception as e:
            # Deliberately broad: keep the session alive on any per-question failure
            print(f"\nError: {e}")
            print("Please try rephrasing your question or check your API key.")


In [None]:
# Start the interactive session only when this code runs as the main module
if __name__ == "__main__":
    main()