In [16]:
# Dependencies: uncomment and run once per fresh Colab runtime.
# !pip install -U langchain-community
# !pip install langchain_huggingface
# !pip install pypdf
# !pip install chromadb

In [22]:
import shutil
import tempfile
from google.colab import userdata
from google.colab import files
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType

In [26]:
import os
# Disable TensorFlow imports to avoid modeling_tf_utils error.
# NOTE(review): this flag is set in a cell that runs after the imports cell;
# if that cell already pulled in transformers, the flag may come too late.
# Confirm, or move this assignment above the imports.
os.environ["TRANSFORMERS_NO_TF"] = "1"

# Set Hugging Face API token from Colab Secrets.
# `userdata` comes from the `google.colab` import in the imports cell, so that
# cell must have been run before this one. HuggingFaceEndpoint is constructed
# later without an explicit token and is expected to read this variable.
# Assumes a Colab secret named HF_TOKEN exists and is shared with this
# notebook. TODO confirm before running on a fresh runtime.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HF_TOKEN')

In [23]:
def load_and_process_document(pdf_path):
    """
    Read a PDF with PyPDFLoader and cut it into overlapping text chunks.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``
        for the loaded documents.
    """
    pages = PyPDFLoader(pdf_path).load()

    # Whatever metadata the loader attached is replaced wholesale by these
    # three keys; "page" is the 1-based position in the loader's output.
    for page_number, page in enumerate(pages, start=1):
        page.metadata = {
            "source": pdf_path,
            "page": page_number,
            "document_type": "pdf",
        }

    # Chunks of up to 1000 units (measured with len) with 200 units of overlap.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    return splitter.split_documents(pages)

def create_vector_store(chunks):
    """
    Embed ``chunks`` and store them in a Chroma DB under a new temp directory.

    Args:
        chunks: Documents to index, passed to ``Chroma.from_documents``.

    Returns:
        A ``(vector_store, persist_directory)`` tuple. The directory is created
        with ``tempfile.mkdtemp`` and is not removed here, so cleaning it up is
        left to the caller.
    """
    # A fresh directory per call, so successive runs never share a DB location.
    persist_directory = tempfile.mkdtemp()
    print(f"Creating Chroma DB at: {persist_directory}")

    # Embeddings are pinned to the CPU device.
    device_options = {'device': 'cpu'}
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs=device_options,
    )

    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=persist_directory,
    )
    return store, persist_directory

def setup_conversational_chain(vector_store):
    """
    Build a ConversationalRetrievalChain that answers from ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever``; it is queried with
            similarity search for the top 3 matches.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``, wired
        to a buffer memory stored under the ``chat_history`` key.
    """
    # No token is passed here; the endpoint client is expected to pick it up
    # from os.environ["HUGGINGFACEHUB_API_TOKEN"].
    endpoint_llm = HuggingFaceEndpoint(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        task="text-generation",
        temperature=0.1,
        max_new_tokens=512,
    )

    # Conversation history, kept as message objects rather than one string.
    history = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )

    # Prompt used when combining the retrieved documents into an answer.
    answer_template = """You are a knowledgeable assistant. Use the following context and chat history to answer the question concisely and accurately.

        Context: {context}

        Chat History: {chat_history}

        Question: {question}

        Answer: """
    answer_prompt = PromptTemplate(
        input_variables=["chat_history", "question", "context"],
        template=answer_template,
    )

    top_k_retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )

    return ConversationalRetrievalChain.from_llm(
        llm=endpoint_llm,
        retriever=top_k_retriever,
        memory=history,
        combine_docs_chain_kwargs={"prompt": answer_prompt},
    )

def setup_qa_system(chunks):
    """
    Build the conversational QA chain on a Chroma store with no persist directory.

    Alternative to ``create_vector_store`` that passes ``persist_directory=None``
    to Chroma, so no temporary directory is created or handed back.

    Args:
        chunks: Documents to embed and index, passed to ``Chroma.from_documents``.

    Returns:
        The chain returned by ``setup_conversational_chain`` for the new store.
    """
    # Use HuggingFace embeddings, pinned to the CPU device.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )

    # Create the vector store without a persist directory.
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=None  # In-memory store
    )

    # The LLM, memory, prompt and chain wiring used to be duplicated here
    # line-for-line; delegating keeps a single definition of that configuration.
    return setup_conversational_chain(vector_store)

def main():
    """
    Run the interactive PDF question-answering session in Colab.

    Prompts for a PDF upload, chunks it, builds the QA chain and then answers
    questions typed by the user until they enter 'quit'.

    Returns:
        None. Returns early, with a printed message, when nothing was uploaded
        or when the document yields no text chunks.
    """
    # Upload PDF in Colab
    print("Please upload a PDF file:")
    uploaded = files.upload()
    if not uploaded:
        # A cancelled upload gives an empty mapping; indexing it would raise.
        print("No file was uploaded; nothing to do.")
        return
    pdf_path = next(iter(uploaded))  # First uploaded PDF's filename

    # Process document
    print("Processing document...")
    chunks = load_and_process_document(pdf_path)
    if not chunks:
        # With nothing to index, stop here with a clear message instead of
        # failing later inside the vector store.
        print("No extractable text was found in the PDF; cannot build the QA system.")
        return

    # Set up QA system without persistence issues
    print("Setting up QA system...")
    qa_chain = setup_qa_system(chunks)

    # Example interaction loop
    while True:
        question = input("\nAsk a question about the document (or 'quit' to exit): ").strip()
        if question.lower() == 'quit':
            break
        if not question:
            # Do not spend an LLM call on a blank line.
            continue

        try:
            # Direct invocation of QA chain
            response = qa_chain.invoke({"question": question})
            print("\nAnswer:", response["answer"])
        except Exception as e:
            # Deliberately broad: one failed query should not end the session.
            print(f"Error: {e}")
            print("Try asking a different question or rephrase your query.")


In [24]:
# Start the interactive session only when this code is executed directly
# (as a notebook cell or script), not when it is imported as a module.
if __name__ == "__main__":
    main()

Please upload a PDF file:


Saving basics-of-data-science-kpk.pdf to basics-of-data-science-kpk (4).pdf
Processing document...
Setting up QA system...

Ask a question about the document (or 'quit' to exit): what is data science?





Answer: 

Data science is a multidisciplinary field that combines mathematical, statistical, and computational methods to extract knowledge and insights from structured and unstructured data. It is used to analyze large amounts of data to extract meaningful insights for business decision-making. Data science can help detect fraud, prevent monetary losses, build intelligence in machines, enable better and faster decision-making, and recommend the right products to the right customers.

Ask a question about the document (or 'quit' to exit): quit
