In [None]:
# ! uv pip install weaviate-client langchain tiktoken pypdf rapidocr-onnxruntime
# ! uv pip install --upgrade protobuf
# ! uv pip install --upgrade google-api-core google-cloud-core googleapis-common-protos
# ! uv pip install --upgrade google-generativeai langchain-google-genai
# ! uv pip install sentence-transformers
# ! uv pip install weaviate-client==3.26.7
# ! uv pip install langchain pgvector psycopg[binary] sqlalchemy

# ! uv pip uninstall weaviate-client
# ! uv pip install weaviate-client==3.26.2
# ! uv pip install --upgrade langchain

# ! uv pip install psycopg2-binary

In [1]:
# Load environment variables from a .env file before any cell reads
# credentials or connection settings from the environment.
from dotenv import load_dotenv
load_dotenv()

True

In [15]:
from langchain.vectorstores.pgvector import PGVector

In [None]:
# Embedding model used when the chunks are written to the pgvector collection
# further down (passed there as `embedding=embeddings`).
# NOTE(review): assumes an Ollama server is reachable and has the
# "nomic-embed-text" model available — confirm before running.
from langchain.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

#### **Loading Different Types of PDFs with LangChain**

LangChain ships several PDF loaders, each suited to a different kind of PDF (this notebook uses `PyMuPDFLoader`). See the official documentation for the full list of supported loaders and their options:

🔗 [LangChain PDF Document Loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf/)

In [18]:
import os

from langchain.document_loaders import PyMuPDFLoader

# Location of the PDF to index. Set the PDF_PATH environment variable (for
# example in .env) to run the notebook on another machine; the default is the
# original author's local file, so existing behaviour is unchanged.
PDF_PATH = os.environ.get(
    "PDF_PATH",
    r"C:\Users\Admin\Documents\Aptitude\percetnages_merged.pdf",
)

# extract_images=True asks the loader to also process images embedded in the PDF.
# NOTE(review): this presumably relies on the OCR package listed in the install
# cell (rapidocr-onnxruntime) — confirm it is installed.
loader = PyMuPDFLoader(PDF_PATH, extract_images=True)
pages = loader.load()



In [19]:
# Inspect the loaded Document objects (metadata plus extracted page text).
pages

[Document(metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'C:\\Users\\Admin\\Documents\\Aptitude\\percetnages_merged.pdf', 'file_path': 'C:\\Users\\Admin\\Documents\\Aptitude\\percetnages_merged.pdf', 'total_pages': 19, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-06-12T14:43:26+00:00', 'trapped': '', 'modDate': 'D:20250612144326Z', 'creationDate': '', 'page': 0}, page_content='1\nIntroduction\n2\nPercentage Splitting Technique\nThe percentage splitting method involves breaking down a percentage into smaller, man-\nageable parts, calculating each part, and summing the results.\n2.1\nSteps\n1. Identify the whole value and the percentage to calculate.\n2. Break the percentage into simpler parts (e.g., 50%, 25%, 10%, 5%, 1%).\n3. Calculate each part based on the whole value.\n4. Sum the parts to ﬁnd the ﬁnal result.\n2.2\nCommon Splits\n• 100% = Whole value.\n• 50% = Whole ÷ 2.\n• 25% = 50% ÷ 2.\n• 10% = Who

In [20]:
# Number of Document objects the loader returned.
len(pages)

19

In [21]:
# Split the loaded pages into smaller chunks for embedding:
# chunk_size=400 with chunk_overlap=20 between neighbouring chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
docs=text_splitter.split_documents(pages)
# Display the resulting chunks; each keeps the metadata of its source page.
docs

[Document(metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'C:\\Users\\Admin\\Documents\\Aptitude\\percetnages_merged.pdf', 'file_path': 'C:\\Users\\Admin\\Documents\\Aptitude\\percetnages_merged.pdf', 'total_pages': 19, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-06-12T14:43:26+00:00', 'trapped': '', 'modDate': 'D:20250612144326Z', 'creationDate': '', 'page': 0}, page_content='1\nIntroduction\n2\nPercentage Splitting Technique\nThe percentage splitting method involves breaking down a percentage into smaller, man-\nageable parts, calculating each part, and summing the results.\n2.1\nSteps\n1. Identify the whole value and the percentage to calculate.\n2. Break the percentage into simpler parts (e.g., 50%, 25%, 10%, 5%, 1%).\n3. Calculate each part based on the whole value.'),
 Document(metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'C:\\Users\\Admin\\Documents\\Aptitude\\per

In [31]:
# Config is a project-local module; only Config.PGVECTOR_URL is used here.
from config import Config
# psycopg2 is not referenced in this cell.
# NOTE(review): presumably imported only to confirm the Postgres driver is
# installed — confirm, or drop the import.
import psycopg2

# Step 3: Store in NeonDB using pgvector
# Builds a PGVector store from the chunks in `docs`, embedding them with
# `embeddings` (both defined in earlier cells), under the collection name
# "my_pgvector_collection" at the connection string Config.PGVECTOR_URL.
# NOTE(review): `embeddings` is rebound by a later cell in this notebook, so the
# result depends on cell execution order — run this right after the Ollama cell.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm whether the collection should be cleared first.
vector_db = PGVector.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name="my_pgvector_collection",
    connection_string=Config.PGVECTOR_URL,
)


In [None]:

# End-to-end retrieval-augmented QA over one PDF:
#   load PDF -> split into chunks -> embed and store in Weaviate -> answer a
#   question with a Groq-hosted chat model through a RetrievalQA chain.
#
# NOTE(review): this cell rebinds `embeddings`, `loader` and `text_splitter`,
# which earlier cells also define with different settings. Re-running an earlier
# cell after this one will silently pick up these objects instead.

# Import required libraries
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Weaviate
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
import weaviate

# Load environment variables
load_dotenv()

# Initialize Weaviate client.
# WEAVIATE_URL falls back to a local instance; an API key is only sent when
# WEAVIATE_API_KEY is set.
weaviate_url = os.environ.get("WEAVIATE_URL", "http://localhost:8080")
weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")

if weaviate_api_key:
    client = weaviate.Client(
        url=weaviate_url,
        auth_client_secret=weaviate.AuthApiKey(weaviate_api_key)
    )
else:
    client = weaviate.Client(url=weaviate_url)

# Verify connection
print("Weaviate Connected:", client.is_ready())

# Initialize embeddings (computed client-side; Weaviate only stores the vectors)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load and process PDF.
# The path can be overridden with the PDF_PATH environment variable; the default
# is the original author's local file.
pdf_path = os.environ.get(
    "PDF_PATH",
    r"C:\Users\Admin\Documents\Aptitude\percetnages_merged.pdf",
)
loader = PyMuPDFLoader(pdf_path, extract_images=True)
documents = loader.load()

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
chunks = text_splitter.split_documents(documents)

# Create Weaviate schema.
# LangChain's Weaviate store writes every metadata key as its own top-level
# property next to the text, so the fields we want back at query time are
# declared individually. `page` is declared as `int` explicitly so it is not
# left to schema inference.
# NOTE(review): the remaining metadata keys (producer, format, ...) are not
# declared here and presumably rely on Weaviate's auto-schema — confirm it is
# enabled on the target instance.
schema = {
    "class": "Document",
    "properties": [
        {
            "name": "content",
            "dataType": ["text"]
        },
        {
            "name": "source",
            "dataType": ["text"]
        },
        {
            "name": "page",
            "dataType": ["int"]
        }
    ],
    # No server-side vectorizer: vectors are supplied by `embeddings`.
    "vectorizer": "none"
}

# Delete existing class if it exists.
# WARNING: this drops every object previously stored under "Document".
if client.schema.exists("Document"):
    client.schema.delete_class("Document")

# Create schema
client.schema.create_class(schema)

# Initialize Weaviate vector store.
# - attributes: properties fetched with each hit and exposed as doc.metadata;
#   they must match property names that actually exist on the class.
# - by_text=False: embed the query locally and search by vector. The default
#   (by_text=True) issues a nearText query, which needs a vectorizer module and
#   cannot work with "vectorizer": "none".
vectorstore = Weaviate(
    client=client,
    index_name="Document",
    text_key="content",
    embedding=embeddings,
    attributes=["source", "page"],
    by_text=False
)

# Add documents to vector store
vectorstore.add_documents(chunks)

# Initialize ChatGroq.
# NOTE(review): confirm this model id is still served by Groq; hosted model ids
# are periodically retired.
llm = ChatGroq(
    model="mixtral-8x7b-32768",
)

# Create RetrievalQA chain: the 3 most similar chunks are "stuffed" into a
# single prompt, and the chunks used are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True
)

# Example query
query = "How do you calculate 62.5% of 160 using percentage splitting?"
result = qa_chain({"query": query})

# Print results
print("\nQuery:", query)
print("Answer:", result["result"])
print("\nSource Documents:")
for doc in result["source_documents"]:
    # .get() keeps the listing usable even if a hit comes back without `page`.
    print(f"- Page {doc.metadata.get('page', 'n/a')}: {doc.page_content[:200]}...")
