**Step 1: Extract Page-Level Content from PDF**





In [5]:
!pip install PyPDF2



In [8]:
from PyPDF2 import PdfReader

reader = PdfReader("/content/Test Blob File.pdf")
pages = [page.extract_text() for page in reader.pages]
doc_pages = [{"page_num": i, "text": p} for i, p in enumerate(pages)]
doc_pages

[{'page_num': 0,
  'text': 'Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.\nFee Details and Summary\nApplicants: Application No:\nDate Prepared:\nLoan Program:Prepared By:\nTHIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees W orksheet" is provided for informational purposes ONLY, to assist\nyou in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage \npayment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed.\nTotal Loan Amount:  Interest Rate: Term/Due In:\nFee Paid To Paid By (Fee Split**) Amount PFC / F / POC\nTOTAL ESTIMATED FUNDS NEEDED TO CLOSE: TOTAL ESTIMATED MONTHLY PAYMENT:\nTotal Estimated Funds Total Monthly PaymentPurchase Price (+)\nAlterations (+)\nLand (+)\nRefi (incl. debts to be paid off) (+)\nEst. Prepaid Items/Reserves (+)\nEst. Closing Costs (+)Loan Amount (-) Principal & Interest\nOther Financin

**Step 2: Write the "Same Document?" Function with RAG**

In [9]:
def gemini_model(prompt):
    import google.generativeai as genai

    genai.configure(api_key="AIzaSyA1gxhmL5t-RzooqY2VKy3B6AsZUNdKqx0")

    model = genai.GenerativeModel("models/gemini-2.0-flash")
    response = model.generate_content(prompt)

    return response.text


In [10]:
def is_same_document(prev_text, curr_text, doc_type=None):
    prompt = f"""
    You are checking whether two pages belong to the same document.
    Previous page type: {doc_type or 'unknown'}

    Previous Page:
    {prev_text}

    Current Page:
    {curr_text}

    Answer ONLY 'Yes' or 'No'. Do NOT explain.
    """
    response = gemini_model(prompt)  # Swap in LLM call
    return response.strip().lower().startswith("yes")


prev_text = doc_pages[2]["text"]
curr_text = doc_pages[0]["text"]
doc_type = "Resume"  # Optional, can be "unknown" or None

is_same_document(prev_text, curr_text, doc_type)


False

**Step 3: Write the Document Type Classifier**

In [11]:
def classify_document_type(text):
    prompt = f"""
    This is the start of a new document. Based on the content, classify it.

    Page Content:
    {text}

    Choose from: Resume, Contract, Lender Fee Sheet, ID, PaySlip, Other.
    Just respond with the type.
    """
    response = gemini_model(prompt).strip().lower().replace(".", "")
    result = response.title() # Capitalize the first letter of each word
    return result

classify_document_type(doc_pages[3]["text"])

'Contract'

**Step 4: Loop Through Pages and Generate Page-Level Metadata**

In [12]:
results = []
current_doc_type = None
doc_counter = 0

for i, page in enumerate(doc_pages):
    if i == 0:
        current_doc_type = classify_document_type(page["text"])
    else:
        prev_text = doc_pages[i - 1]["text"]
        same = is_same_document(prev_text, page["text"], current_doc_type)
        if not same:
            doc_counter += 1
            current_doc_type = classify_document_type(page["text"])

    results.append({
        "page": i,
        "doc_id": doc_counter,
        "doc_type": current_doc_type
    })


for r in results:
    print(r)

{'page': 0, 'doc_id': 0, 'doc_type': 'Lender Fee Sheet'}
{'page': 1, 'doc_id': 1, 'doc_type': 'Payslip'}
{'page': 2, 'doc_id': 2, 'doc_type': 'Contract'}
{'page': 3, 'doc_id': 2, 'doc_type': 'Contract'}
{'page': 4, 'doc_id': 2, 'doc_type': 'Contract'}
{'page': 5, 'doc_id': 2, 'doc_type': 'Contract'}
{'page': 6, 'doc_id': 2, 'doc_type': 'Contract'}


**Step 5: Visualize Results**

In [13]:
import pandas as pd

df = pd.DataFrame(results)
df.head()

Unnamed: 0,page,doc_id,doc_type
0,0,0,Lender Fee Sheet
1,1,1,Payslip
2,2,2,Contract
3,3,2,Contract
4,4,2,Contract


# P7.2

In [7]:
#Download the necessary packages
!pip install llama-index
!pip install llama-index-readers-file
!pip install llama-index llama-index-embeddings-huggingface transformers sentence-transformers
!pip install llama-index-embeddings-huggingface

Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.6.1-py3-none-any.whl.metadata (458 bytes)
Downloading llama_index_embeddings_huggingface-0.6.1-py3-none-any.whl (8.9 kB)
Installing collected packages: llama-index-embeddings-huggingface
Successfully installed llama-index-embeddings-huggingface-0.6.1


^C


**1. Loading a multi-page PDF**

In [1]:
from llama_index.readers.file import PDFReader

loader = PDFReader()
pages = loader.load_data("/content/DHRUVIN DUNGRANI_RESUME.pdf")  # Returns one Document per page

# Print preview
print(f"Loaded {len(pages)} pages")
print(pages[0].text[:300])

Loaded 1 pages
DHRUVIN DUNGRANI 
Mumbai 400067, India| dhruvindungrani@gmail.com | LinkedIn 
 
EDUCATION 
School of Business Management, NMIMS                                                                                                                       July 2025 – April 2027 
MBA Business Analytics  
D.J. 


**2. Adding metadata like doc_type and page_number**

In [2]:
documents = []
for i, doc in enumerate(pages):
    doc.metadata = {
        "page_number": i + 1,
        "source_file": "sample_blob.pdf"
    }
    documents.append(doc)


In [3]:
doc_type_array = ["Resume", "Resume", "Lender Fees", "Contract", "Contract"]

for doc, doc_type in zip(documents, doc_type_array):
    doc.metadata["doc_type"] = doc_type


In [4]:
for doc in documents[:2]:
    print(doc.metadata)
    print(doc.text[:150], "\n---\n")

{'page_number': 1, 'source_file': 'sample_blob.pdf', 'doc_type': 'Resume'}
DHRUVIN DUNGRANI 
Mumbai 400067, India| dhruvindungrani@gmail.com | LinkedIn 
 
EDUCATION 
School of Business Management, NMIMS                        
---



**3. Indexing and retrieving with metadata filters**

In [5]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Use a local embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

# Build a vector index enriched with metadata (doc_type, page_number, source_file)
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

print("Metadata-enriched index created with", len(documents), "documents.")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Metadata-enriched index created with 1 documents.


In [7]:
# Query the full index
retriever = index.as_retriever()
query = "What information is provided on the employee's payslip?"
all_results = retriever.retrieve(query)

# Filter results using metadata (Lambda-style filtering)
filtered_results = [
    r for r in all_results if r.metadata.get("doc_type") == "Contract"
]

# Display the filtered results
for i, r in enumerate(filtered_results):
    print(f"--- Result {i+1} ---")
    print(r.text[:500])
    print("Metadata:", r.metadata)
    print("\n")
