In [1]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Path to your project folder
base_path = "/content/drive/MyDrive/invoice_reimbursement_system/data"


In [3]:
#Sub-paths
policy_path = os.path.join(base_path, "policy.pdf")
invoices_path = os.path.join(base_path, "invoices")

print("Policy PDF Path:", policy_path)
print("Invoices Folder Path:", invoices_path)

Policy PDF Path: /content/drive/MyDrive/invoice_reimbursement_system/data/policy.pdf
Invoices Folder Path: /content/drive/MyDrive/invoice_reimbursement_system/data/invoices


In [4]:
!pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import pdfplumber
import os


In [6]:
# List all invoice PDFs
invoice_files = [f for f in os.listdir(invoices_path) if f.endswith(".pdf")]

print("Found Invoice PDFs:")
for file in invoice_files:
    print("-", file)


Found Invoice PDFs:
- Book-cab-03.pdf
- Book-cab-01 (1).pdf
- Book-cab-02.pdf
- Book-cab-04.pdf
- Book-cab-05.pdf
- Book-cab-06.pdf
- Book-cab-09.pdf
- Book-cab-10.pdf
- Book-cab-07.pdf
- Book-cab-08.pdf
- Meal Invoice 3.pdf
- Meal Invoice 1.pdf
- Meal Invoice 2.pdf
- Meal Invoice 5.pdf
- Meal Invoice 4.pdf
- Meal Invoice 8.pdf
- Meal Invoice 6.pdf
- Meal Invoice 7.pdf
- Book 2.pdf
- Book 6 (1).pdf
- Book 5.pdf
- Book 3.pdf
- Book 7.pdf
- Book 4.pdf
- Book 1.pdf
- Book 8.pdf
- LTA Bill Template 1.pdf
- Book 9.pdf


In [7]:
def extract_text_from_pdf(pdf_path):
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            full_text += page.extract_text() + "\n"
    return full_text

# Extract policy text
policy_text = extract_text_from_pdf(policy_path)
print("✅ Extracted Policy Text (first 4000 chars):\n")
print(policy_text[:4000])  # Preview




✅ Extracted Policy Text (first 4000 chars):

Company Name: IAI Solution
Policy Title: Employee Reimbursement Policy
Version: 1.0
1. Purpose
The purpose of this policy is to outline the guidelines and procedures for the reimbursement of
expenses incurred by employees while performing work-related duties. This policy ensures
transparency and consistency in the reimbursement process.
2. Scope
This policy applies to all employees of IAI Solution who incur expenses in the course of their
work duties.
3. Reimbursement Categories
The following categories of expenses are eligible for reimbursement under this policy:
● Food and Beverages
● Travel Expenses
● Accommodations
4. General Guidelines
● All reimbursements must be supported by original receipts and submitted within 30 days
of the expense incurred.
● Employees must complete the reimbursement request form and submit it along with the
required documentation to the HR department.
5. Specific Expense Guidelines
5.1 Food and Beverages
● Eligi

In [8]:
# Dictionary to store invoice text
invoice_texts = {}

for file_name in invoice_files:
    file_path = os.path.join(invoices_path, file_name)
    text = extract_text_from_pdf(file_path)
    invoice_texts[file_name] = text

print(f"\n✅ Extracted text from {len(invoice_texts)} invoice PDFs.")
print("\nSample Invoice Text (first 500 chars):\n")
print(invoice_texts[invoice_files[0]][:500])  # Preview





✅ Extracted text from 28 invoice PDFs.

Sample Invoice Text (first 500 chars):

Original Tax Invoice
Driver Trip Invoice
SanjayK ServiceTaxCategory:RentingofCab
Car
2FA1622
InvoiceID3971221 InvoiceDate19Sep2024
CustomerNameSeema MobileNumber8901233212
PickupAddress #12,2ndLayoutHebbala
Description Amount(₹)
RideFee ₹141
TollConveniencefee ₹0
AirportCharges ₹0
CGST9.00% 0
SGST9.00% 0
Subtotal ₹141
₹ 141
Total
CustomerRide
Fare



In [9]:
# Save texts to local files (in Colab)
with open("policy_text.txt", "w") as f:
    f.write(policy_text)

for fname, content in invoice_texts.items():
    with open(f"invoice_{fname}.txt", "w") as f:
        f.write(content)


In [10]:
# Save all extracted invoice texts as individual .txt files
os.makedirs("extracted_invoices", exist_ok=True)

for fname, content in invoice_texts.items():
    file_name = os.path.splitext(fname)[0] + ".txt"
    with open(os.path.join("extracted_invoices", file_name), "w") as f:
        f.write(content)

print("Saved extracted invoice texts to /extracted_invoices/")


Saved extracted invoice texts to /extracted_invoices/


In [11]:
!pip install -q google-generativeai


In [16]:
import os
import google.generativeai as genai

# Set API key correctly
os.environ["GEMINI_API_KEY"] = "AIzaSyAgmicbSK8OpFlmOMNPD0bWW9RYBSfIR50"

# Configure Gemini SDK
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Create model instance
model = genai.GenerativeModel("gemini-1.5-flash")  # You can also try "gemini-pro" if needed

# Example usage
def analyze_invoice_with_policy(policy_text, invoice_text):
    prompt = f"""
You are a reimbursement policy expert.
Policy: {policy_text}

Invoice:
{invoice_text}

Return exactly:
Reimbursement Status: <Fully Reimbursed | Partially Reimbursed | Declined>
Reason: <One-sentence justification>
"""
    response = model.generate_content(prompt)
    return response.text.strip()

# Test
result = analyze_invoice_with_policy(
    "Meals under ₹1000 are reimbursable.",
    "Invoice for a business dinner totaling ₹1200."
)
print(result)


Reimbursement Status: Partially Reimbursed
Reason: Only ₹1000 of the ₹1200 business dinner expense is reimbursable per policy.


In [17]:
import os
import json
import threading
import pdfplumber
import google.generativeai as genai

# -- Setup Google Gemini --
os.environ["GEMINI_API_KEY"] = "AIzaSyAgmicbSK8OpFlmOMNPD0bWW9RYBSfIR50"
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

# -- PDF Text Extraction --
def extract_text_from_pdf(pdf_path):
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            full_text += page.extract_text() + "\n"
    return full_text

# -- Load Policy --
policy_path = "/content/drive/MyDrive/invoice_reimbursement_system/data/policy.pdf"
policy_text = extract_text_from_pdf(policy_path)
print(" Extracted Policy Text\n")

# -- Load Invoices --
invoices_path = "/content/drive/MyDrive/invoice_reimbursement_system/data/invoices"
invoice_files = os.listdir(invoices_path)
invoice_texts = {
    file_name: extract_text_from_pdf(os.path.join(invoices_path, file_name))
    for file_name in invoice_files
}
print(f" Extracted text from {len(invoice_texts)} invoice PDFs.")

# -- Gemini Analysis Function --
def analyze_invoice(file_name, invoice_text, results_dict):
    try:
        prompt = f"""
You are a reimbursement policy expert.

Policy:
{policy_text}

Invoice:
{invoice_text}

Return exactly:
Reimbursement Status: <Fully Reimbursed | Partially Reimbursed | Declined>
Reason: <One-sentence justification>
"""
        response = model.generate_content(prompt)
        results_dict[file_name] = response.text.strip()
        print(f"{file_name} analyzed.")
    except Exception as e:
        print(f" {file_name} failed: {e}")
        results_dict[file_name] = f"Error: {str(e)}"

# -- Run Analysis with Threads --
results = {}
threads = []

for file_name, text in invoice_texts.items():
    t = threading.Thread(target=analyze_invoice, args=(file_name, text, results))
    threads.append(t)
    t.start()

for t in threads:
    t.join()

# -- Save Results --
with open("invoice_analysis_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("\n Results saved to invoice_analysis_results.json")




 Extracted Policy Text





 Extracted text from 28 invoice PDFs.
Book-cab-03.pdf analyzed.
Book-cab-01 (1).pdf analyzed.
Book-cab-02.pdf analyzed.
Book-cab-04.pdf analyzed.
Book-cab-05.pdf analyzed.
Book-cab-06.pdf analyzed.
Book-cab-09.pdf analyzed.
Book-cab-10.pdf analyzed.
Book-cab-07.pdf analyzed.
Book-cab-08.pdf analyzed.




Meal Invoice 3.pdf analyzed.




Meal Invoice 1.pdf analyzed.




Meal Invoice 2.pdf analyzed.




Meal Invoice 5.pdf analyzed.




Meal Invoice 4.pdf analyzed.




Meal Invoice 8.pdf analyzed.




 Meal Invoice 6.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Meal Invoice 7.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 2.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 6 (1).pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 5.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 3.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 7.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 4.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 1.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 8.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 LTA Bill Template 1.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.




 Book 9.pdf failed: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

 Results saved to invoice_analysis_results.json


In [12]:
pip install chromadb


Collecting chromadb
  Downloading chromadb-1.0.12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.7.0-py3-none-any.whl.metadata (5.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  Downloading opentelemetry_instrumentation_fastapi-0.55b1-py3-none-any.whl.metadata (2.2 kB)
Collecting opentelemetry-sdk>=1.2.0 (from c

In [13]:
!pip install sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [18]:
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Load local model
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
# Imports
import os
import json
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Load local embedding model
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Paths
invoices_path = "/content/drive/MyDrive/invoice_reimbursement_system/data/invoices"
results_path = "/content/invoice_analysis_results.json"
export_path = "/content/invoice_embeddings_export.json"  # File to save embedded data

# Load results
with open(results_path, "r") as f:
    analysis_results = json.load(f)

# Setup ChromaDB with SentenceTransformer
client = chromadb.Client()
collection = client.get_or_create_collection(name="invoice_embeddings", embedding_function=embedding_function)

# Helper to extract metadata
def extract_metadata(file_name, result_text):
    lines = result_text.splitlines()
    status = reason = ""
    for line in lines:
        if "Reimbursement Status" in line:
            status = line.split(":")[1].strip()
        elif "Reason" in line:
            reason = line.split(":", 1)[1].strip()
    return {
        "file_name": file_name,
        "employee_name": file_name.split("_")[0],  # e.g., Rahul_invoice1.pdf → Rahul
        "status": status,
        "reason": reason,
        "invoice_date": "2024-01-01"
    }

# Store all vectors and collect for export
export_data = []

for file_name in os.listdir(invoices_path):
    file_path = os.path.join(invoices_path, file_name)
    with open(file_path, "rb") as f:
        text = f.read().decode(errors="ignore")

    if file_name in analysis_results:
        result_text = analysis_results[file_name]
        metadata = extract_metadata(file_name, result_text)

        vector_input = text + "\n" + result_text
        collection.add(
            documents=[vector_input],
            metadatas=[metadata],
            ids=[file_name]
        )

        # Save for export
        export_data.append({
            "id": file_name,
            "text": vector_input,
            "metadata": metadata
        })

# Write export to file
with open(export_path, "w") as f:
    json.dump(export_data, f, indent=2)

print(" All invoices embedded and stored in ChromaDB.")
print(f"Embeddings also exported to: {export_path}")


 All invoices embedded and stored in ChromaDB.
Embeddings also exported to: /content/invoice_embeddings_export.json


In [20]:
# Imports
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Load model and initialize DB
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
client = chromadb.Client()
collection = client.get_or_create_collection(name="invoice_embeddings", embedding_function=embedding_function)

def search_invoices(query, filters=None, top_k=5):
    """
    Perform vector similarity search on invoice embeddings with optional metadata filtering.

    Args:
        query (str): Search query in natural language.
        filters (dict, optional): Metadata filters. E.g., {"employee_name": "Rahul", "status": "Rejected"}.
        top_k (int): Number of top results to retrieve.

    Returns:
        list of str: Markdown-formatted results.
    """
    query_args = {
        "query_texts": [query],
        "n_results": top_k
    }

    # Format filters correctly using ChromaDB's syntax
    if filters:
        if len(filters) == 1:
            query_args["where"] = filters  # Single filter is valid
        else:
            # Wrap multiple filters in $and
            and_filters = [{k: v} for k, v in filters.items()]
            query_args["where"] = {"$and": and_filters}

    # Perform the query
    results = collection.query(**query_args)

    # Format as markdown
    formatted_results = []
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        md = f"###  Invoice: `{meta.get('file_name', 'Unknown')}`\n"
        md += f"-  **Employee**: {meta.get('employee_name', 'Unknown')}\n"
        md += f"-  **Date**: {meta.get('invoice_date', 'Unknown')}\n"
        md += f"-  **Status**: {meta.get('status', 'Unknown')}\n"
        md += f"-  **Reason**: {meta.get('reason', 'Not specified')}\n"
        md += f"\n---\n**Snippet:**\n```\n{doc[:500]}...\n```\n"
        formatted_results.append(md)

    return formatted_results


# Example 1: Query without filters
results = search_invoices("Why was Rahul's cab reimbursement partially rejected?")
for r in results:
    print(r)

# Example 2: Query with metadata filters
results = search_invoices(
    "cab invoices",
    filters={"employee_name": "Rahul", "status": "Partially Reimbursed"}
)
for r in results:
    print(r)


###  Invoice: `Book-cab-03.pdf`
-  **Employee**: Book-cab-03.pdf
-  **Date**: 2024-01-01
-  **Status**: Partially Reimbursed
-  **Reason**: The invoice amount of ₹141 exceeds the daily office cab allowance of ₹150; therefore, only ₹150 will be reimbursed.

---
**Snippet:**
```
%PDF-1.4
%
1 0 obj
<</Title (Book-cab-03)
/Producer (Skia/PDF m132 Google Docs Renderer)>>
endobj
3 0 obj
<</ca 1
/BM /Normal>>
endobj
8 0 obj
<</CA 1
/ca 1
/LC 0
/LJ 0
/LW 1.33333337
/ML 10
/SA true
/BM /Normal>>
endobj
9 0 obj
<</Filter /FlateDecode
/Length 2062>> stream
xZn#}WHYE X`$$@y9ud5{^b[ݖIyZt:)UϸH_~W9S{#;SV;o^Yܿ",-xM΅YC2BfuX]$J&-0S2>$[6MFR`$sn4]	PY*k-ݒXıdDfqd[h*kE1c^y+q	CԖjaܲUI\0>1J5OM]QK-."5Wk%
Q89v e4hgQF2ՉJXb-inᢛťW5	Q@#l:AVv^d$hgreh-ZZƩZ,9yE.Ž...
```

###  Invoice: `Book-cab-01 (1).pdf`
-  **Employee**: Book-cab-01 (1).pdf
-  **Date**: 2024-01-01
-  **Status**: Partially Reimbursed
-  **Reason**: The invoice total of ₹233 exceeds the daily office cab allowance 

In [14]:
pip install transformers sentence-transformers chromadb




In [21]:
# Imports
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from transformers import pipeline

# Initialize SentenceTransformer for embeddings
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
client = chromadb.Client()
collection = client.get_or_create_collection(
    name="invoice_embeddings", embedding_function=embedding_function
)

# Vector Search Function (already implemented)
def search_invoices(query, filters=None, top_k=5):
    query_args = {
        "query_texts": [query],
        "n_results": top_k
    }
    if filters:
        query_args["where"] = {"$and": [{"employee_name": filters["employee_name"]}, {"status": filters["status"]}]}

    results = collection.query(**query_args)

    formatted_results = []
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        md = f"###  Invoice: `{meta['file_name']}`\n"
        md += f"- **Employee**: {meta['employee_name']}\n"
        md += f"- **Date**: {meta['invoice_date']}\n"
        md += f"- **Status**: {meta['status']}\n"
        md += f"- **Reason**: {meta['reason']}\n"
        md += f"\n---\n**Snippet:**\n```\n{doc[:500]}...\n```\n"
        formatted_results.append(md)

    return formatted_results

# ✅ Local HuggingFace RAG Pipeline
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

def rag_chatbot_local(query, top_k=3):
    """
    Local Retrieval-Augmented Generation (RAG) chatbot using Hugging Face's flan-t5-base.
    """
    retrieved_docs = search_invoices(query, top_k=top_k)
    context = "\n\n".join(retrieved_docs)

    prompt = f"""You are an expert in invoice reimbursement policies.
Based on the invoice data below, answer the user's question briefly.

--- INVOICE DATA START ---
{context}
--- INVOICE DATA END ---

Question: {query}
Answer:"""

    result = qa_pipeline(prompt, max_new_tokens=200)
    return result[0]['generated_text']

# ✅ Example usage
query = "Why was Rahul's cab reimbursement partially rejected?"
answer = rag_chatbot_local(query)
print("💬 Answer:\n", answer)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1371 > 512). Running this sequence through the model will result in indexing errors


💬 Answer:
 The invoice amount of 141 exceeds the daily office cab allowance of 150; therefore, only 150 will be reimbursed. --- **Snippet:**  %PDF-1.4 % 1 0 obj /Title (Book 6) /Producer (Skia/PDF m132 Google Docs Renderer)>> endobj 3 0 obj /ca 1 /BM /Normal>> endobj 7 0 obj /CA 1 /ca 1 /LC 0 /LJ 0 /LW 1.33333337 /ML 10 /SA true /BM /Normal>> endobj 8 0 obj /CA 1 /ca 1 /LC 0 /LJ


In [15]:
!pip install fastapi uvicorn nest-asyncio pyngrok chromadb sentence-transformers transformers


Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.11-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.11


In [None]:
#new FASTAPI setup

In [27]:
# ✅ Step 1: Add Ngrok Auth Token
!ngrok config add-authtoken 2yMN4GMKS6s6c4qH5iSCm2ieiIR_3KGhaEMmUbndyr7ZjVPe4

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [28]:
from huggingface_hub import login

# Paste your Hugging Face token here
login("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")


In [26]:
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from pyngrok import ngrok
import nest_asyncio
import uvicorn
import shutil
import os

#  Step 3: Load RAG/LLM Components
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load Mistral model
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", device_map="auto", trust_remote_code=True)
model.eval()

#  ChromaDB
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
client = Client()
collection = client.get_or_create_collection(name="invoice_embeddings", embedding_function=embedding_function)

# storage function
def embed_and_store(text, metadata):
    collection.add(documents=[text], metadatas=[metadata], ids=[metadata["filename"]])

# search function
def search_similar(query, top_k=1):
    results = collection.query(query_texts=[query], n_results=top_k)
    docs = results["documents"][0]
    metas = results["metadatas"][0]
    return [{"text": docs[i], "metadata": metas[i]} for i in range(len(docs))]

# Same answer generator
def generate_answer(context, query):
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    output = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(output[0], skip_special_tokens=True).split("Answer:")[-1].strip()

# Step 4: FastAPI App
app = FastAPI()

@app.post("/upload/")
async def upload_file(file: UploadFile = File(...)):
    contents = await file.read()
    text = contents.decode("utf-8")

    embed_and_store(text, {"filename": file.filename})
    return {"status": "File processed and embedded", "filename": file.filename}

@app.post("/ask/")
async def ask_question(query: str):
    results = search_similar(query)
    if not results:
        return JSONResponse(status_code=400, content={"error": "No relevant documents found."})

    context = results[0]["text"]
    answer = generate_answer(context, query)
    return {"query": query, "answer": answer, "source": results[0]["metadata"]}


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



In [29]:
# Nest asyncio loop
nest_asyncio.apply()

# Kill old tunnels if any
ngrok.kill()

# Open ngrok tunnel to the FastAPI app
public_url = ngrok.connect(8000)
print(f"🚀 Your public FastAPI URL: {public_url}")
print(f"📄 Swagger Docs UI: {public_url}/docs")

# Run FastAPI app in current process (Colab-friendly)
uvicorn.run(app, host="0.0.0.0", port=8000)

🚀 Your public FastAPI URL: NgrokTunnel: "https://6da0-34-145-164-198.ngrok-free.app" -> "http://localhost:8000"
📄 Swagger Docs UI: NgrokTunnel: "https://6da0-34-145-164-198.ngrok-free.app" -> "http://localhost:8000"/docs


INFO:     Started server process [1726]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     223.226.174.98:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     223.226.174.98:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1726]


In [None]:
# 👉 https://42eb-34-71-196-200.ngrok-free.app/docs