In [None]:
import fitz 
import chromadb
import numpy as np
import os
import tkinter as tk
from tkinter import scrolledtext
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Source document whose text gets indexed.
PDF_FILE = "ICC_handbook.pdf"

# Directory handed to the Chroma persistent client.
DB_PATH = "./chroma_db"

# Name of the sentence-transformers model used for all embeddings.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"


In [None]:
# Create the Chroma client rooted at DB_PATH, then fetch the "legal_texts"
# collection, creating it if it does not exist yet. `collection` is the
# module-level handle that store_embeddings() and query_chroma() operate on.
chroma_client = chromadb.PersistentClient(path=DB_PATH)
collection = chroma_client.get_or_create_collection(name="legal_texts")


In [None]:
# Load the embedding model named by EMBEDDING_MODEL once; the same instance is
# used to encode both the stored chunks and every incoming query.
model = SentenceTransformer(EMBEDDING_MODEL)

# Extract Data from PDF

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        str: The text of each page, in page order, joined with newlines.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)


# Split Text into Chunks

In [None]:
def split_text(text, chunk_size=500, chunk_overlap=100):
    """Break ``text`` into overlapping pieces ready to be embedded.

    Args:
        text: The full string to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_text(text)

# Store and Query Embeddings in ChromaDB

In [None]:
def store_embeddings(chunks, batch_size=500):
    """Embed text chunks and add them to the Chroma collection in batches.

    Each chunk is stored under its positional index (as a string) as the ID,
    together with the chunk text and its embedding vector.

    Args:
        chunks: Iterable of text chunks to embed and store.
        batch_size: Maximum number of chunks sent to ``collection.add`` per
            call. Batching avoids one database round-trip per chunk while
            keeping each request bounded in size.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    chunks = list(chunks)
    if not chunks:
        # Nothing to encode or store; do not report a misleading success.
        print("No text chunks to store.")
        return

    embeddings = model.encode(chunks, show_progress_bar=True)
    for start in range(0, len(chunks), batch_size):
        stop = min(start + batch_size, len(chunks))
        collection.add(
            ids=[str(index) for index in range(start, stop)],
            documents=chunks[start:stop],
            # Convert each vector to a plain list, as the per-chunk code did.
            embeddings=[vector.tolist() for vector in embeddings[start:stop]],
        )
    print("Embeddings stored successfully!")


In [None]:
def query_chroma(user_input, top_k=3):
    """Look up the stored chunks closest to ``user_input``.

    Args:
        user_input: Query text; it is encoded with the shared embedding model.
        top_k: Number of results requested from the collection.

    Returns:
        The value under the ``"documents"`` key of the collection's query
        result. NOTE(review): Chroma groups results per query embedding, so
        this is presumably a one-element list wrapping the list of matched
        chunks - confirm before iterating it directly.
    """
    query_vector = model.encode(user_input).tolist()
    matches = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
    )
    return matches["documents"]

In [None]:
# Build the index only when the collection is still empty (or DB_PATH is
# missing); otherwise reuse the embeddings that are already stored.
if os.path.exists(DB_PATH) and collection.count():
    print("ChromaDB already has embeddings. Ready to query.")
else:
    print("Database not found! Extracting and storing embeddings...")
    pdf_text = extract_text_from_pdf(PDF_FILE)
    text_chunks = split_text(pdf_text)
    store_embeddings(text_chunks)


# Search Callback for the GUI

In [None]:
def search_legal_text():
    """Run the query typed into ``entry`` and show the matches in ``output_box``.

    Bound to the Search button. Reads the module-level ``entry`` widget, calls
    ``query_chroma`` and rewrites the read-only ``output_box`` with a numbered
    list of the matching text chunks, a "no results" message, or a prompt to
    type a query when the field is blank.
    """
    user_query = entry.get().strip()

    documents = []
    if user_query:
        # query_chroma returns Chroma's "documents" value, which nests the
        # matches one list per query embedding. Flatten it so every chunk gets
        # its own numbered entry; plain strings are accepted as well.
        for item in query_chroma(user_query) or []:
            if isinstance(item, (list, tuple)):
                documents.extend(item)
            else:
                documents.append(item)

    output_box.config(state=tk.NORMAL)  # Enable editing
    try:
        output_box.delete("1.0", tk.END)  # Clear previous output
        if not user_query:
            output_box.insert(tk.END, "Please enter a legal query.")
        elif documents:
            output_box.insert(tk.END, "Top Relevant Sections:\n\n")
            for position, document in enumerate(documents, start=1):
                output_box.insert(tk.END, f"{position}. {document}\n\n{'='*50}\n\n")
        else:
            output_box.insert(tk.END, "No relevant legal text found.")
    finally:
        # Always return the box to read-only, even if an insert fails.
        output_box.config(state=tk.DISABLED)


# Tkinter GUI

In [None]:
# Main application window.
root = tk.Tk()
root.title("Legal Text Classifier")
root.geometry("700x500")

# Widgets are packed top-to-bottom in creation order: prompt label, query
# field, search button, then the results area.
label = tk.Label(root, text="Enter Legal Query:", font=("Arial", 12))
label.pack(pady=5)

# `entry` is read by search_legal_text() to obtain the user's query.
entry = tk.Entry(root, width=60, font=("Arial", 12))
entry.pack(pady=5)

# Clicking the button runs search_legal_text().
search_button = tk.Button(root, text="Search", command=search_legal_text, font=("Arial", 12))
search_button.pack(pady=5)

# `output_box` is written by search_legal_text(); it is kept disabled between
# updates so the user cannot edit the results.
output_box = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=80, height=20, font=("Arial", 10))
output_box.pack(pady=5)
output_box.config(state=tk.DISABLED)  # Make it read-only

# Start the Tk event loop.
root.mainloop()