# Install Required Packages

In [None]:
!pip install faiss-cpu PyPDF2 nltk sentence-transformers google-generativeai


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.8/23.8 MB[0m [31m85.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, faiss-cpu
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.13.2


# Import Modules and API Configuration

In [None]:
import os
import re
import faiss
import nltk
import PyPDF2
import numpy as np
import google.generativeai as genai
from google.colab import userdata, files
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer # Fixed import syntax

# Fetch the NLTK stopword corpus and keep the English stopwords as a set.
# `stop_words` is not read in this cell; presumably it is consumed by
# text-cleaning code defined elsewhere in the notebook.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Configure Gemini: read the secret named GOOGLE_API_KEY through Colab's
# `userdata`, strip surrounding whitespace, and create the module-level
# `gemini_model` handle that later cells call.
try:
    api_key = userdata.get("GOOGLE_API_KEY").strip()
    genai.configure(api_key=api_key)
    gemini_model = genai.GenerativeModel("gemini-2.5-flash")
    print("Gemini Configured Successfully.")
except Exception as e:
    # NOTE(review): a failure is only printed, so `gemini_model` may be left
    # undefined (or stale from an earlier run) and later cells must cope with that.
    print(f"Error configuring Gemini: {e}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Gemini Configured Successfully.


# Load & Preprocess PDF Documents from Google Drive

In [None]:
import os
import glob
import PyPDF2
from google.colab import drive

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define path to your folder
# NOTE: Make sure the folder 'Legal Policy' is in your 'My Drive'
folder_path = '/content/drive/My Drive/Legal Policy'

# 3. Read PDFs directly from the folder.
# `documents[i]` holds the extracted text of the file named in `sources[i]`.
documents = []
sources = []

print(f"Reading files from: {folder_path}")

# Check if folder exists
if not os.path.exists(folder_path):
    print(f"ERROR: Folder not found at {folder_path}")
    print("Please check if the folder name matches exactly.")
else:
    # Sorted so the document order (and therefore the order of anything built
    # from `documents` later) is the same on every run; glob order is not.
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))

    if not pdf_files:
        print("No PDF files found in the folder.")

    # The optional preprocessing helpers are looked up once. If they are
    # missing, say so instead of silently loading uncleaned text.
    has_preprocessors = 'mark_legal_units' in globals() and 'clean_text' in globals()
    if pdf_files and not has_preprocessors:
        print("WARNING: 'mark_legal_units' / 'clean_text' are not defined; "
              "documents will be loaded without cleaning.")

    for file_path in pdf_files:
        # Bound before the try block so the error message below can always use it.
        filename = os.path.basename(file_path)
        try:
            print(f"Loading: {filename}...")

            reader = PyPDF2.PdfReader(file_path)
            page_texts = []
            for page in reader.pages:
                # Extract once per page; extraction is the expensive step.
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
            # Join pages with a newline so the last word of one page is not
            # fused with the first word of the next.
            text = "\n".join(page_texts)

            # Apply cleaning/marking functions immediately (when available)
            if has_preprocessors:
                text = mark_legal_units(text)
                text = clean_text(text)

            documents.append(text)
            sources.append(filename)  # Store filename as source

        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Failed to read {filename}: {e}")

    print(f"\nSuccessfully loaded {len(documents)} documents.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading files from: /content/drive/My Drive/Legal Policy
Loading: NEP_Final_English_0.pdf...
Loading: Constitution of India.pdf...
Loading: IPC.pdf...
Loading: the_code_of_criminal_procedure,_1973.pdf...
Loading: rti_act_2005.pdf...
Loading: it_act_2000_updated.pdf...
Loading: consumer protection act.pdf...

Successfully loaded 7 documents.


# Chunking, Embeddings & FAISS Index Construction

In [None]:
# Load the sentence-embedding model ('all-MiniLM-L6-v2'); `embedder.encode`
# is what turns chunk text into the vectors stored in the FAISS index.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def better_chunking(natural_text):
    """Split a document into heading-anchored chunks.

    The text is cut at every ``ARTICLE <n>`` / ``SECTION <n>`` marker
    (case-insensitive). Each marker is re-attached to the text that follows
    it, and the pair is kept only if it has more than 20 whitespace-separated
    words. Text that precedes the first marker is not returned.

    Returns:
        A list of dicts with two keys: ``"raw_text"`` (marker + following
        text) and ``"search_text"`` (``clean_text`` applied to that string).
    """
    # With one capturing group, re.split yields
    # [preamble, marker, body, marker, body, ...].
    pieces = re.split(r"(ARTICLE\s+\d+|SECTION\s+\d+)", natural_text, flags=re.IGNORECASE)
    markers = pieces[1::2]
    bodies = pieces[2::2]

    chunks = []
    for marker, body in zip(markers, bodies):
        unit = marker + body
        if len(unit.split()) <= 20:
            # Too short to be a useful retrieval unit.
            continue
        chunks.append({"raw_text": unit, "search_text": clean_text(unit)})
    return chunks

# --- REBUILD INDEX ---
# `all_chunk_data[i]` is a chunk dict from better_chunking() and
# `chunk_sources[i]` is the file name it came from; both lists are built in
# the same order as the vectors added to the index.
all_chunk_data = []
chunk_sources = []

# Ensure 'documents' and 'sources' exist from your file upload cell
if 'documents' in globals() and 'sources' in globals():
    for doc, src in zip(documents, sources):
        doc_chunks = better_chunking(doc)
        all_chunk_data.extend(doc_chunks)
        chunk_sources.extend([src] * len(doc_chunks))

    # Generate Embeddings
    search_texts = [c["search_text"] for c in all_chunk_data]
    if search_texts:
        embeddings = embedder.encode(search_texts, show_progress_bar=True)
        # FAISS only accepts float32 matrices; make the dtype explicit rather
        # than relying on whatever the encoder happens to return.
        embeddings = np.array(embeddings, dtype="float32")

        # Create FAISS Index (exact L2 search over vectors of this width)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        print("Vectors indexed:", index.ntotal)
    else:
        print("No text chunks found to index.")
else:
    print("Please run the file upload and PDF reading cell first.")

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Vectors indexed: 1321


# Gemini Legal Text Simplification

In [None]:
def gemini_translate_with_intent(text, intent="general"):
    """Ask Gemini to restate a piece of legal text in plain language.

    Args:
        text: The legal passage to simplify.
        intent: ``"procedural"`` or ``"penalty"`` select a focused prompt;
            any other value uses the general-purpose prompt.

    Returns:
        The stripped model reply, or ``"Explanation (from law text): "``
        followed by the first 500 characters of ``text`` when the reply is
        empty or the call raises.
    """
    def law_text_fallback():
        # Built lazily so it is only evaluated on the paths that return it.
        return "Explanation (from law text): " + text[:500]

    try:
        focused_instructions = (
            (
                "procedural",
                "Rewrite the following legal text into a clear, simple explanation "
                "of the procedure. Do not add new information.\n\n",
            ),
            (
                "penalty",
                "Rewrite the following legal text into a simple explanation "
                "focusing only on the punishment mentioned. Do not add new information.\n\n",
            ),
        )
        # conceptual / general
        general_instruction = (
            "Rewrite the following legal text in simple language for understanding. "
            "Do not add or remove legal meaning.\n\n"
        )
        instruction = next(
            (wording for name, wording in focused_instructions if intent == name),
            general_instruction,
        )
        prompt = instruction + f"{text}"

        # Call Gemini
        response = gemini_model.generate_content(prompt)

        if not (response and response.text):
            return law_text_fallback()
        return response.text.strip()

    except Exception as e:
        # Fallback in case of API error
        print(f"Gemini API Error: {e}")
        return law_text_fallback()

# Law Detection & Reference Extraction

In [None]:
import re

# 1. Keyword table: law identifier -> lowercase substrings associated with it.
# The identifiers are the same strings detect_law() returns (other than "general").
# NOTE(review): this table is not read in the cells visible here; presumably it
# is matched against source file names by retrieval code elsewhere - confirm.
LAW_FILE_KEYWORDS = dict(
    rti=["rti", "right_to_information"],
    consumer=["consumer"],
    it=["it_act", "information_technology", "cyber"],
    ipc=["penal", "ipc"],
    crpc=["criminal_procedure", "crpc"],
    constitution=["constitution"],
    nep=["education", "nep"],
)

# 2. Function to detect which law the user is asking about
def detect_law(query):
    """Guess which law a free-text question is about.

    Rules are tried in order and the first match wins.

    Args:
        query: The user's question.

    Returns:
        One of ``"rti"``, ``"consumer"``, ``"it"``, ``"ipc"``, ``"crpc"``,
        ``"constitution"``, ``"nep"``, or ``"general"`` when nothing matches.
    """
    q = query.lower()
    # Short acronyms are matched as whole words only. As plain substrings they
    # fire inside ordinary words: "article" contains "rti", "confirm" and
    # "first" contain "fir", "nepal" contains "nep".
    words = set(re.findall(r"[a-z0-9]+", q))

    # Checked before the RTI rule, whose generic "information" keyword would
    # otherwise capture every "Information Technology Act" question.
    if "information technology" in q:
        return "it"
    if "rti" in words or "information" in q:
        return "rti"
    if "consumer" in q:
        return "consumer"
    if "cyber" in q or "computer" in q or "online" in q:
        return "it"
    if "penal" in q or "ipc" in words or "offence" in q or "crime" in q:
        return "ipc"
    if "criminal procedure" in q or "crpc" in words or "bail" in q or "fir" in words:
        return "crpc"
    if "constitution" in q or "fundamental" in q or "article" in q:
        return "constitution"
    if "education" in q or "nep" in words:
        return "nep"
    return "general"

# 3. Function to extract section numbers from text (for citations)
def extract_refs(text):
    """List the section/article numbers mentioned in ``text``.

    Every standalone integer between 1 and 999 is treated as a candidate
    reference; larger values (years such as 2005 or 1973) and zero are
    dropped to reduce noise.

    Args:
        text: Text to scan.

    Returns:
        Unique ``"Section/Article <n>"`` strings in ascending numeric order.
    """
    # Work on integers so duplicates such as "2" and "02" collapse and the
    # result sorts numerically (2 before 10) rather than as strings.
    numbers = {int(token) for token in re.findall(r"\b\d+\b", text)}
    return [f"Section/Article {n}" for n in sorted(numbers) if 0 < n < 1000]

# Legal Explainer Bot & Query Interface

In [None]:
def explain_law(query, excerpt_chars=600, max_refs=5):
    """Answer a legal question from the best-matching retrieved passage.

    Retrieves passages with ``retrieve(query)``, sends the text of the first
    result to Gemini for a plain-language explanation, and formats the
    explanation together with a source excerpt, the source file name, the
    detected references and a confidence figure.

    Args:
        query: The user's question.
        excerpt_chars: Maximum number of source-text characters shown in the
            excerpt (default 600).
        max_refs: Maximum number of references listed (default 5).

    Returns:
        The formatted answer, or a fixed "no relevant info" message when
        retrieval returns nothing.
    """
    # 1. Retrieve relevant chunks
    results = retrieve(query)

    if not results:
        return "No relevant legal info found in the provided documents."

    # Only the first result is used; it is indexed with "text" and "source".
    top_result = results[0]
    source_text = top_result["text"]

    # 2. Calculate Metadata
    # extract_refs / confidence_score are optional: when their cells have not
    # been run, the defaults below are reported instead.
    refs = []
    if 'extract_refs' in globals():
        refs = extract_refs(source_text)

    conf = 0
    if 'confidence_score' in globals():
        conf = confidence_score(results, query)

    intent = detect_question_type(query)

    # 3. Generate Explanation using Gemini
    # Use the retrieved text as context for the LLM
    explanation = gemini_translate_with_intent(source_text, intent)

    # 4. Format the Output
    # Trim the raw text to avoid cluttering the screen; the ellipsis is added
    # only when something was actually cut off.
    processed_text_snippet = source_text[:excerpt_chars]
    if len(source_text) > excerpt_chars:
        processed_text_snippet += "..."

    ref_str = ", ".join(refs[:max_refs]) if refs else "None detected"

    # NOTE(review): the "gf" prefix on the CONFIDENCE line looks like a stray
    # or garbled label - confirm the intended text before changing it.
    response = f"""
==================================================
QUESTION: {query}
==================================================

ü§ñ GEMINI EXPLANATION:
{explanation}

--------------------------------------------------
üìú SOURCE TEXT (EXCERPT):
"{processed_text_snippet}"

üìÇ SOURCE FILE: {top_result['source']}
‚öñÔ∏è LEGAL REFS: {ref_str}
gf CONFIDENCE: {conf}%
==================================================
"""
    return response

def start_legal_bot():
    """Run the interactive question/answer loop until the user leaves.

    Each non-trivial line read from ``input()`` is passed to ``explain_law``
    and the answer is printed. The loop ends when the user types ``exit``,
    ``quit`` or ``stop`` (any case), presses Ctrl-C, or the input stream is
    closed. Any other exception is reported and the loop continues with the
    next question.
    """
    print("\n‚öñÔ∏è  LEGAL & POLICY EXPLAINER BOT  ‚öñÔ∏è")
    print("Type 'exit', 'quit', or 'stop' to end the session.\n")

    while True:
        try:
            # Get user input
            user_input = input("Ask a legal question: ").strip()

            # Check for exit keywords
            if user_input.lower() in ["exit", "quit", "stop"]:
                print("\nExiting bot. Goodbye!")
                break

            # Basic validation
            if len(user_input) < 5:
                print("‚ö†Ô∏è  Please enter a longer, more specific question.")
                continue

            # Generate and print response
            print("Thinking...", end="\r")
            answer = explain_law(user_input)
            print(answer)

        except KeyboardInterrupt:
            print("\n\nBot stopped by user.")
            break
        except EOFError:
            # input() raises EOFError once the input stream is closed. Without
            # this clause the generic handler below would catch it on every
            # iteration and the loop would never terminate.
            print("\n\nInput stream closed. Exiting bot.")
            break
        except Exception as e:
            print(f"\n‚ùå An error occurred: {e}")

# --- START THE BOT ---
# Launches the interactive loop as soon as this cell runs; the call returns
# only after the user exits (exit/quit/stop) or interrupts the cell.
start_legal_bot()


‚öñÔ∏è  LEGAL & POLICY EXPLAINER BOT  ‚öñÔ∏è
Type 'exit', 'quit', or 'stop' to end the session.


QUESTION: Right to Equality

ü§ñ GEMINI EXPLANATION:
Here's the legal text rewritten in simple language, without altering its legal meaning:

**Article 14: Equality Before the Law**
The State must not deny any person equality before the law or the equal protection of the laws within the territory of India.

**Article 15: Prohibition of Discrimination on Grounds of Religion, Race, Caste, Sex, or Place of Birth**

(1) The State must not discriminate against any citizen based on their religion, race, caste, sex, or place of birth.

(2) No citizen shall, on grounds only of religion, race, caste, sex, or place of birth, be subject to any disability, liability, restriction, or condition regarding:
    (a) access to shops, public restaurants, hotels, and places of public entertainment; or
    (b) the use of wells, tanks, bathing ghats, roads, and places of public resort maintained wholly or par