## Imports and notes

In [None]:
!pip install pdfplumber

In [None]:
!pip install langchain langchain-huggingface langchain-community pypdf langchain_chroma

In [None]:
!pip install chromadb smolagents python-dotenv gradio sentence-transformers

In [None]:
!pip install openai langchain

In [None]:
!pip install --upgrade langchain langchain-community langchain-huggingface chromadb sentence-transformers

In [6]:
# print(os.getcwd())
# base_dir = input("Enter the base directory path for the dataset: ")

# # Verify the path
# print("Base directory set to:", base_dir)

In [None]:
!pip install tiktoken python-dotenv

In [10]:
base_dir = "..\data\Legal-Tactics-Book.zip"

  base_dir = "..\data\Legal-Tactics-Book.zip"


In [11]:
import zipfile
import pdfplumber
import os
from dotenv import load_dotenv
import re
import shutil
from io import BytesIO
import pandas as pd
import numpy as np
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
import torch
torch.set_num_threads(1)

# Load environment variables from .env file
load_dotenv(dotenv_path="../.env")

# Get the API key
openai_api_key = os.getenv("OPENAI_API_KEY")

if openai_api_key is None:
    print("Error: OPENAI_API_KEY not found in .env file.")
    exit()

# Set the API key as an environment variable
os.environ["OPENAI_API_KEY"] = openai_api_key

def extract_zip(uploaded_zip_path, extract_to="../temp_pdfs"):
    """Extracts a zipped folder containing PDFs."""
    os.makedirs(extract_to, exist_ok=True)  # Ensure extraction folder exists

    with zipfile.ZipFile(uploaded_zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    pdf_files = []
    for root, _, files in os.walk(extract_to):
        for file in files:
            if file.lower().endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))

    return pdf_files

def pdf_to_markdown_string(pdf_path):
    """Extracts structured content while preserving section headers."""
    with pdfplumber.open(pdf_path) as pdf:
        sections = {}
        current_section = None
        skipped_pages = 2  # Skip first two pages bc table of contents extends 

        for page_num, page in enumerate(pdf.pages[skipped_pages:]):  
            text = page.extract_text()
            if not text:
                continue

            lines = text.split("\n")

            for line in lines:
                if re.match(r"^\s*Chapter \d+:?", line) or re.match(r"^[A-Z][A-Z\s]+$", line.strip()):
                    current_section = line.strip()
                    sections[current_section] = sections.get(current_section, "")
                elif current_section:
                    sections[current_section] += line.strip() + "\n"

    # Ensure section headers stay attached to their respective text chunks
    markdown_chunks = [f"## {section}\n\n{content.strip()}\n\n" for section, content in sections.items()]
    return markdown_chunks

def determine_role(text):
    """Assigns a role based on detected keywords. Might need to do a bit more research 
    on keywords themselves to see which ones are the correct keywords to put in each list."""
    tenant_keywords = ["tenant rights", "rent control", "eviction protections", "lease termination"]
    landlord_keywords = ["landlord duties", "property maintenance", "rent collection", "eviction process"]

    if any(keyword in text.lower() for keyword in tenant_keywords):
        return "tenant"
    elif any(keyword in text.lower() for keyword in landlord_keywords):
        return "landlord"
    return "general"  # Default if no clear role is identified

def split_text_with_headers(text_splitter, markdown_sections):
    """Splits text while ensuring section headers remain in context."""
    all_chunks = []

    for section in markdown_sections:
        section_title = section.split("\n")[0]  # Extract the first line (header)
        chunks = text_splitter.split_text(section)

        for chunk in chunks:
            enriched_chunk = f"{section_title}\n\n{chunk}"  # Attach section header to each chunk
            all_chunks.append(enriched_chunk)

    return all_chunks

def load_and_process_pdfs(zip_path):
    """Processes a zipped folder of PDFs, preserves headers, and splits into smaller chunks for vector storage."""
    pdf_files = extract_zip(zip_path)
    all_chunks = []
    
    # Use a text splitter to break sections into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Adjust chunk size as needed?? Will need to experiment with this and how it interacts with the prompts we try out too - made bigger from 500
        chunk_overlap=200 # Made overlapping size bigger from 50
    )

    for pdf in pdf_files:
        markdown_sections = pdf_to_markdown_string(pdf)

        # Use new method to ensure headers remain attached to their respective text chunks
        enriched_chunks = split_text_with_headers(text_splitter, markdown_sections)

        for chunk in enriched_chunks:
            role = determine_role(chunk)  # Assign role based on text analysis
            all_chunks.append(Document(page_content=chunk, metadata={"source": os.path.basename(pdf), "role": role}))

    return all_chunks

def create_vector_store(chunks, persist_dir: str):
    """Create and persist a Chroma vector store using OpenAI embeddings."""
    
    if os.path.exists(persist_dir):
        print(f"Removing existing vector store from {persist_dir}")
        shutil.rmtree(persist_dir)  # Try commenting this out if issues persist

    # Debugging info
    print(f"Total chunks received for vector store: {len(chunks)}")
    if chunks:
        print(f"Example chunk: {chunks[0].page_content[:300]}")

    try:
        # Initialize OpenAI Embeddings
        embedding_model = OpenAIEmbeddings()

        print("Building and saving the new vector store with OpenAI embeddings...")
        vector_db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=persist_dir
        )
        return vector_db

    except Exception as e:
        print(f"Error creating vector store: {e}")
        return None  # Return None if an error occurs


def query_vector_store(vector_db, query, role="general"):
    """Finds the most relevant chunks based on the query."""
    """Switching from similarity to MMR bc MMR prioritizes diversity in the results, 
    ensuring a mix of relevant but non-redundant information, 
    whereas similarity search focuses solely on the closest matches."""
     # results = vector_db.similarity_search(query, k=3)  # Retrieve top most relevant results
    results = vector_db.max_marginal_relevance_search(query, k=5) # Might need to play around with K value here
    # Filter results based on role metadata
    filtered_results = [doc.page_content for doc in results if doc.metadata.get("role", "general") == role]
    
    if not filtered_results:  # Fallback if no perfect match is found
        filtered_results = [doc.page_content for doc in results]

    return filtered_results[:3]  # Return top 3 refined results

def main():
    zip_file_path = base_dir 
    vector_db_dir = os.path.join(os.getcwd(), "/chroma_db") # Added slash

    print("Processing PDFs into chunks...")
    document_chunks = load_and_process_pdfs("your_zip_path.zip")
    print(f"Total chunks created: {len(document_chunks)}")
    # Tests
    print(f"Example chunk:\n{document_chunks[0].page_content[:500]}")
    print(f"Metadata: {document_chunks[0].metadata}")

    print("Building the vector store...")
    vector_db = create_vector_store(document_chunks, vector_db_dir)
    print(f"Vector store successfully created at {vector_db_dir}")

    # Example queries
    tenant_query = "What rights do tenants have during eviction?"
    landlord_query = "What obligations do landlords have for maintenance?"

    tenant_results = query_vector_store(vector_db, tenant_query, role="tenant")
    landlord_results = query_vector_store(vector_db, landlord_query, role="landlord")

    print("\nTenant Response:")
    for result in tenant_results:
        print(result[:300])

    print("\nLandlord Response:")
    for result in landlord_results:
        print(result[:300])

if __name__ == "__main__":
    main()

Processing PDFs into chunks...
Generated 2934 document chunks.
Building the vector store...
Removing existing vector store from c:/chroma_db
Total chunks received for vector store: 2934
Example chunk: ## Chapter 17: Condominium Control ▲


  embedding_model = OpenAIEmbeddings()


Building and saving the new vector store with OpenAI embeddings...
Vector store successfully created at c:/chroma_db

Top Matching Results:
1. Legal Tactics: Tenants Rights in
Massachusetts May 2017
372 ▲ Chapter 16: Mobile Homes...
2. Law." This notice must inform you of your
Mobile home park tenants in Massachusetts have rights and be in the exact language
a number of very important rights before contained in the law.
moving into a mobile home park. A park owner
cannot refuse to rent a lot to you if you meet the The park owner m...
3. may become a tenant at will.12
Tenancy by Regulation
If you are a tenant in a mobile home or public or subsidized housing, you are a
tenant by regulation.13 You may have more protections as a tenant.
If you live in a mobile home, see Chapter 16: Mobile Homes.
If you live in public or subsidized hous...


In [None]:
# import sqlite3
# # 'c:/chroma_db/chroma.sqlite3'


# def read_chroma_db(db_file="c:/chroma_db/chroma.sqlite3"): #Default filename
#     try:
#         conn = sqlite3.connect(db_file)
#         cursor = conn.cursor()

#         # Example: Get table names
#         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
#         tables = cursor.fetchall()

#         print("Tables in chroma.sqlite3:")
#         for table in tables:
#             print(table[0])

#         # Example: Read data from the first table (if any)
#         if tables:
#             first_table = tables[0][0]  # Get the name of the first table
#             cursor.execute(f"SELECT * FROM {first_table};")
#             rows = cursor.fetchall()

#             print(f"\nData from {first_table}:")
#             for row in rows:
#                 print(row)

#     except sqlite3.Error as e:
#         print(f"An error occurred: {e}")

#     finally:
#         if conn:
#             conn.close()

# read_chroma_db() #uses the default filename.