## Imports and notes

In [1]:
!pip install pdfplumber



In [2]:
!pip install langchain langchain-huggingface langchain-community pypdf langchain_chroma



In [3]:
!pip install chromadb smolagents python-dotenv gradio sentence-transformers



In [4]:
!pip install openai langchain



In [5]:
!pip install --upgrade langchain langchain-community langchain-huggingface chromadb sentence-transformers



In [6]:
# print(os.getcwd())
# base_dir = input("Enter the base directory path for the dataset: ")

# # Verify the path
# print("Base directory set to:", base_dir)

In [7]:
!pip install tiktoken python-dotenv



In [10]:
base_dir = "..\data\Legal-Tactics-Book.zip"

  base_dir = "..\data\Legal-Tactics-Book.zip"


In [11]:
import zipfile
import pdfplumber
import os
from dotenv import load_dotenv
import re
import shutil
from io import BytesIO
import pandas as pd
import numpy as np
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
import torch
torch.set_num_threads(1)

# Load environment variables from .env file
load_dotenv(dotenv_path="../.env")

# Get the API key
openai_api_key = os.getenv("OPENAI_API_KEY")

if openai_api_key is None:
    print("Error: OPENAI_API_KEY not found in .env file.")
    exit()

# Set the API key as an environment variable
os.environ["OPENAI_API_KEY"] = openai_api_key

def extract_zip(uploaded_zip_path, extract_to="../temp_pdfs"):
    """Extracts a zipped folder containing PDFs."""
    os.makedirs(extract_to, exist_ok=True)  # Ensure extraction folder exists

    with zipfile.ZipFile(uploaded_zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    pdf_files = []
    for root, _, files in os.walk(extract_to):
        for file in files:
            if file.lower().endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))

    return pdf_files

def pdf_to_markdown_string(pdf_path):
    """Extracts structured content from a PDF and chunks by major sections."""
    with pdfplumber.open(pdf_path) as pdf:
        sections = {}
        current_section = None
        first_page_skipped = False

        for page_num, page in enumerate(pdf.pages):
            if not first_page_skipped:
                first_page_skipped = True
                continue  # Skip the first page (assumed to be TOC)

            text = page.extract_text()
            if not text:
                continue

            lines = text.split("\n")

            for line in lines:
                # Detecting major section headings using regex patterns
                # Detects section headings based on patterns like "Chapter X", bold text, or all-uppercase words
                if re.match(r"^\s*Chapter \d+:?", line) or re.match(r"^[A-Z][A-Z\s]+$", line.strip()):
                    current_section = line.strip()
                    sections[current_section] = sections.get(current_section, "")
                elif current_section:
                    sections[current_section] += line.strip() + "\n"

    # Convert to a list of markdown chunks
    markdown_chunks = []
    for section, content in sections.items():
        markdown_chunks.append(f"## {section}\n\n{content.strip()}\n\n")

    return markdown_chunks

def load_and_process_pdfs(zip_path):
    """Processes a zipped folder of PDFs and splits into smaller chunks for vector storage."""
    pdf_files = extract_zip(zip_path)
    all_chunks = []

    for pdf in pdf_files:
        markdown_sections = pdf_to_markdown_string(pdf)
        
        # Use a text splitter to break sections into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,  # Adjust chunk size as needed?? Will need to experiment with this and how it interacts with the prompts we try out too
            chunk_overlap=50 # Made overlapping size smaller 
        )

        for section in markdown_sections:
            chunks = text_splitter.split_text(section)
            for chunk in chunks:
                all_chunks.append(Document(page_content=chunk, metadata={"source": os.path.basename(pdf)}))

    return all_chunks

def create_vector_store(chunks, persist_dir: str):
    """Create and persist a Chroma vector store using OpenAI embeddings."""
    
    if os.path.exists(persist_dir):
        print(f"Removing existing vector store from {persist_dir}")
        shutil.rmtree(persist_dir)  # Try commenting this out if issues persist

    # Debugging info
    print(f"Total chunks received for vector store: {len(chunks)}")
    if chunks:
        print(f"Example chunk: {chunks[0].page_content[:300]}")

    try:
        # Initialize OpenAI Embeddings
        embedding_model = OpenAIEmbeddings()

        print("Building and saving the new vector store with OpenAI embeddings...")
        vector_db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=persist_dir
        )
        return vector_db

    except Exception as e:
        print(f"Error creating vector store: {e}")
        return None  # Return None if an error occurs


def query_vector_store(vector_db, query):
    """Finds the most relevant chunks based on the query."""
    results = vector_db.similarity_search(query, k=3)  # Retrieve top 3 most relevant results
    return [doc.page_content for doc in results]

def main():
    zip_file_path = base_dir 
    vector_db_dir = os.path.join(os.getcwd(), "/chroma_db") # Added slash

    print("Processing PDFs into chunks...")
    document_chunks = load_and_process_pdfs(zip_file_path)
    print(f"Generated {len(document_chunks)} document chunks.")

    print("Building the vector store...")
    vector_db = create_vector_store(document_chunks, vector_db_dir)
    print(f"Vector store successfully created at {vector_db_dir}")

    # Example query
    user_query = "What are the legal protections for mobile home tenants in Massachusetts?"
    results = query_vector_store(vector_db, user_query)

    print("\nTop Matching Results:")
    for idx, result in enumerate(results, 1):
        print(f"{idx}. {result[:300]}...")  # Display first 300 chars
        
        
if __name__ == "__main__":
    main()

Processing PDFs into chunks...
Generated 2934 document chunks.
Building the vector store...
Removing existing vector store from c:/chroma_db
Total chunks received for vector store: 2934
Example chunk: ## Chapter 17: Condominium Control ▲


  embedding_model = OpenAIEmbeddings()


Building and saving the new vector store with OpenAI embeddings...
Vector store successfully created at c:/chroma_db

Top Matching Results:
1. Legal Tactics: Tenants Rights in
Massachusetts May 2017
372 ▲ Chapter 16: Mobile Homes...
2. Law." This notice must inform you of your
Mobile home park tenants in Massachusetts have rights and be in the exact language
a number of very important rights before contained in the law.
moving into a mobile home park. A park owner
cannot refuse to rent a lot to you if you meet the The park owner m...
3. may become a tenant at will.12
Tenancy by Regulation
If you are a tenant in a mobile home or public or subsidized housing, you are a
tenant by regulation.13 You may have more protections as a tenant.
If you live in a mobile home, see Chapter 16: Mobile Homes.
If you live in public or subsidized hous...


In [None]:
# import sqlite3
# # 'c:/chroma_db/chroma.sqlite3'


# def read_chroma_db(db_file="c:/chroma_db/chroma.sqlite3"): #Default filename
#     try:
#         conn = sqlite3.connect(db_file)
#         cursor = conn.cursor()

#         # Example: Get table names
#         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
#         tables = cursor.fetchall()

#         print("Tables in chroma.sqlite3:")
#         for table in tables:
#             print(table[0])

#         # Example: Read data from the first table (if any)
#         if tables:
#             first_table = tables[0][0]  # Get the name of the first table
#             cursor.execute(f"SELECT * FROM {first_table};")
#             rows = cursor.fetchall()

#             print(f"\nData from {first_table}:")
#             for row in rows:
#                 print(row)

#     except sqlite3.Error as e:
#         print(f"An error occurred: {e}")

#     finally:
#         if conn:
#             conn.close()

# read_chroma_db() #uses the default filename.