In [27]:
import os
from pypdf import PdfReader
import spacy
import chromadb
from google import genai

In [None]:
# Folder that is scanned for the source regulation PDFs.
PDF_FOLDER = "privacy-regulation-resources/"
# On-disk location for the Chroma database.
# NOTE(review): presumably meant for a persistent Chroma client — confirm where the client is created.
CHROMA_PATH = "./REGULATIONS/chroma_db"
# Name of the Chroma collection that holds the regulation chunks.
COLLECTION_NAME = "regulations"

In [29]:
# Load SpaCy model (for sentence segmentation).
# `nlp` is a module-level global that chunk_with_spacy() calls directly, so this
# cell must run before any chunking cell. The "en_core_web_sm" model has to be
# installed in the kernel's environment for spacy.load() to succeed.
nlp = spacy.load("en_core_web_sm")



In [None]:
# Shared Gemini API client; GeminiEmbeddingFunction reads this module-level name.
# No credentials are passed here, so they must come from the environment
# (presumably GEMINI_API_KEY / GOOGLE_API_KEY — confirm for the installed SDK version).
client_gemini = genai.Client()

from chromadb.api.types import EmbeddingFunction

class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to the Gemini embedding API.

    Relies on the module-level ``client_gemini`` client for the actual request.
    """

    def __call__(self, texts):
        """Embed one string or a list of strings.

        Args:
            texts: A single string or a list of strings to embed.

        Returns:
            A list with one embedding (the ``values`` of each returned
            embedding object) per input text.
        """
        # A bare string is treated as a batch of one.
        batch = [texts] if isinstance(texts, str) else texts

        result = client_gemini.models.embed_content(
            model="models/text-embedding-004",
            contents=batch,
        )

        vectors = []
        for embedding in result.embeddings:
            vectors.append(embedding.values)
        return vectors

    def name(self):
        """Return the identifier string for this embedding function."""
        return "gemini-text-embedding-004"



In [31]:
def extract_text_from_pdf(file_path):
    """Read a PDF and return the text of every page that has any.

    Args:
        file_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A list of ``{"page": <1-based page number>, "text": <page text>}``
        dicts in page order. Pages whose extracted text is empty or ``None``
        are left out, so page numbers in the result may have gaps.
    """
    extracted = []

    for page_number, pdf_page in enumerate(PdfReader(file_path).pages, start=1):
        content = pdf_page.extract_text()
        if not content:
            # Nothing extractable on this page; skip it but keep numbering.
            continue
        extracted.append({"page": page_number, "text": content})

    return extracted

In [32]:
def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The raw string to normalise.

    Returns:
        The whitespace-separated tokens of ``text`` joined by single spaces
        (an empty string when ``text`` has no tokens).
    """
    tokens = text.split()
    return " ".join(tokens)

In [None]:
def chunk_with_spacy(text, chunk_size=800, overlap=100):
    """Split text into sentence-aligned chunks with a character overlap.

    Sentences come from the module-level SpaCy pipeline ``nlp`` and are packed
    into a chunk until adding the next one would exceed ``chunk_size``.

    NOTE: a later cell defines ``chunk_with_spacy`` again (without overlap);
    whichever of the two cells ran last is the definition in effect.

    Args:
        text: The text to split.
        chunk_size: Soft limit, in characters, for one chunk. A single
            sentence longer than this still becomes its own chunk.
        overlap: Number of trailing characters of the finished chunk that are
            repeated at the start of the next one. ``0`` disables overlap.

    Returns:
        A list of non-empty, stripped chunk strings in document order.
    """
    doc = nlp(text)
    chunks = []
    current = ""

    for sent in doc.sents:
        if len(current) + len(sent.text) <= chunk_size:
            current += " " + sent.text
        else:
            # Only emit real content: when the very first sentence is already
            # over the limit, `current` is still empty here.
            finished = current.strip()
            if finished:
                chunks.append(finished)
            # `current[-0:]` would be the whole string, so treat a non-positive
            # overlap explicitly as "carry nothing over".
            tail = current[-overlap:] if overlap > 0 else ""
            current = tail + " " + sent.text

    if current.strip():
        chunks.append(current.strip())

    return chunks

In [43]:
def chunk_with_spacy(text, chunk_size=800):
    """
    Meaning-preserving chunking:
    - Split using SpaCy sentences
    - Combine sentences into chunks without breaking meaning

    Sentences come from the module-level SpaCy pipeline ``nlp``. They are
    never split, so a single sentence longer than ``chunk_size`` becomes a
    chunk of its own that exceeds the limit.

    NOTE: an earlier cell defines a same-named variant with an ``overlap``
    parameter; this definition replaces it once this cell has run.

    Args:
        text: The text to split.
        chunk_size: Soft limit, in characters, for one chunk.

    Returns:
        A list of non-empty, stripped chunk strings in document order.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    chunks = []
    current = ""

    for sent in sentences:
        if len(current) + len(sent) <= chunk_size:
            current += " " + sent
        else:
            # `current` is still empty when the very first sentence is already
            # over the limit; do not emit an empty chunk in that case.
            if current.strip():
                chunks.append(current.strip())
            current = sent

    # Check the stripped value so whitespace-only leftovers are not emitted.
    if current.strip():
        chunks.append(current.strip())

    return chunks


In [44]:
# Build one flat list of chunk records from every PDF in PDF_FOLDER.
# Each record is {"text": <chunk>, "metadata": {"source": <file name>, "page": <1-based page>}}.
all_chunks = []

# os.listdir() returns entries in arbitrary order. Sorting keeps the position of
# every chunk in all_chunks (and therefore the positional ids assigned when the
# chunks are added to Chroma) stable from one run to the next.
for file in sorted(os.listdir(PDF_FOLDER)):
    # Case-insensitive so files named e.g. "GDPR.PDF" are not silently skipped.
    if not file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(PDF_FOLDER, file)
    pages = extract_text_from_pdf(pdf_path)

    for page in pages:
        # Chunk page by page so each chunk maps to exactly one page number.
        cleaned = clean_text(page["text"])
        chunks = chunk_with_spacy(cleaned)

        for chunk in chunks:
            all_chunks.append({
                "text": chunk,
                "metadata": {
                    "source": file,
                    "page": page["page"]
                }
            })

print(f"✅ Total chunks created: {len(all_chunks)}")

✅ Total chunks created: 1854


In [45]:
# Preview only the first few records; displaying the whole list would write
# every chunk into the notebook output.
all_chunks[:3]

[{'text': 'I (Legislativ e acts) REGUL A TIONS REGUL A TION (EU) 2016/679 OF THE EUR OPEAN P ARLIAMENT AND OF THE COUNCIL of 27 Apr il 2016 on the protection of natural persons with regard to the processing of personal data and on the free mo v ement of such dat a, and repealing Directiv e 95/46/EC (General Data Protection Regulation) (T ext with EEA relevance)',
  'metadata': {'source': 'GDPR.pdf', 'page': 1}},
 {'text': 'THE EUR OPEAN P ARLIAMENT AND THE COUNCIL OF THE EUR OPEAN UNION, Ha ving regar d to the T reaty on the Functioning of the European Union, and in par ticular Ar ticle 16 thereof, Ha ving regar d to the proposal from the European Commission, Af ter transmission of the draf t legislative act to the national parliaments, Ha ving regar d to the opinion of the European Economic and Social Committ ee ( 1 ), Ha ving regar d to the opinion of the Committ ee of the Regions ( 2 ), A cting in accordance with the ordinar y legislative procedure ( 3 ), Whereas: (1) The protection

In [34]:
# NOTE(review): this cell repeats the chunk-building cell above and rebuilds
# all_chunks from scratch, so a full run parses every PDF twice. Consider
# keeping only one of the two cells.
#
# Build one flat list of chunk records from every PDF in PDF_FOLDER.
# Each record is {"text": <chunk>, "metadata": {"source": <file name>, "page": <1-based page>}}.
all_chunks = []

# os.listdir() returns entries in arbitrary order. Sorting keeps the position of
# every chunk in all_chunks (and therefore the positional ids assigned when the
# chunks are added to Chroma) stable from one run to the next.
for file in sorted(os.listdir(PDF_FOLDER)):
    # Case-insensitive so files named e.g. "GDPR.PDF" are not silently skipped.
    if not file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(PDF_FOLDER, file)
    pages = extract_text_from_pdf(pdf_path)

    for page in pages:
        # Chunk page by page so each chunk maps to exactly one page number.
        cleaned = clean_text(page["text"])
        chunks = chunk_with_spacy(cleaned)

        for chunk in chunks:
            all_chunks.append({
                "text": chunk,
                "metadata": {
                    "source": file,
                    "page": page["page"]
                }
            })

print(f"✅ Total chunks created: {len(all_chunks)}")

✅ Total chunks created: 2021


In [42]:
# Preview only the first few records; displaying the whole list would write
# every chunk into the notebook output.
all_chunks[:3]

[{'text': 'I (Legislativ e acts) REGUL A TIONS REGUL A TION (EU) 2016/679 OF THE EUR OPEAN P ARLIAMENT AND OF THE COUNCIL of 27 Apr il 2016 on the protection of natural persons with regard to the processing of personal data and on the free mo v ement of such dat a, and repealing Directiv e 95/46/EC (General Data Protection Regulation) (T ext with EEA relevance)',
  'metadata': {'source': 'GDPR.pdf', 'page': 1}},
 {'text': 'a, and repealing Directiv e 95/46/EC (General Data Protection Regulation) (T ext with EEA relevance) THE EUR OPEAN P ARLIAMENT AND THE COUNCIL OF THE EUR OPEAN UNION, Ha ving regar d to the T reaty on the Functioning of the European Union, and in par ticular Ar ticle 16 thereof, Ha ving regar d to the proposal from the European Commission, Af ter transmission of the draf t legislative act to the national parliaments, Ha ving regar d to the opinion of the European Economic and Social Committ ee ( 1 ), Ha ving regar d to the opinion of the Committ ee of the Regions ( 2

In [None]:
# Open (or create) the on-disk Chroma database. Creating the client here makes
# the cell work on a fresh kernel instead of relying on a `chroma_client`
# variable left over from an earlier session.
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

embedding_fn = GeminiEmbeddingFunction()

# get_or_create_collection lets this cell be re-run against an existing store;
# create_collection raises when the collection is already there.
# To rebuild the index from scratch, call
# chroma_client.delete_collection(COLLECTION_NAME) once before this cell.
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_fn
)



  embedding_fn = GeminiEmbeddingFunction()


In [50]:
# Number of chunks handed to collection.add() per call when populating the collection.
BATCH_SIZE = 100

def batched_add(collection, chunks, batch_size=100):
    """Add chunk records to a collection in fixed-size batches.

    Args:
        collection: Object exposing ``add(documents=..., metadatas=..., ids=...)``.
        chunks: Sequence of ``{"text": ..., "metadata": ...}`` records.
        batch_size: Maximum number of records sent per ``add`` call.

    Each record gets the id ``chunk_<position>``, where position is its index
    in ``chunks``. A progress line is printed after every batch.
    """
    for offset in range(0, len(chunks), batch_size):
        window = chunks[offset:offset + batch_size]

        texts, metas, ids = [], [], []
        # Start enumerate() at the batch offset so ids stay global positions.
        for position, record in enumerate(window, start=offset):
            texts.append(record["text"])
            metas.append(record["metadata"])
            ids.append(f"chunk_{position}")

        collection.add(documents=texts, metadatas=metas, ids=ids)

        print(f"✅ Added chunks {offset} → {offset + len(window)}")


# Populate the collection only when it is empty, so re-running this cell does
# not try to insert the same chunk ids a second time.
if collection.count() != 0:
    print(f"ℹ️ Collection already has {collection.count()} documents")
else:
    print("📥 Adding documents to Chroma in batches...")
    batched_add(collection, all_chunks, BATCH_SIZE)
    print("🎉 All documents added successfully")


📥 Adding documents to Chroma in batches...
✅ Added chunks 0 → 100
✅ Added chunks 100 → 200
✅ Added chunks 200 → 300
✅ Added chunks 300 → 400
✅ Added chunks 400 → 500
✅ Added chunks 500 → 600
✅ Added chunks 600 → 700
✅ Added chunks 700 → 800
✅ Added chunks 800 → 900
✅ Added chunks 900 → 1000
✅ Added chunks 1000 → 1100
✅ Added chunks 1100 → 1200
✅ Added chunks 1200 → 1300
✅ Added chunks 1300 → 1400
✅ Added chunks 1400 → 1500
✅ Added chunks 1500 → 1600
✅ Added chunks 1600 → 1700
✅ Added chunks 1700 → 1800
✅ Added chunks 1800 → 1854
🎉 All documents added successfully


In [51]:
# Smoke-test retrieval: fetch the five nearest chunks for one sample question.
query_text = "How sensitive is persons' email address?"

results = collection.query(query_texts=[query_text], n_results=5)

print("\n🔍 Top Results:\n")

# One query was sent, so the hits for it are at index 0 of each result list.
top_documents = results["documents"][0]
top_metadatas = results["metadatas"][0]

for doc, meta in zip(top_documents, top_metadatas):
    # Show only the first 400 characters of each hit, followed by its origin.
    print(doc[:400])
    print("📄 Source:", meta["source"], "| Page:", meta["page"])
    print("-" * 80)


🔍 Top Results:

For example, using anti-spoofing controls such as Domain-based Message Authentication, Reporting & Conformance (DMARC), Sender Policy Framework (SPF), and Domain Keys Identified Mail (DKIM) will help stop phishers from spoofing the entity’s domain and impersonating personnel. The deployment of technologies for blocking phishing emails and malware before they reach personnel, such as link scrubbers
📄 Source: PCI-DSS.pdf | Page: 126
--------------------------------------------------------------------------------
For example, the need to mitigat e an immediat e r isk of damage w ould call f or promp t communication with data subjects whereas the need to implement appropr iate measures ag ainst continuing or similar personal data breac hes ma y justify more time f or communication. (87) It should be ascer tained whether all appropr iate tec hnological protect ion and org anisational measures hav e been im p
📄 Source: GDPR.pdf | Page: 17
------------------------------------