# 🗄️ GDPR Compliance Agent - Notebook 4: Reusable Pinecone Upload for New Documents

This notebook modularizes the pipeline to process new GDPR-related PDFs and upload them to Pinecone. It combines and generalizes the workflows from `01_text_pdf_processing_document-2.ipynb` and `02_pinecone_embeddings-2.ipynb` for future reuse.

- **What it does**:
  - Extracts text and metadata from a PDF (one file per run; repeat the run for additional files)
  - Chunks text using optimized parameters for legal documents
  - Optionally estimates embedding cost
  - Uploads chunks to Pinecone with OpenAI embeddings
  - Provides a simple retrieval verification

- **How to use**:
  - Put your PDFs in `2_data/raw/`
  - Set `SINGLE_PDF_PATH` and `DOCUMENT_TYPE` in the config cell below (one PDF per run)
  - Ensure `.env` contains `OPENAI_API_KEY` and `PINECONE_API_KEY`
  - Run cells from top to bottom


In [1]:
# Cell 1: Setup and Imports
import os
import sys
import time
import pickle
from typing import List, Dict, Tuple, Optional

# Add project root to Python path
sys.path.append(os.path.abspath('..'))

# Third-party libraries
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

from pinecone import Pinecone, ServerlessSpec


# Helper utilities
from src.embedding_cost_calculator import calculate_embedding_cost, quick_cost

print("✅ Libraries imported successfully!")


def init_environment() -> Tuple[Optional[str], Optional[str]]:
    """Load the dotenv configuration and hand back the two API keys.

    Returns:
        ``(openai_key, pinecone_key)`` read from the ``OPENAI_API_KEY`` and
        ``PINECONE_API_KEY`` environment variables. An entry is ``None`` when
        its variable is not set.
    """
    load_dotenv()
    return os.getenv('OPENAI_API_KEY'), os.getenv('PINECONE_API_KEY')


✅ Libraries imported successfully!



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [2]:
# Cell 2: Environment and Configuration
# Load the API keys once; each value is None when its environment variable is unset.
OPENAI_API_KEY, PINECONE_API_KEY = init_environment()

# User-configurable parameters
# NOTE: several functions below use these constants as default argument values,
# which Python captures when the `def` runs. After changing a value here,
# re-run the defining cells as well (or simply Restart & Run All).
SINGLE_PDF_PATH: str = "../2_data/raw/bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf"  # Select ONE specific PDF
DOCUMENT_TYPE: str = "bitkom_ai_gdpr_handbook"  # Stored as the `document_type` metadata field on every page

INDEX_NAME: str = "gdpr-compliance-openai"  # Must already exist: init_pinecone() only looks the index up, it does not create one
EMBEDDING_MODEL: str = "text-embedding-3-small"
CHUNK_SIZE: int = 800  # Measured in characters (the splitter uses length_function=len)
CHUNK_OVERLAP: int = 120  # Characters shared between neighbouring chunks

# Preview-first flags
PREVIEW_ONLY: bool = True   # If True, process and preview chunks; skip upload
PREVIEW_NUM_CHUNKS: int = 5 # Show this many chunks in preview


In [3]:
# The API keys were already loaded by the init_environment() call in Cell 2
# (the function itself is defined in Cell 1); nothing is prompted for or re-set here.
# This message is printed unconditionally -- the following cell reports whether
# each key is actually present.
print("🔑 API keys configured via init_environment()")
print(f"📁 Using Pinecone index: {INDEX_NAME}")

🔑 API keys configured via init_environment()
📁 Using Pinecone index: gdpr-compliance-openai


In [4]:
# Verify environment variables and print config
# Only a ✅/❌ marker is printed per key, never the key value itself, so the
# saved cell output does not leak credentials.
print("🔑 Environment Configuration:")
print(f"   OpenAI API Key: {'✅' if OPENAI_API_KEY else '❌'}")
print(f"   Pinecone API Key: {'✅' if PINECONE_API_KEY else '❌'}")
print("🧩 Config:")
print(f"   SINGLE_PDF_PATH: {SINGLE_PDF_PATH}")
print(f"   DOCUMENT_TYPE: {DOCUMENT_TYPE}")
print(f"   INDEX_NAME: {INDEX_NAME}")
print(f"   MODEL: {EMBEDDING_MODEL}")
print(f"   CHUNK_SIZE/OVERLAP: {CHUNK_SIZE}/{CHUNK_OVERLAP}")
print(f"   PREVIEW_ONLY: {PREVIEW_ONLY}")
print(f"   PREVIEW_NUM_CHUNKS: {PREVIEW_NUM_CHUNKS}")

# A missing key only produces a warning; execution of the notebook continues.
if not all([OPENAI_API_KEY, PINECONE_API_KEY]):
    print("⚠️  Missing environment variables! Please set in your .env file.")


🔑 Environment Configuration:
   OpenAI API Key: ✅
   Pinecone API Key: ✅
🧩 Config:
   SINGLE_PDF_PATH: ../2_data/raw/bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf
   DOCUMENT_TYPE: bitkom_ai_gdpr_handbook
   INDEX_NAME: gdpr-compliance-openai
   MODEL: text-embedding-3-small
   CHUNK_SIZE/OVERLAP: 800/120
   PREVIEW_ONLY: True
   PREVIEW_NUM_CHUNKS: 5


In [5]:
# Cell 3: PDF Extraction Utilities

def categorize_content(text: str) -> str:
    """Assign a coarse content category by case-insensitive keyword lookup.

    The keyword groups are tried in the order listed below; the first group
    with any keyword occurring as a substring of the lowercased text decides
    the result. ``"general"`` is returned when no keyword matches.
    """
    haystack = text.lower()
    # Order matters: earlier groups take precedence over later ones.
    keyword_groups = (
        ("customer_data", ('kunde', 'customer', 'marketing')),
        ("employee_data", ('mitarbeiter', 'employee', 'personal')),
        ("legal_basis", ('recht', 'law', 'gesetz', 'dsgvo')),
        ("security", ('sicherheit', 'security', 'datenschutzverletzung')),
        ("data_retention", ('speicherung', 'retention', 'aufbewahrung')),
    )
    for category, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in haystack:
                return category
    return "general"


def identify_section_type(text: str) -> str:
    """Classify a text as ``"section_header"``, ``"heading"`` or ``"content"``.

    After stripping surrounding whitespace:
    - shorter than 200 characters and containing one of the (case-sensitive)
      markers KAPITEL / ARTIKEL / SECTION -> ``"section_header"``
    - shorter than 100 characters and entirely upper-case -> ``"heading"``
    - anything else -> ``"content"``
    """
    stripped = text.strip()
    length = len(stripped)

    if length < 200:
        for marker in ('KAPITEL', 'ARTIKEL', 'SECTION'):
            if marker in stripped:
                return "section_header"

    if length < 100 and stripped.isupper():
        return "heading"

    return "content"


def extract_pdf_with_metadata(
    pdf_path: str,
    document_type: Optional[str] = None,
    language: str = "german",
) -> List[Document]:
    """Load a PDF page by page and attach project metadata to every page.

    Args:
        pdf_path: Path of the PDF file to load.
        document_type: Value stored under the ``document_type`` metadata key.
            When ``None`` the module-level ``DOCUMENT_TYPE`` is read at call
            time.
        language: Value stored under the ``language`` metadata key.

    Returns:
        One ``Document`` per page returned by the loader, each carrying the
        custom metadata merged with the loader's own metadata (minus
        ``producer`` and ``creator``). An empty list when the file does not
        exist or the loader returns no pages.
    """
    print(f"📄 Extracting from: {pdf_path}")
    if not os.path.exists(pdf_path):
        print(f"❌ File not found: {pdf_path}")
        return []

    docs = PyPDFLoader(pdf_path).load()
    if not docs:
        print("❌ No pages extracted from PDF.")
        return []

    total_pages = len(docs)
    print(f"✅ Successfully extracted {total_pages} pages")

    # Loop-invariant values are computed once instead of once per page.
    dropped_keys = frozenset(('producer', 'creator'))
    resolved_type = DOCUMENT_TYPE if document_type is None else document_type
    document_name = os.path.basename(pdf_path)

    enhanced_docs: List[Document] = []
    for page_index, doc in enumerate(docs):
        # Loader metadata values are not all strings (e.g. page numbers).
        clean_metadata: Dict[str, object] = {
            key: value for key, value in doc.metadata.items() if key not in dropped_keys
        }
        custom_metadata = {
            "document_type": resolved_type,
            "document_name": document_name,
            "language": language,
            "source": pdf_path,
            "page_number": page_index + 1,
            "total_pages": total_pages,
            "content_length": len(doc.page_content),
            "content_category": categorize_content(doc.page_content),
            "section_type": identify_section_type(doc.page_content),
        }
        # On a key collision the loader-provided value overrides the custom one.
        final_metadata = {**custom_metadata, **clean_metadata}
        enhanced_docs.append(Document(page_content=doc.page_content, metadata=final_metadata))

    # `docs` is non-empty at this point, so there is always a first page to show.
    print("\n📋 First page sample:")
    print(enhanced_docs[0].page_content[:200] + "...")
    print(f"📊 Metadata: {enhanced_docs[0].metadata}")

    return enhanced_docs


def extract_single_pdf(pdf_path: str) -> List[Document]:
    """Extract a single PDF by delegating to ``extract_pdf_with_metadata``.

    Returns whatever ``extract_pdf_with_metadata(pdf_path)`` returns.
    """
    return extract_pdf_with_metadata(pdf_path)



In [6]:
# Cell 4: Chunking and Cost Utilities

def create_optimized_splitter(chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP) -> RecursiveCharacterTextSplitter:
    """Build the text splitter used for chunking.

    Args:
        chunk_size: Passed to the splitter as ``chunk_size``; lengths are
            measured with ``len``, i.e. in characters.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Note:
        The defaults are the values ``CHUNK_SIZE`` / ``CHUNK_OVERLAP`` had
        when this function was defined.
    """
    # Listed from paragraph breaks down to the empty string.
    boundary_separators = ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""]
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": boundary_separators,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)


def chunk_documents(documents: List[Document]) -> List[Document]:
    """Split documents into chunks and number them.

    Uses ``create_optimized_splitter()`` with its default settings. Every
    resulting chunk gets three extra metadata fields: ``chunk_id`` (1-based
    position across all chunks), ``chunk_size`` (character count of the chunk
    text) and ``total_chunks``.

    Returns:
        The list of annotated chunks.
    """
    pieces = create_optimized_splitter().split_documents(documents)
    piece_count = len(pieces)

    # Annotate each chunk with its position and size.
    for position, piece in enumerate(pieces, start=1):
        piece.metadata["chunk_id"] = position
        piece.metadata["chunk_size"] = len(piece.page_content)
        piece.metadata["total_chunks"] = piece_count

    print("✂️ Chunking Results:")
    print(f"   Input documents: {len(documents)}")
    print(f"   Output chunks: {piece_count}")

    return pieces


def estimate_embedding_cost_for_chunks(chunks: List[Document]) -> Dict[str, float]:
    """Pass the text of every chunk to ``calculate_embedding_cost`` and return its result.

    NOTE(review): the annotation says ``Dict[str, float]``, but
    ``process_and_optionally_upload`` only reports the result when it is an
    ``int``/``float``. The real return type depends on
    ``src.embedding_cost_calculator`` -- TODO confirm and align the two.
    NOTE(review): no model name is passed, so the helper presumably applies
    its own default model rather than ``EMBEDDING_MODEL`` -- verify.
    """
    texts = [c.page_content for c in chunks]
    return calculate_embedding_cost(texts)


In [7]:
# Cell 5: Pinecone Initialization and Upload Utilities

# ---------------------------
# Pinecone Initialization
# ---------------------------
def init_pinecone(api_key: str, index_name: str = "gdpr-compliance-openai", environment: str = "us-east-1", ready_timeout_s: Optional[float] = 120.0):
    """Connect to Pinecone and return the client plus a handle to an existing index.

    Args:
        api_key: Pinecone API key.
        index_name: Name of the index to look up. The index is never created
            here.
        environment: Not used by this function; kept so existing callers that
            pass it keep working.
        ready_timeout_s: Maximum number of seconds to wait for an existing
            index to report ready. ``None`` waits without limit.

    Returns:
        ``(pc, index)``:
        - ``(None, None)`` when ``api_key`` is empty/missing,
        - ``(pc, None)`` when the index does not exist or did not become
          ready within ``ready_timeout_s``,
        - ``(pc, index)`` otherwise.
    """
    if not api_key:
        print("❌ PINECONE_API_KEY is missing!")
        return None, None

    print("🔌 Initializing Pinecone...")
    pc = Pinecone(api_key=api_key)
    print("✅ Pinecone initialized successfully")

    if index_name not in pc.list_indexes().names():
        print(f"⚠️  Index '{index_name}' not found.")
        return pc, None

    print(f"✅ Index '{index_name}' exists")

    def _index_ready() -> bool:
        # A status object without a `ready` attribute is treated as ready.
        return bool(getattr(pc.describe_index(index_name).status, 'ready', True))

    if not _index_ready():
        print("⏳ Waiting for index to be ready...")
        deadline = None if ready_timeout_s is None else time.monotonic() + ready_timeout_s
        while not _index_ready():
            # Bound the wait so a stuck index cannot hang the notebook forever.
            if deadline is not None and time.monotonic() >= deadline:
                print(f"❌ Index '{index_name}' not ready after {ready_timeout_s:.0f}s.")
                return pc, None
            time.sleep(1)

    return pc, pc.Index(index_name)

def create_vectorstore_and_upload(chunks: List[Document], index_name: str = INDEX_NAME, embedding_model: str = EMBEDDING_MODEL):
    """Embed the chunks with OpenAI and upload them to a Pinecone index.

    Each vector ID is a SHA-256 digest of the chunk's ``source`` metadata, its
    position in ``chunks`` and its text. Uploading the same chunks again
    therefore overwrites the existing vectors instead of adding duplicates
    under fresh random IDs.

    Args:
        chunks: Chunk documents to embed and upload.
        index_name: Target Pinecone index.
        embedding_model: OpenAI embedding model name.

    Returns:
        The ``PineconeVectorStore`` returned by ``from_documents``.
    """
    import hashlib  # local import: only this function needs it

    print("🔄 Creating Pinecone vector store...")
    embeddings = OpenAIEmbeddings(model=embedding_model)

    # The position is part of the key so identical text occurring twice in one
    # document still yields two distinct vectors.
    chunk_ids = [
        hashlib.sha256(
            f"{chunk.metadata.get('source', '')}|{position}|{chunk.page_content}".encode("utf-8")
        ).hexdigest()
        for position, chunk in enumerate(chunks)
    ]

    vectorstore = PineconeVectorStore.from_documents(
        documents=chunks,
        embedding=embeddings,
        index_name=index_name,
        ids=chunk_ids,
    )
    print(f"✅ Successfully loaded {len(chunks)} documents into Pinecone")
    total_chars = sum(len(c.page_content) for c in chunks)
    print(f"📊 Stats: {len(chunks)} chunks, ~{total_chars} characters")
    return vectorstore

In [13]:
# Cell 6: Main Runner - Process and Optional Upload

def process_and_optionally_upload(single_pdf_path: str, preview_only: bool = PREVIEW_ONLY) -> Tuple[List[Document], List[Document], Optional[PineconeVectorStore]]:
    """Extract, chunk and cost-estimate one PDF, then upload unless previewing.

    Args:
        single_pdf_path: Path of the PDF to process.
        preview_only: When true, stop after the cost estimate and skip the
            Pinecone upload. The default is the value ``PREVIEW_ONLY`` had
            when this function was defined.

    Returns:
        ``(docs, chunks, vectorstore)``. ``([], [], None)`` when extraction
        yields nothing; ``vectorstore`` is ``None`` in preview mode or when
        ``init_pinecone`` returns no index.
    """
    # 1) Extract one PDF
    docs = extract_single_pdf(single_pdf_path)
    if not docs:
        print("❌ No documents extracted. Exiting.")
        return [], [], None

    # 2) Chunk
    chunks = chunk_documents(docs)

    # 3) Cost estimation (optional)
    cost_usd = estimate_embedding_cost_for_chunks(chunks)
    # Only a plain numeric result is reported here; any other return shape is
    # silently skipped.
    if isinstance(cost_usd, (int, float)):
        print("\n📊 Embedding Cost (estimate)")
        print(f"   Model: {EMBEDDING_MODEL}")
        print(f"   Texts: {len(chunks)}")
        print(f"   Estimated cost: ${cost_usd:.6f}")

    # 4) Preview-first guard
    if preview_only:
        print("\n🛑 PREVIEW_ONLY=True -> Skipping Pinecone upload. Inspect chunks below.")
        return docs, chunks, None

    # 5) Pinecone upload (when preview_only=False)
    # The client itself is not needed here, only the index handle.
    _pc, index = init_pinecone(PINECONE_API_KEY, INDEX_NAME)
    # Compare against None explicitly instead of relying on the truthiness of
    # a client object.
    if index is None:
        print("❌ Pinecone not available or index missing. Exiting before upload.")
        return docs, chunks, None

    vectorstore = create_vectorstore_and_upload(chunks, INDEX_NAME, EMBEDDING_MODEL)
    return docs, chunks, vectorstore

In [16]:
# Run the pipeline in preview mode by default
# This re-assigns the config flag and passes it explicitly, so the upload is
# skipped regardless of the function's default. In preview mode the function
# returns None as its third value, so VECTORSTORE is None after this cell.
PREVIEW_ONLY = True
DOCS, CHUNKS, VECTORSTORE = process_and_optionally_upload(SINGLE_PDF_PATH, PREVIEW_ONLY)

📄 Extracting from: ../2_data/raw/bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf
✅ Successfully extracted 57 pages

📋 First page sample:
Künstliche 
Intelligenz & 
Datenschutz 
Praxisleitfaden Version 2.0 | Neuauflage...
📊 Metadata: {'document_type': 'bitkom_ai_gdpr_handbook', 'document_name': 'bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf', 'language': 'german', 'source': '../2_data/raw/bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf', 'page_number': 1, 'total_pages': 57, 'content_length': 80, 'content_category': 'general', 'section_type': 'content', 'creationdate': '2025-08-12T09:15:52+02:00', 'title': 'Praxisleitfaden KI & Datenschutz', 'author': 'Isabelle Stroot', 'subject': 'KI-Datenschutz', 'moddate': '2025-08-12T09:15:52+02:00', 'page': 0, 'page_label': '1'}
✂️ Chunking Results:
   Input documents: 57
   Output chunks: 249
📊 Cost Calculation:
   - Number of texts: 249
   - Total tokens: 51120
   - Model: text

In [17]:
# Cell 7: Preview - Inspect First Chunks Before Upload
def preview_chunks(chunks: List[Document], num: int = PREVIEW_NUM_CHUNKS):
    """Print the leading chunks so they can be inspected before uploading.

    Args:
        chunks: Chunks to preview; an empty list only prints a warning.
        num: Maximum number of chunks to show.

    For every shown chunk a small metadata summary is printed, followed by at
    most the first 400 characters of its text (with ``...`` appended when the
    text was cut).
    """
    if not chunks:
        print("⚠️  No chunks to preview.")
        return

    shown = min(num, len(chunks))
    print(f"📋 Showing first {shown} chunk(s):")

    summary_keys = ("document_name", "page_number", "content_category", "chunk_id", "chunk_size")
    for ordinal in range(1, shown + 1):
        chunk = chunks[ordinal - 1]
        meta = {}
        for key in summary_keys:
            meta[key] = chunk.metadata.get(key)

        snippet = chunk.page_content[:400]
        if len(chunk.page_content) > 400:
            snippet += "..."

        print(f"\n--- Chunk {ordinal} ---")
        print(f"Meta: {meta}")
        print(snippet)


# Run chunk preview: show up to PREVIEW_NUM_CHUNKS of the chunks produced by
# the most recent pipeline run (CHUNKS) so they can be checked before upload.
preview_chunks(CHUNKS, PREVIEW_NUM_CHUNKS)

# Optional: similarity search only runs after upload
def preview_search(vectorstore: Optional[PineconeVectorStore], queries: List[str], k: int = 2):
    """Run each query against the vector store and print the leading hits.

    Args:
        vectorstore: Store to query; when ``None`` the test is skipped with a
            message.
        queries: Query strings to search for.
        k: Number of results requested per query.

    For every hit the first 150 characters of its text are printed.
    """
    if vectorstore is None:
        print("(Skipping retrieval test — no upload performed in PREVIEW_ONLY mode)")
        return

    print("🧪 Testing Vector Store Retrieval...")
    divider = "   " + "─" * 50
    for question in queries:
        print(f"\n🔍 Query: '{question}'")
        hits = vectorstore.similarity_search(question, k=k)
        print(f"   Found {len(hits)} relevant chunks:")
        for rank, hit in enumerate(hits, start=1):
            print(f"   {rank}. {hit.page_content[:150]}...")
        print(divider)

# Sample German queries used for the retrieval sanity check.
TEST_QUERIES = [
    "Was ist die Datenschutzrichtlinie?",
    "Wie sollen Kundendaten behandelt werden?",
]

# preview_search() prints a skip message and returns when VECTORSTORE is None,
# which is the case after a preview-only run.
preview_search(VECTORSTORE, TEST_QUERIES, k=2)


📋 Showing first 5 chunk(s):

--- Chunk 1 ---
Meta: {'document_name': 'bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf', 'page_number': 1, 'content_category': 'general', 'chunk_id': 1, 'chunk_size': 80}
Künstliche 
Intelligenz & 
Datenschutz 
Praxisleitfaden Version 2.0 | Neuauflage

--- Chunk 2 ---
Meta: {'document_name': 'bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf', 'page_number': 2, 'content_category': 'legal_basis', 'chunk_id': 2, 'chunk_size': 789}
Praxisleitfaden KI & Datenschutz 
2 
Inhalt 
Geleitwort 4 
1 Ziel des Leitfadens 5 
Wann sprechen wir überhaupt von Künstlicher Intelligenz? 6 
Ethischer Rahmen: Vertrauenswürdige KI-Gestaltung strategisch 
verankern und umsetzen 8 
Rechtsrahmen beim Einsatz von KI 9 
2 Checkliste zum datenschutzkonformen Einsatz von KI 12 
Training eigener KI-Modelle und Systeme 13 
Nutzung von KI-Systemen und Mo...

--- Chunk 3 ---
Meta: {'document_name': 'bitkom-leitfaden-kuenstliche-intelligenz-und-d

In [18]:
# Cell 8: Upload to Pinecone (Run after verifying chunks)
# NOTE: this call runs the whole pipeline again (extraction, chunking, cost
# estimate) rather than reusing the CHUNKS from the preview run, and every
# execution of this cell sends the chunks to Pinecone again.

# Set preview flag to False to enable upload
# (this overwrites the config flag for the remainder of the kernel session)
PREVIEW_ONLY = False

# Run end-to-end with upload
DOCS, CHUNKS, VECTORSTORE = process_and_optionally_upload(SINGLE_PDF_PATH, PREVIEW_ONLY)

# Optional: quick retrieval sanity check after upload
# preview_search(VECTORSTORE, TEST_QUERIES, k=2)


📄 Extracting from: ../2_data/raw/bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf
✅ Successfully extracted 57 pages

📋 First page sample:
Künstliche 
Intelligenz & 
Datenschutz 
Praxisleitfaden Version 2.0 | Neuauflage...
📊 Metadata: {'document_type': 'bitkom_ai_gdpr_handbook', 'document_name': 'bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf', 'language': 'german', 'source': '../2_data/raw/bitkom-leitfaden-kuenstliche-intelligenz-und-datenschutz-auflage-2.pdf', 'page_number': 1, 'total_pages': 57, 'content_length': 80, 'content_category': 'general', 'section_type': 'content', 'creationdate': '2025-08-12T09:15:52+02:00', 'title': 'Praxisleitfaden KI & Datenschutz', 'author': 'Isabelle Stroot', 'subject': 'KI-Datenschutz', 'moddate': '2025-08-12T09:15:52+02:00', 'page': 0, 'page_label': '1'}
✂️ Chunking Results:
   Input documents: 57
   Output chunks: 249
📊 Cost Calculation:
   - Number of texts: 249
   - Total tokens: 51120
   - Model: text