# GDPR Compliance Assistant - RAG Agent Implementation

This notebook implements the QA agent for the GDPR Compliance Assistant using your existing Pinecone vector database.



## Setup and Imports

First, let's install required packages and import dependencies.

In [1]:
# First, make sure you have the latest LangChain
# pip install langchain-core langchain-openai

# Cell 1: Setup and Imports
import os
import sys
from dotenv import load_dotenv

# Add project root to Python path
sys.path.append(os.path.abspath('..'))

# LangChain components
from langchain_openai import OpenAIEmbeddings, ChatOpenAI  # ✅ Correct imports
from langchain_pinecone import PineconeVectorStore  # ✅ Pinecone integration
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# from langchain.chains import RetrievalQA
# from langchain.vectorstores import Pinecone
# from langchain.embeddings import OpenAIEmbeddings
# from langchain_openai import ChatOpenAI
# from langchain.prompts import PromptTemplate
# from langchain_pinecone import PineconeVectorStore

# Load environment variables
load_dotenv()

print("✅ All packages imported successfully!")

✅ All packages imported successfully!



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore



## Configuration / Environment Setup

Set up your API keys and configuration. Replace with your actual values.

In [2]:
# Configure your API keys
def setup_environment():
    # Check if API keys are already in environment
    openai_key = os.getenv("OPENAI_API_KEY")
    pinecone_key = os.getenv("PINECONE_API_KEY")
    
    # If not set, prompt user
    if not openai_key:
        openai_key = getpass.getpass("Enter your OpenAI API key: ")
        os.environ["OPENAI_API_KEY"] = openai_key
    
    if not pinecone_key:
        pinecone_key = getpass.getpass("Enter your Pinecone API key: ")
        os.environ["PINECONE_API_KEY"] = pinecone_key
    
    # Your Pinecone index name (replace with your actual index name)
    index_name = "gdpr-compliance-openai"  # Change this to your index name
    
    return index_name

index_name = setup_environment()
print(f"🔑 API keys configured")
print(f"📁 Using Pinecone index: {index_name}")

🔑 API keys configured
📁 Using Pinecone index: gdpr-compliance-openai


In [3]:
# ---------------------------
# Pinecone Initialization (Current 2025 syntax)
# ---------------------------
def init_pinecone(api_key: str, index_name: str = "gdpr-assistant", environment: str = "us-east-1"):
    """
    Initialize Pinecone connection using current Pinecone v7.x+ API
    """
    if not api_key:
        raise ValueError("PINECONE_API_KEY is missing!")
    
    # Initialize Pinecone (Current API)
    print("🔌 Initializing Pinecone...")
    from pinecone import Pinecone, ServerlessSpec
    pc = Pinecone(api_key=api_key)
    print("✅ Pinecone initialized successfully")
    
    # Check if index exists
    if index_name in pc.list_indexes().names():
        print(f"✅ Index '{index_name}' exists")
        # Wait for index to be ready
        while not pc.describe_index(index_name).status.ready:
            print("⏳ Waiting for index to be ready...")
            import time
            time.sleep(1)
    else:
        print(f"⚠️  Index '{index_name}' not found. Creating it...")
        pc.create_index(
            name=index_name,
            dimension=1536,  # OpenAI text-embedding-3-small dimension
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region=environment)
        )
        print(f"✅ Index '{index_name}' created")
    
    # Get the index object
    index = pc.Index(index_name)
    return pc, index

# Initialize Pinecone with your settings
try:
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    pc, index = init_pinecone(
        api_key=pinecone_api_key,
        index_name=index_name,
        environment="us-east-1"
    )
    print("✅ Pinecone setup completed!")
    
except Exception as e:
    print(f"❌ Error initializing Pinecone: {e}")

🔌 Initializing Pinecone...
✅ Pinecone initialized successfully
✅ Index 'gdpr-compliance-openai' exists
✅ Pinecone setup completed!


In [None]:
# # ---------------------------
# # Pinecone Initialization (Adapted from colleague's code)
# # ---------------------------
# def init_pinecone(api_key: str, index_name: str = "gdpr-assistant", environment: str = "us-east-1"):
#     """
#     Initialize Pinecone connection using the new Pinecone v7.x API
#     Adapted from colleague's research-papers project
#     """
#     if not api_key:
#         raise ValueError("PINECONE_API_KEY is missing!")
    
#     # Initialize Pinecone (NEW API - Version 7.x)
#     print("🔌 Initializing Pinecone...")
#     from pinecone import Pinecone
#     pc = Pinecone(api_key=api_key)
#     print("✅ Pinecone initialized successfully")
    
#     # Check if index exists
#     if index_name in pc.list_indexes().names():
#         print(f"✅ Index '{index_name}' exists")
#         # Wait for index to be ready
#         while not pc.describe_index(index_name).status.ready:
#             print("⏳ Waiting for index to be ready...")
#             import time
#             time.sleep(1)
#     else:
#         print(f"⚠️  Index '{index_name}' not found. Creating it...")
#         pc.create_index(
#             name=index_name,
#             dimension=1536,  # OpenAI text-embedding-3-small dimension
#             metric="cosine",
#             spec=ServerlessSpec(cloud="aws", region=environment)
#         )
#         print(f"✅ Index '{index_name}' created")
    
#     # Get the index object
#     index = pc.Index(index_name)
#     return pc, index  # Return both client and index

# # Initialize Pinecone with your settings
# try:
#     pinecone_api_key = os.getenv("PINECONE_API_KEY")
#     pc, index = init_pinecone(
#         api_key=pinecone_api_key,
#         index_name=index_name,
#         environment="us-east-1"  # Adjust if your index is in different region
#     )
#     print("✅ Pinecone setup completed!")
    
# except Exception as e:
#     print(f"❌ Error initializing Pinecone: {e}")

🔌 Initializing Pinecone...
✅ Pinecone initialized successfully
✅ Index 'gdpr-compliance-openai' exists
✅ Pinecone setup completed!


## Initialize embeddings

In [5]:
# Initialize embeddings with CURRENT syntax - NO DEPRECATION WARNING
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY")
)
print("✅ Embeddings initialized successfully")

✅ Embeddings initialized successfully


## Initialize Vector Store Connection

In [4]:
index_name

'gdpr-compliance-openai'

In [7]:
vector_store = PineconeVectorStore(
        index=index,  # Use the index object from our initialization
        embedding=embeddings,
        text_key="text"  # This should match your upload metadata field name
    )
    
print("✅ LangChain successfully connected to Pinecone index!")

✅ LangChain successfully connected to Pinecone index!


## Test the connection with current syntax


In [8]:
# Test the connection with current syntax
test_results = vector_store.similarity_search("Datenschutz", k=2)
print(f"📚 Test retrieval found {len(test_results)} documents")

# Show metadata structure (useful for debugging)
if test_results:
    print(f"📋 Available metadata fields: {list(test_results[0].metadata.keys())}")
    print(f"📄 Sample content: {test_results[0].page_content[:150]}...")
    
# Alternative: Check what's in the vector store
print(f"\n🔍 Vector store type: {type(vector_store)}")

📚 Test retrieval found 2 documents
📋 Available metadata fields: ['author', 'chunk_id', 'chunk_size', 'content_category', 'content_length', 'creationdate', 'document_name', 'document_type', 'language', 'moddate', 'page', 'page_label', 'page_number', 'section_type', 'source', 'total_chunks', 'total_pages']
📄 Sample content: Leitfaden 
Datenschutzrecht 
Was Betriebe zu beachten haben 
 
 
Stand: November 2020 
 
Abteilung Organisation und Recht...

🔍 Vector store type: <class 'langchain_pinecone.vectorstores.PineconeVectorStore'>


## Verify Data and Create Retriever

In [9]:
# Verify data and create retriever with current syntax
print("🔍 Setting up retriever...")

# Create retriever with current syntax
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 3,  # Number of documents to retrieve
        "score_threshold": 0.7  # Optional: minimum similarity score
    }
)

print("✅ Retriever configured!")
print(f"   - Search type: similarity")
print(f"   - k: 3 documents")
print(f"   - score_threshold: 0.7")

# Test the retriever
print("\n🧪 Testing retriever...")
test_docs = retriever.invoke("Datenverarbeitung Grundsätze")
print(f"✅ Retriever test successful - found {len(test_docs)} documents")

🔍 Setting up retriever...
✅ Retriever configured!
   - Search type: similarity
   - k: 3 documents
   - score_threshold: 0.7

🧪 Testing retriever...
✅ Retriever test successful - found 3 documents


## Current LLM Setup

In [10]:
# Initialize LLM with current syntax
print("🚀 Initializing GPT-5 Nano LLM...")

llm = ChatOpenAI(
    model="gpt-5-nano",
    temperature=0.1,
    max_tokens=500,
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

print("✅ LLM initialized with current syntax!")
print(f"   - Model: gpt-5-nano")
print(f"   - Temperature: 0.1") 
print(f"   - Max tokens: 500")

🚀 Initializing GPT-5 Nano LLM...
✅ LLM initialized with current syntax!
   - Model: gpt-5-nano
   - Temperature: 0.1
   - Max tokens: 500


## Create QA Chain

In [14]:
# Create prompt template and QA chain with current syntax
print("🔗 Creating QA chain...")

# Current prompt template
prompt_template = """Du bist ein spezialisierter Assistent für Datenschutzfragen für Handwerksbetriebe
.

Antworte AUF DEUTSCH basierend auf dem bereitgestellten Kontext. 
Sei präzise und fokussiere auf die praktische Umsetzung für Handwerksbetriebe.

Kontext: {context}

Frage: {question}

Antwort (deutsch, präzise, praxisorientiert):"""

PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

# Create QA chain with current syntax
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

print("✅ QA chain created successfully!")

🔗 Creating QA chain...
✅ QA chain created successfully!


## Create a helper function to test the agent and display results.


In [15]:
def ask_gdpr_question(question, show_sources=True):
    """
    Ask a question to the GDPR assistant and display the response with sources.
    
    Args:
        question (str): The question to ask (in German or English)
        show_sources (bool): Whether to display source documents
    
    Returns:
        dict: Complete result with answer and source documents
    """
    print(f"❓ Frage: {question}")
    print("⏳ Denke nach...")
    
    # Get answer from QA chain
    result = qa_chain.invoke({"query": question})

    # Check if we got a valid answer
    answer = result.get('result', '').strip()
    
    print(f"✅ Antwort: {result['result']}")
    
    # Show source documents if requested
    if show_sources and result['source_documents']:
        print(f"\n📚 Verwendete Quellen ({len(result['source_documents'])}):")
        for i, doc in enumerate(result['source_documents']):
            source_text = doc.page_content.replace('\n', ' ').strip()
            print(f"   {i+1}. {source_text[:150]}...")
    
    print("―" * 80)
    return result


## Test the RAG System

Now let's test the system with various GDPR questions.


In [16]:
# Test 2: Data retention periods
print("🧪 TEST 2: Aufbewahrungsfristen")
result2 = ask_gdpr_question("Wie lange dürfen Kundendaten gespeichert werden?")

🧪 TEST 2: Aufbewahrungsfristen
❓ Frage: Wie lange dürfen Kundendaten gespeichert werden?
⏳ Denke nach...
✅ Antwort: 

📚 Verwendete Quellen (3):
   1. Gesetzliche Löschfristen     In vereinzelten Fällen schreiben gesetzliche Regelungen vor, wann bestimmte Daten zu lö- schen sind (für eine Ü bersicht ...
   2. Ob und wann die Aufbewahrung von Daten nicht mehr erforderlich ist, liegt grundsätzlich im  Ermessen des Dateninhabers, also des Handwerksbetriebs, de...
   3. benötigt, schreiben zahlreichliche gesetzliche Regelungen vor, dass bestimmte Daten min- destens für einen konkreten Zeitraum aufzubewahren sind. Solc...
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――


In [None]:
# Test 1: Basic GDPR principles
print("🧪 TEST 1: Grundlegende Datenschutzgrundsätze")
result1 = ask_gdpr_question("Was sind die Grundsätze der Datenverarbeitung im Handwerk?")


In [None]:
zzz
# Test 3: Data breach procedures
print("🧪 TEST 3: Datenpannen")
result3 = ask_gdpr_question("Was muss ich tun bei einer Datenschutzverletzung?")


In [None]:
zz
# Test 4: Employee data
print("🧪 TEST 4: Mitarbeiterdaten")
result4 = ask_gdpr_question("Welche Regeln gelten für die Verarbeitung von Mitarbeiterdaten?")


===
----

----

In [None]:


```python
# Install required packages (run once)
# !pip install langchain langchain-community langchain-openai python-dotenv pinecone-client
```

```python
# Import dependencies
import os
import getpass
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

# For environment variables
from dotenv import load_dotenv
load_dotenv()
```

## Configuration

Set up your API keys and configuration. Replace with your actual values.

```python
# Configure API keys
if "OPENAI_API_KEY" not in os.environ:
    openai_key = getpass.getpass("Enter your OpenAI API key: ")
    os.environ["OPENAI_API_KEY"] = openai_key

if "PINECONE_API_KEY" not in os.environ:
    pinecone_key = getpass.getpass("Enter your Pinecone API key: ")
    os.environ["PINECONE_API_KEY"] = pinecone_key

# Your Pinecone index name (replace with your actual index name)
INDEX_NAME = "gdpr-assistant"  # Change this to your index name

print("✅ API keys configured")
print(f"📁 Using Pinecone index: {INDEX_NAME}")
```

## Initialize Components

Set up the embeddings, vector store connection, and retriever.

```python
# 1. Initialize embeddings (same as used for Pinecone upload)
print("🔄 Initializing embeddings...")
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

# 2. Connect to existing Pinecone index
print("🔗 Connecting to Pinecone...")
vector_store = Pinecone.from_existing_index(
    index_name=INDEX_NAME,
    embedding=embeddings
)

# 3. Set up retriever
print("🎯 Setting up retriever...")
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}  # Retrieve top 3 most relevant chunks
)

print("✅ All components initialized successfully!")
```

## Create Custom Prompt

Design a German-language prompt optimized for GDPR questions and GPT-5 Nano.

```python
# Create optimized prompt for German GDPR questions
prompt_template = """Du bist ein spezialisierter Assistent für Datenschutzfragen im Handwerk. 
Beantworte die Frage basierend ausschließlich auf dem bereitgestellten Kontext. 
Sei präzise und sachlich.

Kontext: {context}

Frage: {question}

Antwort (auf Deutsch, kurz und fachlich):"""

PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

print("📝 Prompt template created:")
print(PROMPT.template)
```

## Initialize QA Agent

Set up the GPT-5 Nano model and create the retrieval QA chain.

```python
# Initialize GPT-5 Nano model
print("🤖 Initializing GPT-5 Nano...")
llm = ChatOpenAI(
    model="gpt-5-nano",
    temperature=0.1,      # Low temperature for factual answers
    max_tokens=500,       # Limit response length
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

# Create the QA chain
print("🔗 Creating QA chain...")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",   # Simple and efficient
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True  # To see which documents were used
)

print("✅ QA agent created successfully!")
```

## Test Function

Create a helper function to test the agent and display results.

```python
def ask_gdpr_question(question, show_sources=True):
    """
    Ask a question to the GDPR assistant and display the response with sources.
    
    Args:
        question (str): The question to ask (in German or English)
        show_sources (bool): Whether to display source documents
    
    Returns:
        dict: Complete result with answer and source documents
    """
    print(f"❓ Frage: {question}")
    print("⏳ Denke nach...")
    
    # Get answer from QA chain
    result = qa_chain({"query": question})
    
    print(f"✅ Antwort: {result['result']}")
    
    # Show source documents if requested
    if show_sources and result['source_documents']:
        print(f"\n📚 Verwendete Quellen ({len(result['source_documents'])}):")
        for i, doc in enumerate(result['source_documents']):
            source_text = doc.page_content.replace('\n', ' ').strip()
            print(f"   {i+1}. {source_text[:150]}...")
    
    print("―" * 80)
    return result
```

## Test the RAG System

Now let's test the system with various GDPR questions.

```python
# Test 1: Basic GDPR principles
print("🧪 TEST 1: Grundlegende Datenschutzgrundsätze")
result1 = ask_gdpr_question("Was sind die Grundsätze der Datenverarbeitung im Handwerk?")
```

```python
# Test 2: Data retention periods
print("🧪 TEST 2: Aufbewahrungsfristen")
result2 = ask_gdpr_question("Wie lange dürfen Kundendaten gespeichert werden?")
```

```python
# Test 3: Data breach procedures
print("🧪 TEST 3: Datenpannen")
result3 = ask_gdpr_question("Was muss ich tun bei einer Datenschutzverletzung?")
```

```python
# Test 4: Employee data
print("🧪 TEST 4: Mitarbeiterdaten")
result4 = ask_gdpr_question("Welche Regeln gelten für die Verarbeitung von Mitarbeiterdaten?")
```

## Advanced Testing

Test with more specific scenarios to evaluate the system's performance.

```python
# Test with more specific scenarios
specific_tests = [
    "Muss ich für Marketing-E-Mails immer eine Einwilligung haben?",
    "Was ist eine Datenschutzfolgenabschätzung und wann ist sie erforderlich?",
    "Darf ich Fotos von meinen Handwerksarbeiten auf der Website verwenden?",
    "Wie behandle ich Daten von Lieferanten und Partnern?",
]

print("🧪 SPEZIFISCHE TESTS")
for i, question in enumerate(specific_tests, 1):
    print(f"\nTest {i}/4:")
    ask_gdpr_question(question, show_sources=True)
```

## Verify Source Quality

Check if the retrieved documents are relevant to the questions.

```python
def analyze_source_relevance(question, top_k=5):
    """
    Analyze which documents are being retrieved for a question
    """
    print(f"🔍 Analyzing sources for: '{question}'")
    
    # Get documents directly from retriever
    docs = retriever.get_relevant_documents(question)
    
    print(f"📄 Retrieved {len(docs)} documents:")
    for i, doc in enumerate(docs):
        print(f"\n--- Document {i+1} ---")
        content_preview = doc.page_content.replace('\n', ' ').strip()
        print(f"Content: {content_preview[:200]}...")
        if hasattr(doc, 'metadata'):
            print(f"Metadata: {doc.metadata}")

# Test source relevance
analyze_source_relevance("Datenaufbewahrung Kunden")
```

## Save the QA Agent

Save the configured agent for use in other notebooks or applications.

```python
# Function to quickly recreate the QA agent
def get_qa_agent():
    """
    Returns a pre-configured QA agent for GDPR questions
    """
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_store = Pinecone.from_existing_index(INDEX_NAME, embeddings)
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    
    llm = ChatOpenAI(
        model="gpt-5-nano",
        temperature=0.1,
        max_tokens=500
    )
    
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True
    )

print("✅ QA agent function saved - ready for POC 1 completion!")
```

## Next Steps

Your POC 1 is now complete! Here's what you've accomplished:

✅ **POC 1 Completed**: Ask questions in German, receive answers in German based on ZDH guidelines

**Next steps for POC 2** (Multilingual support):
1. Modify the prompt to detect input language
2. Add language switching logic
3. Test with English questions

**Quick test to verify everything works:**

```python
# Final verification
final_test = ask_gdpr_question("Was sind meine Pflichten als Handwerksbetrieb bezüglich Datenschutz?")
print("🎉 POC 1 successfully implemented!")
```

This notebook gives you a complete, working RAG system for your GDPR Compliance Assistant. Each cell can be run independently, and the entire system is now ready for your POC 1 demonstration!

-----
----


# Draft

In [None]:
# chat completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [None]:
qa.run(query)

In [None]:
from langchain.agents import Tool

tools = [
    Tool(
        name='Knowledge Base',
        func=qa.run,
        description=(
            'use this tool when answering general knowledge queries to get '
            'more information about the topic'
        )
    )
]

In [None]:
from langchain.agents import initialize_agent

agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory
)