In [1]:
import os
import re
import uuid
import glob
from typing import List, Dict, Tuple, Optional

import pandas as pd
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import RedirectResponse
from pydantic import BaseModel

# OpenAI (Azure)
from openai import AzureOpenAI

In [2]:
# LangChain (modern split)
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.retrievers import AzureAISearchRetriever
# from langchain_community.vectorstores import AzureSearch
from langchain_text_splitters import TokenTextSplitter

# Azure AI Search vector store
from langchain_azure_ai.vectorstores.azuresearch import AzureSearch

load_dotenv()

True

In [3]:
# ---------- Environment ----------
# Location of the GDPR articles CSV. The CSV_PATH environment variable takes
# precedence; the literal below is only the fallback when it is unset.
CSV_PATH = os.getenv(
    "CSV_PATH",
    "/mnt/data/gdpr_cased_articles_with_recitals.csv",  # your uploaded file path
)

In [4]:
# ---------- Config / Clients ----------
# Settings are read from the process environment. Lookups written as
# os.environ[...] raise KeyError when the variable is unset; lookups written
# as os.getenv(...) yield None instead.

AOAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
AOAI_KEY = os.environ["AZURE_OPENAI_KEY"]
CHAT_DEPLOYMENT = os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"]
EMB_DEPLOYMENT = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
# NOTE(review): the two per-service API versions below are not referenced in
# the cells visible here; only AOAI_API_VERSION is passed to a client.
AOAI_CHAT_API_VERSION = os.getenv("AZURE_OPENAI_CHAT_API_VERSION")
AOAI_EMB_API_VERSION = os.getenv("AZURE_OPENAI_EMBED_API_VERSION")
AOAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

SEARCH_ENDPOINT = os.environ["AZURE_SEARCH_ENDPOINT"]  # https://<service>.search.windows.net
SEARCH_KEY = os.environ["AZURE_SEARCH_API_KEY"]
# NOTE(review): read with getenv, so a missing AZURE_SEARCH_INDEX_NAME yields
# None here instead of an immediate KeyError — confirm that is intended.
SEARCH_INDEX = os.getenv("AZURE_SEARCH_INDEX_NAME")

# Remove legacy OpenAI globals that can clash with Azure mode
for k in ("OPENAI_API_BASE", "OPENAI_BASE_URL", "OPENAI_ORG"):
    os.environ.pop(k, None)

In [5]:
SEARCH_INDEX

'mukul-ai-search-index'

In [6]:
# ---------- Clients ----------
# Azure OpenAI client; the chat-completion cells in this notebook call
# chat_client.chat.completions.create(...) on it.
chat_client = AzureOpenAI(
    azure_endpoint=AOAI_ENDPOINT,
    api_key=AOAI_KEY,
    api_version=AOAI_API_VERSION,
)

In [7]:
# Embedding settings, read from the environment. os.getenv returns None for an
# unset variable, hence Optional[str] rather than str.
azure_endpoint: Optional[str] = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_api_key: Optional[str] = os.getenv("AZURE_OPENAI_KEY")
azure_openai_api_version: Optional[str] = os.getenv("AZURE_OPENAI_API_VERSION")
azure_deployment: Optional[str] = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

embeddings = AzureOpenAIEmbeddings(
    model=azure_deployment,
    azure_endpoint=azure_endpoint,
    openai_api_key=azure_openai_api_key,
    # Forward the configured API version so the embedding client uses the
    # version read above. It is passed only when set, so an unset variable
    # leaves the library's own default resolution untouched.
    **(
        {"openai_api_version": azure_openai_api_version}
        if azure_openai_api_version
        else {}
    ),
)

In [8]:
azure_openai_api_version

'2024-02-01'

In [9]:
# Should return a list of floats of expected length (e.g., 3072 for text-embedding-3-large)
print(len(embeddings.embed_query("hello world")))

3072


In [10]:
# If you know your embedding model's dimension, set it here (recommended):
# text-embedding-3-large -> 3072; text-embedding-3-small -> 1536
EMBED_DIM = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIM", "3072"))

# Azure Semantic Search
What semantic search adds (if you set semantic_search_enabled=True)

Azure’s Semantic Search is a different, proprietary reranker layer (part of Azure AI Search, not OpenAI).
When you enable it:

- It uses Microsoft’s internal transformer models to re-rank the top n text results.

- It can perform semantic captions, highlights, and reasoning over plain text (not just vectors).

- It’s great for enterprise keyword-based search (“meaningful snippets”, “why did this doc match?”).

**It requires:**

- The Semantic Search feature to be enabled on your Search service (Standard S or higher).

- A semantic configuration defined in your index (usually named "default" or "semanticconfig1").

**However:**

- The built-in semantic ranker is not aware of your OpenAI embeddings; it operates on text fields, not vector fields.

- It costs extra (depending on your tier) and slightly increases latency.

- It’s redundant if your retrieval is already vector-based and you do reranking in LangChain or the LLM itself.

In [11]:
REQUIRED_COLS = {"article_id", "article_title", "article_text", "article_recitals"}

def _normalize_recitals(value) -> list[str]:
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return []
    s = str(value).strip().strip('"').strip("'")
    if not s:
        return []
    return [x.strip() for x in s.split(",") if x.strip()]

def _sort_recitals(rec_set: set[str]) -> list[str]:
    def sort_key(x: str):
        xs = x.strip()
        return (0, int(xs)) if xs.isdigit() else (1, xs.lower())
    return sorted(rec_set, key=sort_key)

def _compose_article_text(title: str, text: str, recitals: list[str]) -> str:
    t = (text or "").strip()
    title = (title or "").strip()
    rec_str = ", ".join(recitals) if recitals else "—"
    return f"## {title}\n\n{t}\n\n[Recitals: {rec_str}]\n"

def load_csv_documents(csv_path: str, merge_by_article_id: bool = True) -> list[Document]:
    """Load the articles CSV and convert it into ``Document`` objects.

    Args:
        csv_path: Path to a CSV containing at least the ``REQUIRED_COLS``
            columns.
        merge_by_article_id: When True (default), all rows sharing an
            ``article_id`` are merged into one document; when False, every
            row becomes its own document.

    Returns:
        Documents whose ``page_content`` is produced by
        ``_compose_article_text``. Rows (or merged groups) that have neither
        a title nor any text are skipped.

    Raises:
        FileNotFoundError: ``csv_path`` does not point to a file.
        ValueError: One or more required columns are missing.
    """
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    # Read every column as text so numeric-looking ids/recitals are not
    # inferred as floats (which would later stringify as e.g. "6.0").
    df = pd.read_csv(csv_path, dtype=str)  # add encoding=... if needed
    missing = REQUIRED_COLS - set(df.columns)
    if missing:
        raise ValueError(f"CSV missing required columns: {sorted(missing)}")

    # Missing cells become empty strings; everything else stays as text.
    for col in sorted(REQUIRED_COLS):
        df[col] = df[col].fillna("").astype(str)

    if not merge_by_article_id:
        row_docs: list[Document] = []
        for idx, row in df.iterrows():
            title = row["article_title"].strip()
            text = row["article_text"].strip()
            # The composed content always contains the heading and recital
            # markers, so emptiness has to be checked on the inputs.
            if not (title or text):
                continue
            recs = _normalize_recitals(row["article_recitals"])
            row_docs.append(Document(
                page_content=_compose_article_text(title, text, recs),
                metadata={
                    "source": csv_path,
                    "doc_id": str(uuid.uuid4()),
                    "row_index": int(idx),
                    "article_id": row["article_id"].strip(),
                    "article_title": title,
                    "recitals": recs,
                    "merge_mode": "row",
                }
            ))
        return row_docs

    # Merge rows with same article_id (title can vary slightly)
    grouped: dict[str, dict[str, object]] = {}
    for idx, row in df.iterrows():
        aid = row["article_id"].strip()
        title = row["article_title"].strip()
        text = row["article_text"].strip()
        recs = _normalize_recitals(row["article_recitals"])

        bundle = grouped.setdefault(
            aid, {"titles": [], "texts": [], "recitals": set(), "rows": []}
        )
        if title:
            bundle["titles"].append(title)
        if text:
            bundle["texts"].append(text)
        bundle["recitals"].update(recs)
        bundle["rows"].append(int(idx))

    docs: list[Document] = []
    for aid, bundle in grouped.items():
        merged_text = "\n\n".join(bundle["texts"]).strip()
        titles = bundle["titles"]
        # Nothing to index for this id: no title and no text on any row.
        if not merged_text and not titles:
            continue
        # Most common title wins; dict.fromkeys keeps first-seen order so a
        # tie is resolved deterministically in favour of the earliest title.
        title = max(dict.fromkeys(titles), key=titles.count) if titles else aid
        rec_list = _sort_recitals(bundle["recitals"])
        docs.append(Document(
            page_content=_compose_article_text(title, merged_text, rec_list),
            metadata={
                "source": csv_path,
                "doc_id": str(uuid.uuid4()),
                "article_id": aid,
                "article_title": title,
                "row_indices": bundle["rows"],
                "recitals": rec_list,
                "merge_mode": "article_id",
            }
        ))
    return docs

In [12]:
docs_generated = load_csv_documents(csv_path=CSV_PATH)

In [13]:
print(docs_generated[2])

page_content='## Territorial scope

This Regulation applies to the processing of personal data in the context of the activities of an establishment of a controller or a processor in the Union, regardless of whether the processing takes place in the Union or not.

This Regulation applies to the processing of personal data of data subjects who are in the Union by a controller or processor not established in the Union, where the processing activities are related to: the offering of goods or services, irrespective of whether a payment of the data subject is required, to such data subjects in the Union;

This Regulation applies to the processing of personal data of data subjects who are in the Union by a controller or processor not established in the Union, where the processing activities are related to: the monitoring of their behaviour as far as their behaviour takes place within the Union.

This Regulation applies to the processing of personal data by a controller not established in the 

In [14]:
len(docs_generated)

99

In [15]:
# ---------- Chunking ----------
def chunk_docs(
    docs: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 0,
) -> List[Document]:
    """
    Split documents into chunks and tag each chunk with positional metadata.

    Defaults are 1000-character chunks with no overlap; pass ``chunk_overlap``
    explicitly to get overlapping chunks.

    Args:
        docs: Documents to split.
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        One Document per chunk. Each carries a copy of the parent metadata plus
        ``chunk_id``, ``chunk_count``, ``char_start`` and ``char_end``; the
        offsets are positions of the chunk text inside the parent
        ``page_content``, for better citations.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )

    chunks: List[Document] = []
    for d in docs:
        source_text = d.page_content
        pieces = splitter.split_text(source_text)
        search_from = 0      # earliest position where the next piece may start
        estimated_start = 0  # fallback when a piece is not found verbatim
        for i, text in enumerate(pieces):
            # Locate the piece in the parent text instead of summing piece
            # lengths: summing ignores separators/whitespace that do not end
            # up in any piece, so the estimate drifts further with each chunk.
            located = source_text.find(text, search_from)
            if located >= 0:
                char_start = located
                # The next piece may begin up to chunk_overlap characters
                # before this one ends, but always after this one starts.
                search_from = max(located + len(text) - chunk_overlap, located + 1)
            else:
                char_start = estimated_start
            char_end = char_start + len(text)
            estimated_start = max(char_end - chunk_overlap, 0)

            meta = dict(d.metadata)
            meta.update({
                "chunk_id": i,
                "chunk_count": len(pieces),
                "char_start": char_start,
                "char_end": char_end,
            })
            chunks.append(Document(page_content=text, metadata=meta))
    return chunks

In [16]:
chunks_generated = chunk_docs(docs_generated)

In [17]:
print(chunks_generated[5])

page_content='## Definitions

'personal data' means any information relating to an identified or identifiable natural person ('data subject'); an identifiable natural person is one who can be identified, directly or indirectly, in particular by reference to an identifier such as a name, an identification number, location data, an online identifier or to one or more factors specific to the physical, physiological, genetic, mental, economic, cultural or social identity of that natural person;

'processing' means any operation or set of operations which is performed on personal data or on sets of personal data, whether or not by automated means, such as collection, recording, organisation, structuring, storage, adaptation or alteration, retrieval, consultation, use, disclosure by transmission, dissemination or otherwise making available, alignment or combination, restriction, erasure or destruction;' metadata={'source': '/Users/manjubudha/Desktop/Workspace/Colab_Projects/rag_azure_fastapi

In [18]:
len(chunks_generated)

371

In [19]:
# Vector store (Azure AI Search)
# semantic_search_enabled=False
# Use the vector index for retrieval (cosine / dot-product similarity on embeddings) — do not call the Azure semantic ranker service.
def make_vectorstore() -> AzureSearch:
    """
    Build the Azure AI Search vector store used for indexing and retrieval.

    Pure vector retrieval (fast + cheap). ``search_type`` is set explicitly so
    the store's default search mode matches this description instead of
    depending on the library default. You can enable Azure Semantic Ranker
    later if needed.

    Returns:
        An ``AzureSearch`` store bound to ``SEARCH_INDEX`` that embeds text
        with ``embeddings.embed_query``.
    """
    return AzureSearch(
        azure_search_endpoint=SEARCH_ENDPOINT,
        azure_search_key=SEARCH_KEY,
        index_name=SEARCH_INDEX,
        embedding_function=embeddings.embed_query,
        search_type="similarity",
    )

In [20]:
vs = make_vectorstore()

In [21]:
vs.add_documents(chunks_generated)

['NmVhYzBhNGQtN2NlMi00MzVhLTk3NjEtMjljMTgyMjc0Yzkz',
 'NWExODliMzMtMTA0OC00OWUzLTk1ZGEtYjhjZWZjZDE1Mjgx',
 'MzYxMDcwZjItNDc4OS00OTQ0LWI0MjAtOTdhZTAyNmM1ZDBl',
 'NDM1ODQwZWItYjRkYi00NDQxLThiMWQtOTk2M2I0MTgzOGVm',
 'NmM5ZDg5MzAtNjBkNy00MjliLWI2NjQtMzMwYTEzNmJlYmVl',
 'NGU1NGY0MjItYzg3MS00ZTI3LThjOTAtZDYzYjkxMDZjM2Zh',
 'MDI0NWE4NjQtZDIzOS00NDMwLWJkYTItYThmMjFiNDE4OGQ5',
 'YjYzMjQ4YzAtMWZhZC00MjczLTg5N2MtNzkwMDYyYmRhNmUx',
 'ODEwYjUzNjgtMzgzOC00YjFhLTg1OTktM2Y4ZWUyODFmODFl',
 'MzM5ZDAzNzktYWExMy00ZjNkLWI2NmUtZGM2ZDEwYzUwODEz',
 'NjkzYjdmYjktOTE2Zi00Y2Q2LTgwODgtZmI2ZjE1ZWM5MTdh',
 'M2Q5NGNlOTktZjQ2NC00Njk1LWFhZGQtZmI3YmY4NzZjNTU2',
 'ZjE5YzY3NDktY2Q5MS00YjA0LWExNWItNjM4NTBiMDI3ZDFj',
 'Mzc0NzlhOWYtMjU5Ni00ZDJlLThmN2MtMGE3NmVhYmY1Njg1',
 'MTEzMmMxNTMtMWM3MC00NmY1LThmM2UtMGYyMDcxOTg1YzBm',
 'MDhjMWNhN2ItNDYxZi00MWY3LTk3MzktYmEwOTQ0Y2FiMWFi',
 'N2VjZjE4MWItMzdlOC00NjE3LTllMGMtM2Y5ZjIwNzUyYmEy',
 'MmZmYzM1NzktMzA1ZS00MmI4LTk2NzQtYjc4NWExMThmNzEz',
 'MWE2ZTIxOTUtNjM1NC00ZTM5LWIxODYtZWM0MzczODM5

Here are 10 retrieval-friendly queries you can run against your GDPR vector store (each targets a specific article/topic present in your CSV):

1. What are the lawful bases for processing under GDPR Article 6?
2. What activities are covered by the GDPR’s territorial scope in Article 3, including non-EU controllers targeting people in the EU?
3. List the core principles for processing personal data in GDPR Article 5.
4. What conditions must be met for valid consent, and how can consent be withdrawn, according to Article 7?
5. When is a Data Protection Impact Assessment required under Article 35, and what does it need to consider?
6. Within how many hours must a controller notify the supervisory authority of a personal data breach under Article 33, and what details must be included?
7. When must a controller communicate a personal data breach to data subjects under Article 34?
8. What information must be provided to data subjects when data are collected directly from them under Article 13?
9. What contractual requirements apply to processors and sub-processors under Article 28?
10. What are the administrative fine tiers and assessment criteria in GDPR Article 83?


In [32]:
docs_queried = vs.similarity_search_with_relevance_scores(
    query="What are the lawful bases for processing under GDPR Article 6?",
    k=5,
)

In [33]:
print(dir(docs_queried[0][0]))

['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_computed_fields__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_on_complete__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_model__', '__pydantic_serializer__', '__pydantic_setattr_handlers__', '__pydantic_

In [34]:
print(docs_queried[0][0].page_content)

## Lawfulness of processing

Processing shall be lawful only if and to the extent that at least one of the following applies:  the data subject has given consent to the processing of his or her personal data for one or more specific purposes;

Processing shall be lawful only if and to the extent that at least one of the following applies: processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract;

Processing shall be lawful only if and to the extent that at least one of the following applies: processing is necessary for compliance with a legal obligation to which the controller is subject;

Processing shall be lawful only if and to the extent that at least one of the following applies: processing is necessary in order to protect the vital interests of the data subject or of another natural person;


In [35]:
print(docs_queried[1][0].page_content)

Processing shall be lawful only if and to the extent that at least one of the following applies: processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller;

Processing shall be lawful only if and to the extent that at least one of the following applies: processing is necessary for the purposes of the legitimate interests pursued by the controller or by a third party, except where such interests are overridden by the interests or fundamental rights and freedoms of the data subject which require protection of personal data, in particular where the data subject is a child. This shall not apply to processing carried out by public authorities in the performance of their tasks.


In [36]:
print(docs_queried[4][0].page_content)

## Processing of personal data relating to criminal convictions and offences

Processing of personal data relating to criminal convictions and offences or related security measures based on Article 6(1) shall be carried out only under the control of official authority or when the processing is authorised by Union or Member State law providing for appropriate safeguards for the rights and freedoms of data subjects. Any comprehensive register of criminal convictions shall be kept only under the control of official authority.

[Recitals: —]


In [38]:
# Experiment: ask the Article 6 question and attach only the top-ranked
# retrieved chunk (docs_queried[0]) to the conversation.
deployment = CHAT_DEPLOYMENT
response = chat_client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that helps GDPR Compliance related topics.",
        },
        {
            "role": "user",
            "content": "What are the lawful bases for processing under GDPR Article 6?",
        },
        # NOTE(review): the retrieved chunk is sent as an assistant-role turn
        # placed after the question — presumably meant as grounding context;
        # confirm that this role is intended.
        {
            "role": "assistant", 
            "content": docs_queried[0][0].page_content
        }
    ],
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
    model=deployment
)

print(response.choices[0].message.content)

Under **Article 6 of the GDPR**, personal data processing must have a lawful basis, and at least one of the following six conditions must apply:

### 1. **Consent** (Article 6(1)(a)):
   - The data subject has freely given their **specific, informed, and unambiguous consent** for the processing of their personal data for one or more specific purposes.
   - Consent must be explicit when processing sensitive or special category data (e.g., health data).
   - Data subjects have the right to withdraw their consent at any time.

### 2. **Contractual Necessity** (Article 6(1)(b)):
   - Processing is necessary for the **performance of a contract** to which the data subject is a party, or to take steps at the request of the data subject **prior to entering into a contract**.
   - Example: To process data for delivering a product purchased online or providing a service.

### 3. **Legal Obligation** (Article 6(1)(c)):
   - Processing is necessary for compliance with a **legal obligation** impose

In [39]:
# Baseline for comparison: the same question with only the system and user
# messages — no retrieved chunk is attached.
deployment = CHAT_DEPLOYMENT
response = chat_client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that helps GDPR Compliance related topics.",
        },
        {
            "role": "user",
            "content": "What are the lawful bases for processing under GDPR Article 6?",
        }
    ],
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
    model=deployment
)

print(response.choices[0].message.content)

Under Article 6 of the **General Data Protection Regulation (GDPR)**, there are six lawful bases for processing personal data. Organizations must identify and document at least one of these before processing personal data to ensure compliance. The lawful bases are:

1. **Consent** (Article 6(1)(a)):  
   The data subject has explicitly consented to the processing of their personal data for one or more specific purposes. Consent must be freely given, specific, informed, and unambiguous, and there must be clear evidence of the data subject's agreement.

2. **Contract** (Article 6(1)(b)):  
   Processing is necessary for the performance of a contract to which the data subject is a party, or to take steps at the request of the data subject prior to entering into a contract. For example, processing may be required to deliver goods or fulfill contractual obligations.

3. **Legal Obligation** (Article 6(1)(c)):  
   Processing is necessary for compliance with a legal obligation to which the d

In [21]:
# ---------- Indexing ----------
def build_or_refresh_index(chunks: List[Document]) -> AzureSearch:
    """Create the vector store, upload ``chunks`` to it, and return the store."""
    store = make_vectorstore()
    # Each chunk is embedded through the store's embedding_function on upload.
    store.add_documents(chunks)
    return store

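# Wrong query

The next cells retrieve context for the Article 83 (administrative fines) question but then ask the model the Article 6 question, so the supplied context does not match the question being asked.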

In [41]:
docs_queried = vs.similarity_search_with_relevance_scores(
    query="What are the administrative fine tiers and assessment criteria in GDPR Article 83?",
    k=5,
)
print(docs_queried[0][0].page_content)

## General conditions for imposing administrative fines

Each supervisory authority shall ensure that the imposition of administrative fines pursuant to Article 83 in respect of infringements of this Regulation referred to in Article 83(4)-83(6) shall in each individual case be effective, proportionate and dissuasive.

Administrative fines shall, depending on the circumstances of each individual case, be imposed in addition to, or instead of, measures referred to in  Article 58(2). When deciding whether to impose an administrative fine and deciding on the amount of the administrative fine in each individual case due regard shall be given to the following: the nature, gravity and duration of the infringement taking into account the nature scope or purpose of the processing concerned as well as the number of data subjects affected and the level of damage suffered by them;


In [43]:
# NOTE(review): if the cells run in notebook order, docs_queried here holds the
# hits for the Article 83 fines query from the previous cell, while the
# question sent below is about Article 6 — the mismatch looks deliberate
# (see the "Wrong query" heading); confirm.
deployment = CHAT_DEPLOYMENT
response = chat_client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that helps GDPR Compliance related topics.",
        },
        {
            "role": "user",
            "content": "What are the lawful bases for processing under GDPR Article 6?",
        },
        # The top retrieved chunk is sent as an assistant-role turn after the question.
        {
            "role": "assistant", 
            "content": docs_queried[0][0].page_content
        }
    ],
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
    model=deployment
)

# NOTE(review): import sits mid-notebook; a later cell also calls pprint and
# therefore relies on this cell having been run first.
from pprint import pprint
# Prints the whole response object first, then only the message text.
pprint(response)
pprint(response.choices[0].message.content)

ChatCompletion(id='chatcmpl-CXlFYs1c2Rcf5OjYZH7wq4EqEooui', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Under GDPR Article 6, lawful bases for processing personal data are the foundational principles organizations must rely on to ensure their data processing activities are compliant with the regulation. There are six lawful bases listed in Article 6(1) that justify the processing of personal data. These are:\n\n1. **Consent (Article 6(1)(a)):**  \n   The data subject has given explicit and freely-given consent for their data to be processed for a specific purpose. Consent must be clear and informed, with individuals having the ability to withdraw consent at any time.\n\n2. **Contractual necessity (Article 6(1)(b)):**  \n   Processing is necessary to perform a contract with the data subject or to take steps prior to entering into a contract (e.g., fulfilling orders, providing services, or handling contracts).\n\n3. **Legal obligat

In [44]:
pprint(response.choices[0].message.content)

('Under GDPR Article 6, lawful bases for processing personal data are the '
 'foundational principles organizations must rely on to ensure their data '
 'processing activities are compliant with the regulation. There are six '
 'lawful bases listed in Article 6(1) that justify the processing of personal '
 'data. These are:\n'
 '\n'
 '1. **Consent (Article 6(1)(a)):**  \n'
 '   The data subject has given explicit and freely-given consent for their '
 'data to be processed for a specific purpose. Consent must be clear and '
 'informed, with individuals having the ability to withdraw consent at any '
 'time.\n'
 '\n'
 '2. **Contractual necessity (Article 6(1)(b)):**  \n'
 '   Processing is necessary to perform a contract with the data subject or to '
 'take steps prior to entering into a contract (e.g., fulfilling orders, '
 'providing services, or handling contracts).\n'
 '\n'
 '3. **Legal obligation (Article 6(1)(c)):**  \n'
 '   Processing is necessary to comply with a legal obligatio

In [46]:
# Retrieve the chunks most similar to the question
query = "What are the lawful bases for processing under GDPR Article 6?"

docs_queried = vs.similarity_search_with_relevance_scores(
    query=query,
    k=3,  # ground the answer on the top 3 hits rather than only the best one
)

# Merge all retrieved chunks into one text block. Each chunk is prefixed with
# the article id/title from its metadata: continuation chunks do not contain
# the article heading, so without the prefix the context gives the model
# nothing to cite.
retrieved_context = "\n\n---\n\n".join(
    (
        f"[article_id: {hit[0].metadata.get('article_id', 'unknown')} | "
        f"title: {hit[0].metadata.get('article_title', 'unknown')}]\n"
        f"{hit[0].page_content}"
    )
    for hit in docs_queried
)

deployment = CHAT_DEPLOYMENT

response = chat_client.chat.completions.create(
    model=deployment,
    max_tokens=1024,
    temperature=0.2,  # lower temp = less hallucination
    top_p=1.0,
    messages=[
        {
            "role": "system",
            "content": (
                "You are a GDPR Compliance Expert. Answer ONLY using the provided context.\n"
                "If the answer is not explicitly stated in the context, respond with:\n"
                "'The answer is not found in the provided GDPR dataset.'\n\n"
                "CRITICAL RULES:\n"
                "1. Use only facts from the context below (do NOT use external knowledge).\n"
                "2. Include citations by referencing Article or Recital numbers if present.\n"
                "3. Do NOT hallucinate, guess, or add extra info not present in the context.\n"
                "4. Be concise, accurate, and compliant.\n"
            ),
        },
        {
            "role": "user",
            "content": (
                f"Context:\n{retrieved_context}\n\n"
                f"Question: {query}\n\n"
                "Provide the answer with relevant citations."
            ),
        },
    ],
)

print(response.choices[0].message.content)


The lawful bases for processing under GDPR Article 6 are:

1. **Consent**: The data subject has given consent to the processing of their personal data for one or more specific purposes (Article 6(1)(a)).
2. **Contract**: Processing is necessary for the performance of a contract to which the data subject is party or to take steps at the request of the data subject prior to entering into a contract (Article 6(1)(b)).
3. **Legal obligation**: Processing is necessary for compliance with a legal obligation to which the controller is subject (Article 6(1)(c)).
4. **Vital interests**: Processing is necessary to protect the vital interests of the data subject or another natural person (Article 6(1)(d)).
5. **Public interest or official authority**: Processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller (Article 6(1)(e)).
6. **Legitimate interests**: Processing is necessary for the purposes of th

In [47]:
# Out-of-scope probe: the question refers to provisions that are not in the
# dataset, to check that the grounded prompt refuses instead of guessing.
query = "What are the lawful bases for processing under GDPR Article 1000, section 18, of IPC 800 and EU Act 12?"

docs_queried = vs.similarity_search_with_relevance_scores(
    query=query,
    k=3,  # ground the answer on the top 3 hits rather than only the best one
)

# Merge all retrieved chunks into one text block. Each chunk is prefixed with
# the article id/title from its metadata: continuation chunks do not contain
# the article heading, so without the prefix the context gives the model
# nothing to cite.
retrieved_context = "\n\n---\n\n".join(
    (
        f"[article_id: {hit[0].metadata.get('article_id', 'unknown')} | "
        f"title: {hit[0].metadata.get('article_title', 'unknown')}]\n"
        f"{hit[0].page_content}"
    )
    for hit in docs_queried
)

deployment = CHAT_DEPLOYMENT

response = chat_client.chat.completions.create(
    model=deployment,
    max_tokens=1024,
    temperature=0.2,  # lower temp = less hallucination
    top_p=1.0,
    messages=[
        {
            "role": "system",
            "content": (
                "You are a GDPR Compliance Expert. Answer ONLY using the provided context.\n"
                "If the answer is not explicitly stated in the context, respond with:\n"
                "'The answer is not found in the provided GDPR dataset.'\n\n"
                "CRITICAL RULES:\n"
                "1. Use only facts from the context below (do NOT use external knowledge).\n"
                "2. Include citations by referencing Article or Recital numbers if present.\n"
                "3. Do NOT hallucinate, guess, or add extra info not present in the context.\n"
                "4. Be concise, accurate, and compliant.\n"
            ),
        },
        {
            "role": "user",
            "content": (
                f"Context:\n{retrieved_context}\n\n"
                f"Question: {query}\n\n"
                "Provide the answer with relevant citations."
            ),
        },
    ],
)

print(response.choices[0].message.content)


The answer is not found in the provided GDPR dataset.
