<a href="https://colab.research.google.com/github/Amgad-Abdelkhaleq/GDPR-RAG/blob/main/GDPR_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#installation

In [2]:
!pip install langchain langchain_community qdrant-client pypdf transformers sentence-transformers langchain_qdrant




#GDPR RAG

In [None]:
import os
from typing import List, Dict, Optional
from dataclasses import dataclass
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from transformers import pipeline
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import os
from langchain_community.chat_models.sambanova import ChatSambaNovaCloud
import re
from qdrant_client.http.models import Filter, FieldCondition, MatchAny


@dataclass
class GDPRArticle:
    article_number: int
    content: str
    summary: str

class PDFExtractor:
    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path

    def extract(self) -> List[str]:
        loader = PyPDFLoader(self.pdf_path)
        pages = loader.load()
        return [page.page_content for page in pages]

class ArticleParser:
    def __init__(self, article_summaries: Dict[int, str]):
        self.article_summaries = article_summaries

    def parse_articles(self, raw_text: List[str]) -> List[GDPRArticle]:
        combined_text = " ".join(raw_text)
        articles = []

        # Split by "Article X" pattern
        for i in range(1, 22):
            start_marker = f"Article {i}"
            end_marker = f"Article {i+1}" if i < 21 else ""

            start_idx = combined_text.find(start_marker)
            end_idx = combined_text.find(end_marker) if end_marker else len(combined_text)

            if start_idx != -1:
                content = combined_text[start_idx:end_idx].strip()
                articles.append(GDPRArticle(
                    article_number=i,
                    content=content,
                    summary=self.article_summaries.get(i, "")
                ))

        print(f"num of articles is: {len(articles)}")
        return articles

class DocumentPreprocessor:
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def split_articles(self, articles: List[GDPRArticle]) -> List[Dict]:
        documents = []
        for article in articles:
            chunks = self.text_splitter.split_text(article.content)
            for chunk in chunks:
                documents.append({
                    "text": chunk,
                    "metadata": {
                        "article_number": article.article_number,
                        "article_summary": article.summary
                    }
                })
        return documents

class VectorStore:
    def __init__(self, initialize_client=True):
        self.embeddings = HuggingFaceEmbeddings(model_name="microsoft/mdeberta-v3-base")
        if initialize_client:
            self.client = QdrantClient(path="/content/drive/MyDrive/qdrant_storage_copy")
        self.collection_name = "gdpr_articles"

    def create_collection(self):
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(size=768, distance=Distance.COSINE)
        )

    def index_documents(self, documents: List[Dict]):
        texts = [doc["text"] for doc in documents]
        metadatas = [doc["metadata"] for doc in documents]

        vector_store = QdrantVectorStore(
        client=self.client,
        collection_name=self.collection_name,
        embedding=self.embeddings,
        ).add_texts(
            texts=texts,
            metadatas=metadatas,
        )

        return vector_store

class ArticleClassifier:
    def __init__(self):
        self.classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli"
        )

    def predict_relevant_articles(self, question: str, article_summaries: Dict[int, str]) -> List[int]:
        # Create candidate labels using both article numbers and summaries
        candidates = []
        for article_num, summary in article_summaries.items():
            # Format: "Article X: summary"
            candidate = f"Article {article_num}: {summary}"
            candidates.append({
                'full_label': candidate,
                'article_num': article_num
            })

        # Perform classification with detailed labels
        results = self.classifier(
            question,
            candidate_labels=[c['full_label'] for c in candidates],
            hypothesis_template="This question is related to {}, and its contents.",
            multi_label=True
        )
        print(f"Article selection result: {results}")


        # Results are already sorted, just filter by threshold and take top 3
        top_articles = [int(re.search(r"Article (\d+)", article_label).group(1)) for article_label in results['labels'] if re.search(r"Article (\d+)", article_label)]

        return top_articles[0:3]

class RAGSystem:
    def __init__(self, vector_store: QdrantVectorStore, article_classifier: ArticleClassifier):
        self.vector_store = vector_store
        self.article_classifier = article_classifier
        self.llm = ChatSambaNovaCloud(
                sambanova_api_key="b8eb270b-778f-457d-8b2f-5f957df86b51",
                model="Meta-Llama-3.1-8B-Instruct",
                max_tokens=1024,
                temperature=0.7,
                top_k=50,
                top_p=1.0,
                repetition_penalty = 1.0,
        )

        self.qa_prompt = PromptTemplate(
            template="""You are a helpful assistant that answers questions about GDPR articles.
            Use the following pieces of context to answer the question at the end.
            If you don't know the answer based on the context, say that you don't know.

            Context: {context}

            Question: {question}

            Answer:""",
            input_variables=["context", "question"]
        )

    def answer_question(self, question: str, articles_summaries:Dict[int, str]) -> str:
        # Predict relevant articles
        relevant_articles = self.article_classifier.predict_relevant_articles(question, articles_summaries)
        print(f"relevant articles: {relevant_articles}")

        # Create a structured filter that always exists
        article_filter = Filter(
            must=[
                FieldCondition(
                    key="article_number",
                    match=MatchAny(any=relevant_articles)
                )
            ]
        )

        search_kwargs = {
            "k": 3,
            "filter": article_filter
        }

        retriever = self.vector_store.as_retriever(search_kwargs=search_kwargs)

        # Create QA chain
        print("creating QA chain")
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs={"prompt": self.qa_prompt}
        )

        return qa_chain.run(question)

def main(mode=None):
  # Article summaries
  ARTICLE_SUMMARIES = {
        1: "Subject-matter and objectives: Outlines GDPR's purpose to protect individuals' rights regarding personal data.",
        2:"Material scope: Specifies data processing activities under GDPR.",
        3:"Territorial scope: Defines geographical applicability of GDPR.",
        4:"Definitions: Provides definitions for key GDPR terms.",
        5:"Principles relating to processing: Core principles for data processing.",
        6:"Lawfulness of processing: Specifies lawful bases for data processing.",
        7:"Conditions for consent: Details conditions for valid consent.",
        8:"Child's consent: Sets age of consent for information society services.",
        9:"Special categories of data: Prohibits processing of sensitive data.",
        10:"Criminal data processing: Requires legal basis for such data.",
        11:"Non-identification processing: Covers non-identifiable data scenarios.",
        12:"Transparent information: Obligates concise, accessible communication.",
        13:"Direct collection info: Information required when collecting directly.",
        14:"Indirect collection info: Information required for indirect collection.",
        15:"Right of access: Grants access rights to personal data.",
        16:"Right to rectification: Right to correct inaccurate data.",
        17:"Right to erasure: Allows data erasure under conditions.",
        18:"Right to restriction: Restrict processing under certain conditions.",
        19:"Notification obligation: Notify recipients about data changes.",
        20:"Data portability: Right to transfer data in a machine-readable format.",
        21:"Right to object: Object to processing for certain purposes."
    }
  if mode == None:
    # Initialize components
    pdf_extractor = PDFExtractor("/content/drive/MyDrive/GDPR Art 1-21.pdf")
    article_parser = ArticleParser(ARTICLE_SUMMARIES)
    doc_preprocessor = DocumentPreprocessor()
    vector_store = VectorStore()
    article_classifier = ArticleClassifier()

    # Process PDF and create vector store
    raw_text = pdf_extractor.extract()
    print("pdf extraction completed")
    articles = article_parser.parse_articles(raw_text)
    print("parse atrticles completed")
    documents = doc_preprocessor.split_articles(articles)
    print(f"preprocessing completed: {len(documents)}")

    vector_store.create_collection()
    qdrant_store = vector_store.index_documents(documents)
    print("indexing compeleted")

  elif mode == "inference":
    print("Inside inference mode")
    vector_store = VectorStore(initialize_client=False)
    qdrant_store = QdrantVectorStore.from_existing_collection(
    embedding=vector_store.embeddings,
    collection_name=vector_store.collection_name,
    path="/content/drive/MyDrive/qdrant_storage",
    )
    article_classifier = ArticleClassifier()


  # Initialize RAG system
  rag_system = RAGSystem(qdrant_store, article_classifier)

  # Interactive console
  print("GDPR Articles RAG System")
  print("Enter your questions about GDPR articles (type 'exit' to quit)")

  while True:
      question = input("\nYour question: ")
      if question.lower() == 'exit':
          break

      answer = rag_system.answer_question(question, ARTICLE_SUMMARIES)
      print("\nAnswer:", answer)

if __name__ == "__main__":
    main("inference")

Inside inference mode


  self.embeddings = HuggingFaceEmbeddings(model_name="microsoft/mdeberta-v3-base")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


GDPR Articles RAG System
Enter your questions about GDPR articles (type 'exit' to quit)

Your question: What are the main principles for processing personal data?
Article selection result: {'sequence': 'What are the main principles for processing personal data?', 'labels': ['Article 6: Lawfulness of processing: Specifies lawful bases for data processing.', 'Article 5: Principles relating to processing: Core principles for data processing.', 'Article 18: Right to restriction: Restrict processing under certain conditions.', 'Article 13: Direct collection info: Information required when collecting directly.', 'Article 11: Non-identification processing: Covers non-identifiable data scenarios.', 'Article 2: Material scope: Specifies data processing activities under GDPR.', 'Article 14: Indirect collection info: Information required for indirect collection.', 'Article 12: Transparent information: Obligates concise, accessible communication.', "Article 1: Subject-matter and objectives: Outlin

  return qa_chain.run(question)



Answer: The main principles for processing personal data, as outlined in the General Data Protection Regulation (GDPR), are:

1.  **Lawfulness, Fairness, and Transparency** (Article 5(1)(a)): Personal data shall be processed lawfully, fairly, and in a transparent manner.
2.  **Purpose Limitation** (Article 5(1)(b)): Personal data shall be collected for specific, explicit, and legitimate purposes.
3.  **Data Minimization** (Article 5(1)(c)): Personal data shall be adequate, relevant, and limited to what is necessary for the purposes for which it is processed.
4.  **Accuracy** (Article 5(1)(d)): Personal data shall be accurate and, where necessary, kept up to date.
5.  **Storage Limitation** (Article 5(1)(e)): Personal data shall not be kept for longer than necessary for the purposes for which it is processed.
6.  **Integrity and Confidentiality** (Article 5(1)(f)): Personal data shall be processed in a way that ensures their confidentiality, integrity, and availability.

These principl