# Load and reading the data from the GDPR

In [1]:
!pip install -U langchain-community
!pip install sentence_transformers
!pip install pinecone-client

Collecting langchain-community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.7 (from langchain-community)
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.17 (from langchain-community)
  Downloading langchain_core-0.3.18-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-community)
  Downloading langsmith-0.1.143-py3-none-any.whl.metadata (13 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain<0.4.0,>=0.3.7->langchain-community)
  Downloading langchain_text_splitters-0.3.2-py3-none-any.whl.metadata (2.3 kB)
Collecting packaging<25,>=23.2 (from langchain

In [2]:
!pip install pinecone-client[grpc]
!pip install nltk
!pip install beautifulsoup4

Collecting protobuf<5.0,>=4.25 (from pinecone-client[grpc])
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting protoc-gen-openapiv2<0.0.2,>=0.0.1 (from pinecone-client[grpc])
  Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl (7.9 kB)
Installing collected packages: protobuf, protoc-gen-openapiv2
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.

In [3]:
import pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Initialize Pinecone with your API key and environment
api_key = "6ca3b069-c0d4-460e-8d64-9f545902b7c6"  # Replace with your actual Pinecone API key
environment = "us-east-1"  # Your Pinecone environment

# Initialize Pinecone client
pc = Pinecone(api_key=api_key)

# Define the index name and dimension
index_name = "chunk-embeddings-index"
dimension = 768  # Ensure this matches the dimension of your embeddings

# Create the index if it doesn't already exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region=environment
        ) 
    )

# Connect to the index
index = pc.Index(index_name)

In [36]:
import numpy as np
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModel
import torch



# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz")
embedding_model = AutoModel.from_pretrained("distilbert-base-uncased")
text_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Function to chunk text based on token limit
def chunk_text_based_on_tokens(text, max_tokens=300):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Function to extract sections and articles from HTML
def extract_sections_articles_chapters(soup):
    sections = []
    current_section = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section:
                sections.append(" ".join(current_section))
                current_section = []
            current_section.append(element.get_text())
        else:
            current_section.append(element.get_text())
    if current_section:
        sections.append(" ".join(current_section))
    return sections

# Function to load and process HTML
def load_and_process_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    sections = extract_sections_articles_chapters(soup)
    all_chunks = []
    for section in sections:
        all_chunks.extend(chunk_text_based_on_tokens(section))
    return all_chunks

# Function to generate embeddings
def generate_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()  # Use [CLS] token embedding
    return embedding.tolist()  # Convert to a list for Pinecone

# Function to upsert chunks into Pinecone with metadata
def upsert_chunks_to_pinecone(index, namespace, chunks):
    vectors = []
    for i, chunk in enumerate(chunks):
        embedding = generate_bert_embedding(chunk, text_tokenizer, embedding_model)
        vectors.append({
            "id": f"{namespace}_id_{i}",  # Unique ID for each chunk
            "values": embedding,
            "metadata": {"text": chunk}  # Attach the chunk text as metadata
        })
    # Upsert vectors to Pinecone
    index.upsert(vectors=vectors, namespace=namespace)
    print(f"Upserted {len(vectors)} vectors to namespace '{namespace}'.")

# Main Function to Automate Chunking and Uploading
def process_and_upload_html(file_path, namespace):
    print(f"Processing file: {file_path} for namespace: {namespace}")
    chunks = load_and_process_html(file_path)  # Step 1: Create chunks
    upsert_chunks_to_pinecone(index, namespace, chunks)  # Step 2: Upload to Pinecone

# Example Usage
html_files_and_namespaces = {
    "gdpr": "/kaggle/input/english-dataset/english_gdpr.html",
    "ai_act": "/kaggle/input/english-dataset/english_AI_act.html",
    "dma": "/kaggle/input/english-dataset/english_dma.html",
    "dsa": "/kaggle/input/english-dataset/english_dsa.html"
}

for namespace, file_path in html_files_and_namespaces.items():
    process_and_upload_html(file_path, namespace)





Processing file: /kaggle/input/english-dataset/english_gdpr.html for namespace: gdpr
Upserted 239 vectors to namespace 'gdpr'.
Processing file: /kaggle/input/english-dataset/english_AI_act.html for namespace: ai_act
Upserted 399 vectors to namespace 'ai_act'.
Processing file: /kaggle/input/english-dataset/english_dma.html for namespace: dma
Upserted 289 vectors to namespace 'dma'.
Processing file: /kaggle/input/english-dataset/english_dsa.html for namespace: dsa
Upserted 289 vectors to namespace 'dsa'.


In [7]:
from sentence_transformers import SentenceTransformer

def embed_chunks(chunks, model_name):
    # Use SentenceTransformer for generating embeddings
    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks, normalize_embeddings=True)
    return embeddings

def process_and_store_embeddings(file_path, namespace, model_name):
    chunks = load_and_process_html(file_path)
    embeddings = embed_chunks(chunks, model_name)

    # Prepare vectors for upsert with unique IDs and correct namespace
    vectors_to_upsert = [(f"id_{namespace}_{i}", embedding.tolist()) for i, embedding in enumerate(embeddings)]
    index.upsert(vectors=vectors_to_upsert, namespace=law)  # Specify the namespace for each law

    return index, chunks, embeddings

def embed_query(query, model_name):
    model = SentenceTransformer(model_name)
    query_embedding = model.encode([query], normalize_embeddings=True)
    return query_embedding[0]

def query_pinecone_db(query_embedding, index, namespace, top_k=1):
    results = index.query(vector=query_embedding, top_k=top_k, namespace=law)  # Include namespace for specific law
    return results


In [8]:
# File paths and index names for the different laws
laws_info = {
    'gdpr': {
        'file_path': '/kaggle/input/english-dataset/english_gdpr.html',
        'index_name': 'index_name',
        'query': "What are the key considerations for Member States when reconciling the right to freedom of expression and information with the right to the protection of personal data under this Regulation, and how should exemptions and derogations be applied in this context?"
    },
    'ai_act': {
        'file_path': '/kaggle/input/english-dataset/english_AI_act.html',
        'index_name': 'index_name',
        'query': "What are the implications of the proposed Regulation on the placement and use of high-risk AI systems with respect to existing Union laws, particularly in areas such as data protection, consumer rights, employment, and national labor laws?"
    },
    'dma': {
        'file_path': '/kaggle/input/english-dataset/english_dma.html',
        'index_name': 'index_name',
        'query': "What are the key steps and responsibilities of the Commission in addressing and remedying infringements by very large online platforms and search engines according to the text provided?"
    },
    'dsa': {
        'file_path': '/kaggle/input/english-dataset/english_dsa.html',
        'index_name': 'index_name',
        'query': "What distinguishes online platforms from other providers of hosting services according to the regulation, and why are cloud computing and web-hosting services generally not considered online platforms?"
    }
}

model_name = "distilbert-base-uncased"

# Process and store embeddings for each law
chunks_dict = {}
embeddings_dict = {}
import time

# Step 1: Check if the index exists and delete if it does
if index_name in pc.list_indexes().names():
    print(f"Deleting index '{index_name}'...")
    pc.delete_index(index_name)

    # Step 2: Wait for the index deletion to be processed
    print("Waiting for the index to be deleted...")
    time.sleep(5)  # Adjust the wait time if needed

    # Verify the index deletion
    if index_name not in pc.list_indexes().names():
        print(f"Index '{index_name}' successfully deleted.")
    else:
        print(f"Index '{index_name}' still exists, trying again...")
        pc.delete_index(index_name)
        time.sleep(5)  # Wait for another 5 seconds
        if index_name not in pc.list_indexes().names():
            print(f"Index '{index_name}' successfully deleted on second attempt.")
        else:
            print(f"Failed to delete index '{index_name}'. Please check your Pinecone dashboard.")
else:
    print(f"Index '{index_name}' does not exist or was already deleted.")

# Step 3: Recreate the index after ensuring deletion
pc.create_index(
    name=index_name,
    dimension=dimension,
    metric="cosine",
    spec=ServerlessSpec(
        cloud='aws', 
        region=environment
    )
)
print(f"Index '{index_name}' created successfully.")

# Step 4: Upsert your chunks into the index with unique IDs and namespace
for law, info in laws_info.items():
    print(f"Processing {law}...")
    index, chunks, embeddings = process_and_store_embeddings(info['file_path'], info['index_name'], model_name)
    chunks_dict[law] = chunks  # Store chunks
    embeddings_dict[law] = embeddings

    # Prepare embeddings with unique IDs for each namespace (law)
    vectors_to_upsert = [(f"{law}_id_{i}", embedding.tolist()) for i, embedding in enumerate(embeddings)]
    
    # Clear the namespace first (only if it exists)
    try:
        print(f"Clearing vectors for namespace '{law}'...")
        index.delete(delete_all=True, namespace=law)  # Use delete_all=True to clear all vectors in the namespace
        print(f"Namespace '{law}' cleared.")
    except Exception as e:
        print(f"Namespace '{law}' does not exist or an error occurred: {e}")
    
    # Upsert embeddings to Pinecone with the correct namespace
    try:
        print(f"Upserting vectors for namespace '{law}'...")
        index.upsert(vectors=vectors_to_upsert, namespace=law)
        print(f"Upsert complete for namespace '{law}'.")
    except Exception as e:
        print(f"Failed to upsert vectors for namespace '{law}': {e}")

print("Chunks Dict Content: ")
for key in chunks_dict.keys():
    print(f"Law: {key}, Number of Chunks: {len(chunks_dict[key])}")



Deleting index 'chunk-embeddings-index'...
Waiting for the index to be deleted...
Index 'chunk-embeddings-index' successfully deleted.
Index 'chunk-embeddings-index' created successfully.
Processing gdpr...


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Clearing vectors for namespace 'gdpr'...
Namespace 'gdpr' cleared.
Upserting vectors for namespace 'gdpr'...
Upsert complete for namespace 'gdpr'.
Processing ai_act...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Clearing vectors for namespace 'ai_act'...
Namespace 'ai_act' cleared.
Upserting vectors for namespace 'ai_act'...
Upsert complete for namespace 'ai_act'.
Processing dma...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Clearing vectors for namespace 'dma'...
Namespace 'dma' cleared.
Upserting vectors for namespace 'dma'...
Upsert complete for namespace 'dma'.
Processing dsa...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Clearing vectors for namespace 'dsa'...
Namespace 'dsa' cleared.
Upserting vectors for namespace 'dsa'...
Upsert complete for namespace 'dsa'.
Chunks Dict Content: 
Law: gdpr, Number of Chunks: 239
Law: ai_act, Number of Chunks: 399
Law: dma, Number of Chunks: 289
Law: dsa, Number of Chunks: 289


In [9]:
# Query each law and print results
for law, info in laws_info.items():
    print(f"\nQuerying {law.upper()} index:")
    query_embedding = embed_query(info['query'], model_name)
    print(info['query'])
    # Pass the 'namespace' parameter
    results = query_pinecone_db(query_embedding, index, namespace=law, top_k=1)  # Add 'namespace=law'
    
    if results and 'matches' in results and results['matches']:
        retrieved_chunk_id = results['matches'][0]['id']
        retrieved_chunk = chunks_dict[law][int(retrieved_chunk_id.split('_')[-1])]
        # Print the chunk number and law name
        print(f"Retrieved chunk {retrieved_chunk_id.split('_')[-1]} from {law.upper()}:")
        print(retrieved_chunk)
    else:
        print(f"No results found for {law.upper()}.")


Querying GDPR index:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

What are the key considerations for Member States when reconciling the right to freedom of expression and information with the right to the protection of personal data under this Regulation, and how should exemptions and derogations be applied in this context?
Retrieved chunk 91 from GDPR:
Where such exemptions or derogations differ from one Member State to another, the law of the Member State to which the controller is subject should apply. In order to take account of the importance of the right to freedom of expression in every democratic society, it is necessary to interpret notions relating to that freedom, such as journalism, broadly. (154) This Regulation allows the principle of public access to official documents to be taken into account when applying this Regulation. Public access to official documents may be considered to be in the public interest. Personal data in documents held by a public authority or a public body should be able to be publicly disclosed by that authority o

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

What are the implications of the proposed Regulation on the placement and use of high-risk AI systems with respect to existing Union laws, particularly in areas such as data protection, consumer rights, employment, and national labor laws?
Retrieved chunk 44 from AI_ACT:
AI systems identified as high-risk should be limited to those that have a significant harmful impact on the health, safety and fundamental rights of persons in the Union and such limitation should minimise any potential restriction to international trade. (47) AI systems could have an adverse impact on the health and safety of persons, in particular when such systems operate as safety components of products. Consistent with the objectives of Union harmonisation legislation to facilitate the free movement of products in the internal market and to ensure that only safe and otherwise compliant products find their way into the market, it is important that the safety risks that may be generated by a product as a whole due t

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

What are the key steps and responsibilities of the Commission in addressing and remedying infringements by very large online platforms and search engines according to the text provided?
Retrieved chunk 181 from DMA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.

Querying DSA index:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

What distinguishes online platforms from other providers of hosting services according to the regulation, and why are cloud computing and web-hosting services generally not considered online platforms?
Retrieved chunk 21 from DSA:
Likewise, services used for communications purposes, and the technical means of their delivery, have also evolved considerably, giving rise to online services such as Voice over IP, messaging services and web-based email services, where the communication is delivered via an internet access service. Those services, too, can benefit from the exemptions from liability, to the extent that they qualify as ‘mere conduit’, ‘caching’ or ‘hosting’ services. (29) Intermediary services span a wide range of economic activities which take place online and that develop continually to provide for transmission of information that is swift, safe and secure, and to ensure convenience of all participants of the online ecosystem. For example, ‘mere conduit’ intermediary services

In [10]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util

# Load BERT model and tokenizer for cosine similarity
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

# Load SentenceTransformer model for semantic similarity
semantic_model = SentenceTransformer('distilbert-base-uncased')

# Function to generate embeddings using BERT for cosine similarity
def generate_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :].numpy()  # [CLS] token embedding
    return embedding

# Function to calculate cosine similarity
def calculate_cosine_similarity(reference_embedding, retrieved_embedding):
    return cosine_similarity(reference_embedding.reshape(1, -1), retrieved_embedding.reshape(1, -1))[0][0]

# Function to calculate semantic similarity using Sentence-Transformers
def calculate_semantic_similarity(reference_text, retrieved_text, model):
    embeddings1 = model.encode(reference_text, convert_to_tensor=True)
    embeddings2 = model.encode(retrieved_text, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings1, embeddings2)
    return similarity.item()

# Reference answers for each law
reference_answers = {
    'gdpr': "Member States law should reconcile the rules governing freedom of expression and information, including journalistic, academic, artistic and or literary expression with the right to the protection of personal data pursuant to this Regulation. The processing of personal data solely for journalistic purposes, or for the purposes of academic, artistic or literary expression should be subject to derogations or exemptions from certain provisions of this Regulation if necessary to reconcile the right to the protection of personal data with the right to freedom of expression and information, as enshrined in Article 11 of the Charter. This should apply in particular to the processing of personal data in the audiovisual field and in news archives and press libraries. Therefore, Member States should adopt legislative measures which lay down the exemptions and derogations necessary for the purpose of balancing those fundamental rights. Member States should adopt such exemptions and derogations on general principles, the rights of the data subject, the controller and the processor, the transfer of personal data to third countries or international organisations, the independent supervisory authorities, cooperation and consistency, and specific data-processing situations. Where such exemptions or derogations differ from one Member State to another, the law of the Member State to which the controller is subject should apply. In order to take account of the importance of the right to freedom of expression in every democratic society, it is necessary to interpret notions relating to that freedom, such as journalism, broadly.",
    'ai_act': "Harmonised rules applicable to the placing on the market, the putting into service and the use of high-risk AI systems should be laid down consistently with Regulation (EC) No 765/2008 of the European Parliament and of the Council (7), Decision No 768/2008/EC of the European Parliament and of the Council (8) and Regulation (EU) 2019/1020 of the European Parliament and of the Council (9) (New Legislative Framework). The harmonised rules laid down in this Regulation should apply across sectors and, in line with the New Legislative Framework, should be without prejudice to existing Union law, in particular on data protection, consumer protection, fundamental rights, employment, and protection of workers, and product safety, to which this Regulation is complementary. As a consequence, all rights and remedies provided for by such Union law to consumers, and other persons on whom AI systems may have a negative impact, including as regards the compensation of possible damages pursuant to Council Directive 85/374/EEC (10) remain unaffected and fully applicable. Furthermore, in the context of employment and protection of workers, this Regulation should therefore not affect Union law on social policy and national labour law, in compliance with Union law, concerning employment and working conditions, including health and safety at work and the relationship between employers and workers. This Regulation should also not affect the exercise of fundamental rights as recognised in the Member States and at Union level, including the right or freedom to strike or to take other action covered by the specific industrial relations systems in Member States as well as the right to negotiate, to conclude and enforce collective agreements or to take collective action in accordance with national law. This Regulation should not affect the provisions aiming to improve working conditions in platform work laid down in a Directive of the European Parliament and of the Council on improving working conditions in platform work. Moreover, this Regulation aims to strengthen the effectiveness of such existing rights and remedies by establishing specific requirements and obligations, including in respect of the transparency, technical documentation and record-keeping of AI systems. Furthermore, the obligations placed on various operators involved in the AI value chain under this Regulation should apply without prejudice to national law, in compliance with Union law, having the effect of limiting the use of certain AI systems where such law falls outside the scope of this Regulation or pursues legitimate public interest objectives other than those pursued by this Regulation. For example, national labour law and law on the protection of minors, namely persons below the age of 18, taking into account the UNCRC General Comment No 25 (2021) on children’s rights in relation to the digital environment, insofar as they are not specific to AI systems and pursue other legitimate public interest objectives, should not be affected by this Regulation.",
    'dma': "Given the potential significant societal effects of an infringement of the additional obligations to manage systemic risks that solely apply to very large online platforms and very large online search engines and in order to address those public policy concerns, it is necessary to provide for a system of enhanced supervision of any action undertaken to effectively terminate and remedy infringements of this Regulation. Therefore, once an infringement of one of the provisions of this Regulation that solely apply to very large online platforms or very large online search engines has been ascertained and, where necessary, sanctioned, the Commission should request the provider of such platform or of such search engine to draw a detailed action plan to remedy any effect of the infringement for the future and communicate such action plan within a timeline set by the Commission, to the Digital Services Coordinators, the Commission and the Board. The Commission, taking into account the opinion of the Board, should establish whether the measures included in the action plan are sufficient to address the infringement, taking also into account whether adherence to relevant code of conduct is included among the measures proposed. The Commission should also monitor any subsequent measure taken by the provider of a very large online platform or of a very large online search engine concerned as set out in its action plan, taking into account also an independent audit of the provider. If following the implementation of the action plan the Commission still considers that the infringement has not been fully remedied, or if the action plan has not been provided or is not considered suitable, it should be able to use any investigative or enforcement powers pursuant to this Regulation, including the power to impose periodic penalty payments and initiating the procedure to disable access to the infringing service.",
    'dsa': "Considering the particular characteristics of the services concerned and the corresponding need to make the providers thereof subject to certain specific obligations, it is necessary to distinguish, within the broader category of providers of hosting services as defined in this Regulation, the subcategory of online platforms. Online platforms, such as social networks or online platforms allowing consumers to conclude distance contracts with traders, should be defined as providers of hosting services that not only store information provided by the recipients of the service at their request, but that also disseminate that information to the public at the request of the recipients of the service. However, in order to avoid imposing overly broad obligations, providers of hosting services should not be considered as online platforms where the dissemination to the public is merely a minor and purely ancillary feature that is intrinsically linked to another service, or a minor functionality of the principal service, and that feature or functionality cannot, for objective technical reasons, be used without that other or principal service, and the integration of that feature or functionality is not a means to circumvent the applicability of the rules of this Regulation applicable to online platforms. For example, the comments section in an online newspaper could constitute such a feature, where it is clear that it is ancillary to the main service represented by the publication of news under the editorial responsibility of the publisher. In contrast, the storage of comments in a social network should be considered an online platform service where it is clear that it is not a minor feature of the service offered, even if it is ancillary to publishing the posts of recipients of the service. For the purposes of this Regulation, cloud computing or web-hosting services should not be considered to be an online platform where dissemination of specific information to the public constitutes a minor and ancillary feature or a minor functionality of such services.Moreover, cloud computing services and web-hosting services, when serving as infrastructure, such as the underlying infrastructural storage and computing services of an internet-based application, website or online platform, should not in themselves be considered as disseminating to the public information stored or processed at the request of a recipient of the application, website or online platform which they host."
}


In [11]:
similarities = []

# Query each law and calculate similarities
for law, info in laws_info.items():
    print(f"\nQuerying {law.upper()} index:")
    query_embedding = embed_query(info['query'], model_name)
    print(info['query'])
    # Pass the 'namespace' parameter
    results = query_pinecone_db(query_embedding, index, namespace=law, top_k=1)  # Add 'namespace=law'
    
    if results and 'matches' in results and results['matches']:
        retrieved_chunk_id = results['matches'][0]['id']
        retrieved_chunk = chunks_dict[law][int(retrieved_chunk_id.split('_')[-1])]
        # Print the chunk number and law name
        print(f"Retrieved chunk {retrieved_chunk_id.split('_')[-1]} from {law.upper()}:")
        print(retrieved_chunk)

        # Calculate cosine similarity using BERT embeddings
        retrieved_embedding = generate_bert_embedding(retrieved_chunk, tokenizer, model)
        reference_embedding = generate_bert_embedding(reference_answers[law], tokenizer, model)
        cosine_sim = calculate_cosine_similarity(reference_embedding, retrieved_embedding)
        
        # Calculate semantic similarity using Sentence-Transformers model
        semantic_sim = calculate_semantic_similarity(reference_answers[law], retrieved_chunk, semantic_model)

        # Store the results
        similarities.append({
            'law': law,
            'retrieved_answer': retrieved_chunk,
            'cosine_similarity': cosine_sim,
            'semantic_similarity': semantic_sim
        })

        # Print the results for this law
        print(f"Cosine Similarity with reference answer: {cosine_sim:.4f}")
        print(f"Semantic Similarity with reference answer: {semantic_sim:.4f}")
        print("----\n")
    else:
        print(f"No results found for {law.upper()} in the query.")



Querying GDPR index:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

What are the key considerations for Member States when reconciling the right to freedom of expression and information with the right to the protection of personal data under this Regulation, and how should exemptions and derogations be applied in this context?
Retrieved chunk 91 from GDPR:
Where such exemptions or derogations differ from one Member State to another, the law of the Member State to which the controller is subject should apply. In order to take account of the importance of the right to freedom of expression in every democratic society, it is necessary to interpret notions relating to that freedom, such as journalism, broadly. (154) This Regulation allows the principle of public access to official documents to be taken into account when applying this Regulation. Public access to official documents may be considered to be in the public interest. Personal data in documents held by a public authority or a public body should be able to be publicly disclosed by that authority o

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9799
Semantic Similarity with reference answer: 0.9789
----


Querying AI_ACT index:




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

What are the implications of the proposed Regulation on the placement and use of high-risk AI systems with respect to existing Union laws, particularly in areas such as data protection, consumer rights, employment, and national labor laws?
Retrieved chunk 44 from AI_ACT:
AI systems identified as high-risk should be limited to those that have a significant harmful impact on the health, safety and fundamental rights of persons in the Union and such limitation should minimise any potential restriction to international trade. (47) AI systems could have an adverse impact on the health and safety of persons, in particular when such systems operate as safety components of products. Consistent with the objectives of Union harmonisation legislation to facilitate the free movement of products in the internal market and to ensure that only safe and otherwise compliant products find their way into the market, it is important that the safety risks that may be generated by a product as a whole due t

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9625
Semantic Similarity with reference answer: 0.9445
----


Querying DMA index:




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

What are the key steps and responsibilities of the Commission in addressing and remedying infringements by very large online platforms and search engines according to the text provided?
Retrieved chunk 181 from DMA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9333
Semantic Similarity with reference answer: 0.8974
----


Querying DSA index:




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

What distinguishes online platforms from other providers of hosting services according to the regulation, and why are cloud computing and web-hosting services generally not considered online platforms?
Retrieved chunk 21 from DSA:
Likewise, services used for communications purposes, and the technical means of their delivery, have also evolved considerably, giving rise to online services such as Voice over IP, messaging services and web-based email services, where the communication is delivered via an internet access service. Those services, too, can benefit from the exemptions from liability, to the extent that they qualify as ‘mere conduit’, ‘caching’ or ‘hosting’ services. (29) Intermediary services span a wide range of economic activities which take place online and that develop continually to provide for transmission of information that is swift, safe and secure, and to ensure convenience of all participants of the online ecosystem. For example, ‘mere conduit’ intermediary services

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9428
Semantic Similarity with reference answer: 0.9137
----



In [13]:
# Extend the laws_info dictionary to include multiple questions and answers for GDPR, AI Act, DMA, and DSA
integrated_questions_answers = [
    # Question 1 from GDPR
    {
        'law': 'gdpr',
        'question': "What is the fundamental right regarding the processing of personal data as per the Charter of Fundamental Rights of the European Union?",
        'answer': "The protection of natural persons in relation to the processing of personal data is a fundamental right. Article 8(1) of the Charter of Fundamental Rights of the European Union (‘the Charter’) and Article 16(1) of the Treaty on the Functioning of the European Union (TFEU) provide that everyone has the right to the protection of personal data concerning them. This Regulation is intended to contribute to the accomplishment of an area of freedom, security, and justice and of an economic union, to economic and social progress, to the strengthening and the convergence of the economies within the internal market, and to the well-being of natural persons."
    },
    # Question 1 from AI Act
    {
        'law': 'ai_act',
        'question': "What are the main objectives of the AI Act concerning the development and use of AI in the European Union?",
        'answer': "The AI Act aims to ensure that AI systems placed on the market and used in the Union are safe, respect existing law on fundamental rights and Union values, and do not undermine fundamental rights. The Act aims to establish a legal framework that addresses the risks posed by AI, in particular high-risk AI systems, and aims to enhance transparency, accountability, and trust in AI while promoting innovation and competitiveness."
    },
    # Question 1 from DMA
    {
        'law': 'dma',
        'question': "What criteria are used to define a 'gatekeeper' under the Digital Markets Act?",
        'answer': "A gatekeeper under the DMA is defined as a provider of core platform services that has a significant impact on the internal market, serves as an important gateway for business users to reach end users, and enjoys an entrenched and durable position in the market. The criteria include having a strong economic position, a large number of users, and control over an ecosystem that is difficult for other companies to contest."
    },
    # Question 1 from DSA
    {
        'law': 'dsa',
        'question': "What are the main responsibilities of online platforms under the Digital Services Act?",
        'answer': "Under the DSA, online platforms are responsible for taking effective measures to mitigate risks related to illegal content, ensure the safety of users, and protect fundamental rights. Platforms must implement mechanisms for reporting and removing illegal content, provide users with clear terms and conditions, and establish processes for handling complaints and appeals. Platforms that reach a significant number of users are also required to assess and mitigate systemic risks, such as the spread of disinformation and harmful content."
    },
    # Question 2 from GDPR
    {
        'law': 'gdpr',
        'question': "How does GDPR aim to balance the right to the protection of personal data with other fundamental rights?",
        'answer': "This Regulation respects all fundamental rights and observes the freedoms and principles recognized in the Charter as enshrined in the Treaties, in particular the respect for private and family life, home and communications, the protection of personal data, freedom of thought, conscience and religion, freedom of expression and information, freedom to conduct a business, the right to an effective remedy and to a fair trial, and cultural, religious and linguistic diversity. The right to the protection of personal data must be considered in relation to its function in society and be balanced against other fundamental rights, in accordance with the principle of proportionality."
    },
    # Question 2 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act propose to regulate high-risk AI systems?",
        'answer': "The AI Act classifies AI systems based on the risk they pose and subjects high-risk AI systems to strict requirements. High-risk AI systems include those used in critical infrastructure, education, employment, essential public and private services, law enforcement, and migration, asylum, and border control management. These systems must comply with requirements related to risk management, data governance, technical documentation, record-keeping, transparency, provision of information to users, human oversight, accuracy, and robustness. Providers of these systems must establish a quality management system and ensure continuous monitoring and post-market surveillance."
    },
    # Question 2 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA propose to regulate the behavior of gatekeepers in digital markets?",
        'answer': "The DMA imposes specific obligations on gatekeepers to prevent them from engaging in unfair practices that harm competition and consumers. This includes prohibiting gatekeepers from favoring their own services over those of competitors (self-preferencing), requiring them to allow interoperability with third-party services, and ensuring that they do not unfairly limit access to their platforms. Gatekeepers are also required to provide data portability, offer fair terms to business users, and ensure transparency in their operations."
    },
    # Question 2 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA aim to protect users from illegal content on digital platforms?",
        'answer': "The DSA aims to protect users from illegal content by requiring platforms to implement notice-and-action mechanisms, allowing users to report illegal content easily. Platforms must act expeditiously to remove or disable access to illegal content upon receiving a notice. The DSA also introduces obligations for platforms to cooperate with law enforcement and provide transparency reports on their content moderation activities. Platforms must take proactive measures to prevent the spread of illegal content and ensure that their algorithms do not promote harmful or illegal content."
    },
    # Question 3 from GDPR
    {
        'law': 'gdpr',
        'question': "What challenges have arisen due to technological developments and globalization in the context of personal data protection?",
        'answer': "Technological developments and globalization have brought new challenges for the protection of personal data. The scale of the collection and sharing of personal data has increased significantly. Technology allows both private companies and public authorities to make use of personal data on an unprecedented scale in order to pursue their activities. Natural persons increasingly make personal information available publicly and globally. Technology has transformed both the economy and social life, and should further facilitate the free flow of personal data within the Union and the transfer to third countries and international organizations, while ensuring a high level of the protection of personal data."
    },
    # Question 3 from AI Act
    {
        'law': 'ai_act',
        'question': "What responsibilities does the AI Act place on AI providers to ensure ethical AI practices?",
        'answer': "Providers of high-risk AI systems are responsible for ensuring that their systems comply with the requirements set out in the Act. This includes the obligation to conduct a conformity assessment before placing the system on the market, ensure the system undergoes proper testing, provide clear instructions and information to users, implement human oversight measures, and monitor the system throughout its lifecycle. Providers must also report serious incidents and malfunctions to the authorities."
    },
    # Question 3 from DMA
    {
        'law': 'dma',
        'question': "What are the key obligations imposed on gatekeepers by the DMA?",
        'answer': "The key obligations for gatekeepers under the DMA include prohibitions on combining personal data from different sources without user consent, restrictions on pre-installing software or apps, and requirements to allow business users access to data generated on their platform. Gatekeepers must also ensure that their platforms are open and interoperable with third-party services, and they are prohibited from using non-public data from their business users to compete against them."
    },
    # Question 3 from DSA
    {
        'law': 'dsa',
        'question': "What transparency requirements are imposed on online platforms by the DSA?",
        'answer': "The DSA imposes extensive transparency requirements on online platforms, including the obligation to publish transparency reports detailing the number of content removal actions, the reasons for these actions, and the outcomes of user appeals. Platforms must also disclose how their content moderation systems and recommendation algorithms work, including the criteria used to rank and display content. Users must be informed about the terms and conditions governing the use of the platform and any changes made to these terms. Additionally, platforms must provide clear information about the advertising they serve, including the identity of advertisers and the targeting criteria used."
    },
    # Question 4 from GDPR
    {
        'law': 'gdpr',
        'question': "How does the GDPR address the transfer of personal data to third countries or international organizations?",
        'answer': "The transfer of personal data to third countries or international organizations is allowed only where the conditions laid down in this Regulation are met, in order to ensure that the level of protection of natural persons guaranteed by this Regulation is not undermined. In any event, transfers to third countries and international organizations may only be carried out in full compliance with this Regulation. This Regulation is without prejudice to international agreements concluded between the Union and third countries regulating the transfer of personal data, including appropriate safeguards for the data subjects."
    },
    # Question 4 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act address transparency and accountability in AI systems?",
        'answer': "The AI Act mandates that AI systems, particularly high-risk ones, must be transparent and provide clear information about their purpose, capabilities, and limitations. Users should be able to understand how decisions are made by AI systems and what data is being processed. The Act requires that AI systems be designed with features that ensure accountability, including auditability, traceability of decisions, and the ability to provide explanations for decisions made by the AI."
    },
    # Question 4 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA aim to prevent unfair practices in the digital market?",
        'answer': "The DMA aims to prevent unfair practices by setting out clear rules for gatekeepers, including prohibitions on self-preferencing, restrictions on unfair terms and conditions for business users, and requirements for transparency in how they operate. The DMA also ensures that gatekeepers cannot use their dominant position to stifle competition or innovation by smaller firms. The European Commission is empowered to investigate and sanction gatekeepers that do not comply with these rules."
    },
    # Question 4 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA propose to handle the dissemination of harmful content?",
        'answer': "The DSA proposes to handle the dissemination of harmful content by requiring platforms to assess the risks associated with the dissemination of harmful or illegal content and to take appropriate measures to mitigate these risks. Platforms must implement safeguards to ensure that their algorithms do not promote harmful content, and they must provide users with tools to control the content they are exposed to. The DSA also encourages platforms to cooperate with trusted flaggers and fact-checkers to identify and address harmful content more effectively. In cases where platforms fail to mitigate risks adequately, they may be subject to regulatory action, including fines and other penalties."
    },
    # Question 5 from GDPR
    {
        'law': 'gdpr',
        'question': "What specific protections does GDPR offer to children regarding their personal data?",
        'answer': "Children merit specific protection with regard to their personal data, as they may be less aware of the risks, consequences, safeguards, and rights in relation to the processing of personal data. Such specific protection should, in particular, apply to the use of personal data of children for the purposes of marketing or creating personality or user profiles and the collection of personal data with regard to children when using services offered directly to a child. The consent of the holder of parental responsibility should not be necessary in the context of preventive or counselling services offered directly to a child."
    },
    # Question 5 from AI Act
    {
        'law': 'ai_act',
        'question': "What measures are suggested by the AI Act to protect fundamental rights in the deployment of AI technologies?",
        'answer': "The AI Act incorporates several measures to protect fundamental rights, such as requiring AI systems to be designed and used in a manner that is consistent with respect for human dignity, privacy, non-discrimination, and other fundamental rights. This includes embedding human oversight mechanisms, ensuring that AI systems do not lead to biased or discriminatory outcomes, and providing avenues for individuals to contest decisions made by AI systems that affect them significantly. The Act also promotes the development of codes of conduct and voluntary measures by providers to ensure that AI is used ethically and in alignment with societal values."
    },
    # Question 5 from DMA
    {
        'law': 'dma',
        'question': "What enforcement mechanisms are included in the DMA to ensure compliance by gatekeepers?",
        'answer': "The DMA includes robust enforcement mechanisms, such as the ability for the European Commission to impose fines of up to 10% of the gatekeeper’s total worldwide annual turnover for non-compliance. In cases of repeated infringements, the Commission can impose additional penalties, including structural remedies, such as the divestiture of businesses. The DMA also allows for periodic penalty payments to ensure that gatekeepers comply with the obligations and prohibitions set out in the regulation."
    },
    # Question 5 from DSA
    {
        'law': 'dsa',
        'question': "What measures does the DSA include to protect freedom of expression while combating illegal content?",
        'answer': "The DSA includes measures to protect freedom of expression by ensuring that any restrictions on content are necessary, proportionate, and legally justified. Platforms must provide users with clear explanations when content is removed or access is restricted, and users must have the right to appeal such decisions. The DSA also requires platforms to ensure that content moderation processes are fair and transparent, with safeguards in place to prevent the arbitrary removal of content. In addition, the DSA encourages platforms to develop codes of conduct in collaboration with stakeholders to balance the need to combat illegal content with the protection of free speech."
    },
    # Question 6 from GDPR
    {
        'law': 'gdpr',
        'question': "How does the GDPR define personal data, and what are some examples?",
        'answer': "Personal data under the GDPR is defined as any information relating to an identified or identifiable natural person (‘data subject’). Examples include a person’s name, identification number, location data, online identifier, or one or more factors specific to the physical, physiological, genetic, mental, economic, cultural, or social identity of that natural person. The definition is broad, capturing various forms of data that could be used to directly or indirectly identify an individual."
    },
    # Question 6 from AI Act
    {
        'law': 'ai_act',
        'question': "What categories of AI systems are considered high-risk under the AI Act?",
        'answer': "High-risk AI systems under the AI Act include those used in critical infrastructure (such as transport, energy, and water supply), educational and vocational training, employment and worker management, access to essential private and public services (such as credit scoring and social benefits), law enforcement (such as predictive policing), migration, asylum, and border control management, and administration of justice and democratic processes. These systems are subject to stringent requirements due to the significant risks they pose to fundamental rights and safety."
    },
    # Question 6 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA address the issue of self-preferencing by gatekeepers?",
        'answer': "The DMA specifically prohibits gatekeepers from engaging in self-preferencing practices, where they favor their own products or services over those of competitors on their platforms. This includes practices such as ranking their own products higher in search results or giving preferential access to data. The aim is to ensure a level playing field in digital markets, where competition is based on merit rather than the market power of the gatekeeper. The prohibition on self-preferencing is one of the key obligations imposed on gatekeepers to prevent anti-competitive behavior."
    },
    # Question 6 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA address the issue of content moderation on online platforms?",
        'answer': "The DSA requires online platforms to implement content moderation policies that are transparent, consistent, and aligned with fundamental rights. Platforms must establish clear terms and conditions for content moderation and provide users with detailed information on how content is assessed, removed, or restricted. The DSA also mandates that platforms implement mechanisms for users to appeal content moderation decisions, ensuring that users have the opportunity to contest unjustified removals or restrictions. These measures aim to create a fair and accountable content moderation system that respects freedom of expression while combating illegal content."
    },
    # Question 7 from GDPR
    {
        'law': 'gdpr',
        'question': "What is the legal basis for processing personal data under the GDPR?",
        'answer': "The GDPR outlines several legal bases for processing personal data, including: the data subject has given consent to the processing; processing is necessary for the performance of a contract to which the data subject is a party; processing is necessary for compliance with a legal obligation; processing is necessary to protect the vital interests of the data subject or another natural person; processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority; and processing is necessary for the purposes of the legitimate interests pursued by the controller or a third party, except where such interests are overridden by the interests or fundamental rights and freedoms of the data subject."
    },
    # Question 7 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act define 'AI system' and what technologies fall under this definition?",
        'answer': "The AI Act defines an 'AI system' as software that is developed with one or more of the techniques and approaches listed in the Act, such as machine learning, logic- and knowledge-based approaches, and statistical approaches. These systems can, for a given set of human-defined objectives, generate outputs such as content, predictions, recommendations, or decisions influencing the environments they interact with. The definition is broad and includes a variety of AI technologies, from simple algorithms to complex machine learning models."
    },
    # Question 7 from DMA
    {
        'law': 'dma',
        'question': "What are the criteria for identifying core platform services under the DMA?",
        'answer': "Core platform services under the DMA include a range of digital services that serve as important gateways for business users to reach end users. These services include online intermediation services, such as app stores and marketplaces, online search engines, social networking services, video-sharing platform services, number-independent interpersonal communication services, operating systems, cloud computing services, and advertising services. A service is considered a core platform service if it has a significant impact on the internal market and is an essential gateway for business users to access end users."
    },
    # Question 7 from DSA
    {
        'law': 'dsa',
        'question': "What obligations do very large online platforms (VLOPs) have under the DSA?",
        'answer': "VLOPs, defined as platforms with more than 45 million users in the EU, have additional obligations under the DSA due to their significant impact on society and public discourse. VLOPs must conduct annual risk assessments to identify and mitigate systemic risks, such as the dissemination of illegal content, disinformation, and harmful content. They are also required to provide greater transparency in their content recommendation algorithms, offer users more control over the content they see, and cooperate with authorities to prevent and address systemic risks. These obligations are intended to ensure that VLOPs operate in a manner that is safe, transparent, and respectful of fundamental rights."
    },
    # Question 8 from GDPR
    {
        'law': 'gdpr',
        'question': "What are the rights of data subjects under the GDPR?",
        'answer': "The GDPR grants data subjects several rights, including the right to be informed, the right of access, the right to rectification, the right to erasure (‘right to be forgotten’), the right to restrict processing, the right to data portability, the right to object to processing, and rights in relation to automated decision-making and profiling. These rights empower individuals to have control over their personal data and ensure transparency and accountability in data processing."
    },
    # Question 8 from AI Act
    {
        'law': 'ai_act',
        'question': "What obligations do users of high-risk AI systems have under the AI Act?",
        'answer': "Users of high-risk AI systems are required to operate the systems in accordance with the instructions provided by the AI system provider, monitor the operation of the AI system, and promptly report any serious incidents or malfunctions to the provider and the competent authorities. Users must also keep logs generated by the AI system, ensure that human oversight is maintained, and ensure that the AI system is used only for its intended purpose. Additionally, users are responsible for implementing measures to mitigate risks to fundamental rights and safety."
    },
    # Question 8 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA promote interoperability between digital services?",
        'answer': "The DMA promotes interoperability by requiring gatekeepers to ensure that their core platform services can interact with third-party services. This includes making available the necessary technical interfaces and documentation to allow for interoperability. The goal is to prevent gatekeepers from locking in users and business users to their platforms and to enable competition by allowing new entrants and smaller competitors to offer complementary or competing services. Interoperability is seen as a key measure to promote innovation and consumer choice in digital markets."
    },
    # Question 8 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA enhance the protection of minors online?",
        'answer': "The DSA includes specific provisions to enhance the protection of minors online, recognizing that children are particularly vulnerable to harmful content and practices. Platforms must implement measures to ensure that their services are safe for minors, including age-appropriate content moderation, parental controls, and restrictions on targeted advertising to minors. The DSA also requires platforms to provide clear and accessible information to minors and their parents about the risks associated with online activities and how to protect themselves. These measures are designed to create a safer online environment for children and to empower them and their guardians to make informed decisions."
    },
    # Question 9 from GDPR
    {
        'law': 'gdpr',
        'question': "How does the GDPR address data protection by design and by default?",
        'answer': "The GDPR requires data controllers to implement data protection by design and by default. This means that data protection measures must be integrated into the processing activities from the outset and that only personal data necessary for each specific purpose of the processing is processed. The controller must take appropriate technical and organizational measures, such as pseudonymization, to ensure that, by default, personal data is not made accessible to an indefinite number of people without the individual's consent."
    },
    # Question 9 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act address the use of biometric identification systems?",
        'answer': "The AI Act imposes strict regulations on the use of biometric identification systems, particularly those used in public spaces for law enforcement purposes. The use of real-time remote biometric identification systems in publicly accessible spaces is generally prohibited, with exceptions granted under specific conditions, such as preventing a terrorist attack, locating a missing child, or identifying a suspect of a serious crime. Even in these cases, the use must be authorized by judicial or other independent authorities, and subject to strict safeguards to protect fundamental rights."
    },
    # Question 9 from DMA
    {
        'law': 'dma',
        'question': "What obligations does the DMA impose on gatekeepers regarding data access and portability?",
        'answer': "The DMA imposes obligations on gatekeepers to provide business users and end users with access to the data generated through their interactions on the platform. This includes providing data in a structured, commonly used, and machine-readable format to facilitate data portability. Gatekeepers are also required to allow business users to access data that is necessary for the development and improvement of their own products and services. These obligations are intended to prevent gatekeepers from using their control over data to stifle competition and innovation."
    },
    # Question 9 from DSA
    {
        'law': 'dsa',
        'question': "What are the transparency obligations for online platforms regarding their algorithms?",
        'answer': "The DSA imposes transparency obligations on online platforms to provide clear and accessible information about how their algorithms work, particularly those used for content moderation, recommendation, and ranking. Platforms must explain the criteria and logic behind their algorithms, allowing users to understand how decisions are made and how content is presented to them. VLOPs have additional obligations to conduct algorithmic audits and to allow independent researchers to assess the impact of their algorithms on society. These transparency measures are intended to increase accountability and trust in the digital ecosystem."
    },
    # Question 10 from GDPR
    {
        'law': 'gdpr',
        'question': "What is the role of the Data Protection Officer (DPO) under the GDPR?",
        'answer': "The Data Protection Officer (DPO) is responsible for overseeing data protection strategies and ensuring compliance with GDPR requirements. The DPO must be appointed by public authorities and bodies, and by organizations that engage in regular and systematic monitoring of data subjects on a large scale or process special categories of data on a large scale. The DPO’s responsibilities include advising the organization on GDPR obligations, monitoring compliance, providing training to staff, conducting audits, and serving as the contact point for supervisory authorities and data subjects."
    },
    # Question 10 from AI Act
    {
        'law': 'ai_act',
        'question': "What are the requirements for conformity assessments under the AI Act?",
        'answer': "High-risk AI systems must undergo a conformity assessment before they can be placed on the market or put into service. This assessment involves evaluating whether the AI system meets the requirements set out in the AI Act, including risk management, data governance, transparency, human oversight, and accuracy. The assessment can be conducted by the provider or by a notified body, depending on the nature of the AI system. The conformity assessment must be documented, and the AI system must bear a CE marking indicating compliance with the regulation."
    },
    # Question 10 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA address the issue of tying and bundling practices by gatekeepers?",
        'answer': "The DMA prohibits gatekeepers from engaging in tying and bundling practices that require users to purchase or use additional services as a condition for accessing the gatekeeper's core platform service. For example, a gatekeeper cannot require users to install or use a specific app or service as a precondition for using their platform. The prohibition on tying and bundling is intended to prevent gatekeepers from leveraging their market power to extend their dominance into other markets and to ensure that users have the freedom to choose the services they want to use."
    },
    # Question 10 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA address the issue of disinformation and fake news on digital platforms?",
        'answer': "The DSA requires platforms, particularly VLOPs, to take proactive measures to combat the spread of disinformation and fake news. This includes implementing mechanisms to detect, assess, and mitigate the risks associated with disinformation, collaborating with independent fact-checkers, and providing users with accurate information and context. Platforms must also ensure that their content moderation and recommendation systems do not amplify or promote disinformation. The DSA promotes transparency by requiring platforms to report on their efforts to combat disinformation and to provide users with tools to identify and report false information."
    },
    # Question 11 from GDPR
    {
        'law': 'gdpr',
        'question': "What are the implications of the GDPR for cross-border data processing activities?",
        'answer': "The GDPR establishes a framework for cross-border data processing activities to ensure that data protection is consistent across the EU. Organizations that process personal data across multiple EU member states must designate a lead supervisory authority, which acts as the single point of contact for overseeing compliance. The GDPR also facilitates cooperation between supervisory authorities through mechanisms such as the consistency mechanism and the European Data Protection Board (EDPB)."
    },
    # Question 11 from AI Act
    {
        'law': 'ai_act',
        'question': "What role do national supervisory authorities play under the AI Act?",
        'answer': "National supervisory authorities are responsible for overseeing the implementation and enforcement of the AI Act within their respective jurisdictions. They are tasked with monitoring the compliance of AI systems with the Act's requirements, conducting inspections and investigations, and taking enforcement actions where necessary. These authorities also play a key role in coordinating with other national authorities and the European Commission to ensure a harmonized approach to AI regulation across the EU."
    },
    # Question 11 from DMA
    {
        'law': 'dma',
        'question': "What are the consequences for gatekeepers that fail to comply with the DMA?",
        'answer': "Gatekeepers that fail to comply with the obligations and prohibitions set out in the DMA face significant consequences, including fines of up to 10% of their total worldwide annual turnover. In cases of repeated non-compliance, the European Commission can impose additional measures, such as structural remedies, including the divestiture of parts of the business. The DMA also provides for periodic penalty payments to ensure that gatekeepers comply with the obligations on an ongoing basis. The enforcement of the DMA is designed to be robust to prevent gatekeepers from engaging in anti-competitive behavior."
    },
    # Question 11 from DSA
    {
        'law': 'dsa',
        'question': "What role do trusted flaggers play under the DSA?",
        'answer': "The DSA recognizes the role of trusted flaggers—entities with expertise in identifying illegal content—as important partners in content moderation. Trusted flaggers are granted priority in the notice-and-action mechanisms, meaning that their reports are processed more quickly and with higher accuracy. Platforms must ensure that trusted flaggers' reports are handled by experienced moderators and that they receive feedback on the actions taken. The designation of trusted flaggers is intended to improve the efficiency and effectiveness of content moderation, particularly in combating illegal content and harmful activities online."
    },
    # Question 12 from GDPR
    {
        'law': 'gdpr',
        'question': "How does the GDPR handle data breaches, and what are the obligations of data controllers in such cases?",
        'answer': "Under the GDPR, data controllers are required to report data breaches to the relevant supervisory authority within 72 hours of becoming aware of the breach, unless the breach is unlikely to result in a risk to the rights and freedoms of individuals. If the breach poses a high risk to the affected individuals, the data controller must also inform the data subjects without undue delay. The GDPR mandates that organizations implement appropriate technical and organizational measures to prevent data breaches and mitigate their impact."
    },
    # Question 12 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act encourage innovation while ensuring safety and compliance?",
        'answer': "The AI Act encourages innovation by providing regulatory sandboxes, which are controlled environments where AI developers can test their systems under the supervision of competent authorities without immediately facing the full regulatory requirements. These sandboxes allow for experimentation and development of innovative AI solutions while ensuring that safety, ethical, and legal standards are maintained. The Act also promotes the adoption of voluntary codes of conduct for non-high-risk AI systems, allowing providers to demonstrate their commitment to ethical AI practices."
    },
    # Question 12 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA enhance consumer protection in digital markets?",
        'answer': "The DMA enhances consumer protection by ensuring that gatekeepers do not engage in practices that harm consumers, such as self-preferencing, unfair terms and conditions, or limiting access to data. The DMA also promotes transparency in how gatekeepers operate, requiring them to provide clear and accessible information to consumers about their practices. Additionally, the DMA ensures that consumers have more choice and control over the digital services they use, by promoting interoperability and data portability. By fostering competition, the DMA aims to improve the quality and affordability of digital services for consumers."
    },
    # Question 12 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA promote the accountability of online platforms?",
        'answer': "The DSA promotes accountability by imposing rigorous reporting and transparency requirements on online platforms. Platforms must publish regular transparency reports detailing their content moderation activities, including the number of removal actions, reasons for removals, and outcomes of user appeals. VLOPs are also required to undergo independent audits of their content moderation and risk management practices. These audits are intended to assess the platform's compliance with the DSA and to identify areas for improvement. By promoting transparency and accountability, the DSA aims to build trust in the digital environment and ensure that platforms act responsibly."
    },
    # Question 13 from GDPR
    {
        'law': 'gdpr',
        'question': "What are the restrictions on processing special categories of personal data under the GDPR?",
        'answer': "The GDPR imposes stricter rules on processing special categories of personal data, such as data revealing racial or ethnic origin, political opinions, religious or philosophical beliefs, trade union membership, genetic data, biometric data, health data, and data concerning a person’s sex life or sexual orientation. Processing of such data is prohibited unless specific conditions are met, such as obtaining explicit consent from the data subject, fulfilling legal obligations in the field of employment and social security, or protecting the vital interests of the data subject."
    },
    # Question 13 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act address the transparency of AI systems?",
        'answer': "The AI Act mandates that AI systems, particularly high-risk ones, be designed and developed with transparency in mind. This includes providing clear and accessible information to users about the AI system’s purpose, capabilities, limitations, and how it functions. Users must be informed when they are interacting with an AI system, especially in cases where the AI is used to make decisions with significant impacts on individuals. The transparency requirements are aimed at ensuring that users and affected individuals understand how and why decisions are made by AI systems."
    },
    # Question 13 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA address the issue of access to business users' data by gatekeepers?",
        'answer': "The DMA imposes obligations on gatekeepers to provide business users with access to the data they generate through their interactions on the platform. This includes access to aggregated and anonymized data, as well as data that is essential for the development and improvement of the business user's products and services. The DMA also prohibits gatekeepers from using non-public data from business users to compete against them, ensuring that gatekeepers do not exploit their access to data to gain an unfair competitive advantage."
    },
    # Question 13 from DSA
    {
        'law': 'dsa',
        'question': "What are the penalties for non-compliance with the DSA?",
        'answer': "The DSA provides for substantial penalties for non-compliance, including fines of up to 6% of the platform's total worldwide annual turnover. In cases of repeated or severe non-compliance, the DSA allows for additional measures, such as temporary suspension of the platform's services or other corrective actions. The enforcement of the DSA is overseen by national regulatory authorities, which have the power to investigate and sanction platforms that violate the regulation. These penalties are designed to ensure that platforms take their obligations seriously and that the DSA's provisions are effectively implemented."
    },
    # Question 14 from GDPR
    {
        'law': 'gdpr',
        'question': "How does the GDPR regulate automated decision-making and profiling?",
        'answer': "The GDPR places restrictions on automated decision-making, including profiling, where decisions are made solely based on automated processing and significantly affect individuals. Such processing is permitted only in specific situations, such as when it is necessary for entering into or performing a contract, authorized by Union or Member State law, or based on the data subject’s explicit consent. Organizations must ensure that individuals are informed about the existence of automated decision-making, the logic involved, and the potential consequences. Data subjects have the right to contest automated decisions and seek human intervention."
    },
    # Question 14 from AI Act
    {
        'law': 'ai_act',
        'question': "What are the obligations related to data quality under the AI Act?",
        'answer': "The AI Act requires that high-risk AI systems be trained, tested, and validated using high-quality datasets that are relevant, representative, free of errors, and complete. The data must be carefully selected to avoid biases that could lead to discriminatory outcomes. Providers must ensure that the data governance framework includes measures to assess and mitigate risks related to data quality, such as using diverse and representative datasets, validating the accuracy and reliability of data, and regularly updating datasets to reflect changes over time."
    },
    # Question 14 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA ensure fair and non-discriminatory access to core platform services?",
        'answer': "The DMA requires gatekeepers to ensure that their core platform services are offered on fair, reasonable, and non-discriminatory terms. This means that gatekeepers cannot impose unfair terms or conditions on business users or engage in practices that favor their own services over those of competitors. The DMA also requires gatekeepers to provide transparency in how they operate, including clear and accessible information about the terms and conditions for using their services. These measures are intended to prevent gatekeepers from abusing their market power and to ensure a level playing field in digital markets."
    },
    # Question 14 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA address the issue of illegal goods, services, and content online?",
        'answer': "The DSA requires platforms to implement measures to detect and remove illegal goods, services, and content from their services. This includes ensuring that sellers and service providers on their platforms are properly identified and that they comply with applicable laws and regulations. Platforms must also provide users with clear mechanisms to report illegal goods and services, and they must act expeditiously to remove or disable access to such content. The DSA's provisions are designed to protect consumers and ensure that online marketplaces operate in a safe and lawful manner."
    },
    # Question 15 from GDPR
    {
        'law': 'gdpr',
        'question': "What penalties and enforcement actions are provided for under the GDPR?",
        'answer': "The GDPR provides for substantial penalties and enforcement actions to ensure compliance. Supervisory authorities have the power to impose administrative fines of up to 20 million euros or 4% of the total worldwide annual turnover of the preceding financial year, whichever is higher, for the most serious violations. Penalties are determined based on factors such as the nature, gravity, and duration of the infringement, the intentional or negligent character of the infringement, and the measures taken by the organization to mitigate the damage."
    },
    # Question 15 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act regulate the use of AI in law enforcement and public safety?",
        'answer': "The AI Act imposes strict regulations on the use of AI systems in law enforcement and public safety, particularly those used for predictive policing, biometric identification, and surveillance. These systems are considered high-risk and are subject to rigorous scrutiny to ensure that they do not infringe on fundamental rights, such as privacy and non-discrimination. Law enforcement agencies must conduct a detailed risk assessment and implement safeguards to ensure that the use of AI systems is necessary, proportionate, and respectful of human rights."
    },
    # Question 15 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA promote innovation and competition in digital markets?",
        'answer': "The DMA promotes innovation and competition by preventing gatekeepers from engaging in practices that stifle competition, such as self-preferencing, tying, and bundling. By ensuring that gatekeepers operate on fair, reasonable, and non-discriminatory terms, the DMA creates opportunities for new entrants and smaller competitors to compete on a level playing field. The DMA also promotes interoperability and data portability, enabling businesses to develop innovative services that can interact with the gatekeeper's platform. These measures are designed to foster a dynamic and competitive digital market that benefits consumers and businesses alike."
    },
    # Question 15 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA support the rights of consumers in the digital marketplace?",
        'answer': "The DSA strengthens consumer rights by ensuring that online platforms provide clear and accessible information about the goods, services, and content available on their platforms. This includes requiring platforms to disclose information about the identity of sellers, the terms and conditions of transactions, and the nature of the goods and services offered. Consumers must also be informed about their rights, including the right to withdraw from a transaction, the right to a refund, and the right to access effective dispute resolution mechanisms. The DSA's consumer protection provisions are designed to create a safe and transparent digital marketplace."
    },
    # Question 16 from GDPR
    {
        'law': 'gdpr',
        'question': "What is the role of the European Data Protection Board (EDPB) under the GDPR?",
        'answer': "The European Data Protection Board (EDPB) is an independent body established by the GDPR to ensure the consistent application of data protection rules across the EU. The EDPB is composed of representatives of the national data protection authorities and the European Data Protection Supervisor (EDPS). Its responsibilities include issuing guidelines, recommendations, and best practices on the interpretation and application of the GDPR, resolving disputes between supervisory authorities, and advising the European Commission on data protection matters."
    },
    # Question 16 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act address the issue of bias and discrimination in AI systems?",
        'answer': "The AI Act mandates that AI systems, particularly high-risk ones, be designed and developed in a manner that prevents, identifies, and mitigates biases that could lead to discriminatory outcomes. Providers must take measures to ensure that AI systems do not produce results that unfairly disadvantage individuals or groups based on protected characteristics such as race, gender, or religion. This includes using diverse datasets, conducting bias audits, and implementing corrective measures to address any identified biases. The Act also emphasizes the importance of human oversight in preventing and addressing bias."
    },
    # Question 16 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA address the issue of mergers and acquisitions by gatekeepers?",
        'answer': "The DMA requires gatekeepers to inform the European Commission of any intended mergers, acquisitions, or concentrations involving other providers of core platform services or digital services. This notification requirement allows the Commission to assess whether the proposed transaction would undermine the objectives of the DMA, such as by reinforcing the gatekeeper's market power or reducing competition in digital markets. The DMA's provisions on mergers and acquisitions are intended to prevent gatekeepers from consolidating their dominance through strategic acquisitions and to ensure that competition remains robust in digital markets."
    },
    # Question 16 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA handle the issue of online harassment and abuse?",
        'answer': "The DSA requires platforms to implement measures to combat online harassment and abuse, including providing users with tools to report and block abusive content and behavior. Platforms must act swiftly to remove or disable access to content that constitutes harassment or abuse, and they must provide support to victims. The DSA also encourages platforms to collaborate with law enforcement and civil society organizations to address online harassment and to develop best practices for creating a safe online environment. These measures are intended to protect users from harm and to promote a respectful and inclusive digital space."
    },
    # Question 17 from GDPR
    {
        'law': 'gdpr',
        'question': "How does the GDPR address the issue of consent in data processing?",
        'answer': "Under the GDPR, consent must be freely given, specific, informed, and unambiguous. Organizations must ensure that consent is obtained through a clear affirmative action, such as ticking a box on a website, and that it is distinguishable from other matters. The data subject must be informed of their right to withdraw consent at any time, and withdrawal must be as easy as giving consent. Additionally, for children below the age of 16, parental consent is required for processing their data."
    },
    # Question 17 from AI Act
    {
        'law': 'ai_act',
        'question': "What is the role of the European Artificial Intelligence Board (EAIB) under the AI Act?",
        'answer': "The European Artificial Intelligence Board (EAIB) is established under the AI Act to facilitate cooperation and coordination among national supervisory authorities and the European Commission. The EAIB is responsible for issuing guidelines, recommendations, and best practices on the implementation of the AI Act, providing advice to the European Commission on AI-related matters, and promoting the harmonized application of the Act across the EU. The EAIB also plays a role in resolving disputes between national authorities and ensuring consistency in the interpretation and enforcement of the AI Act."
    },
    # Question 17 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA address the issue of dark patterns and deceptive design practices by gatekeepers?",
        'answer': "The DMA prohibits gatekeepers from using dark patterns and deceptive design practices that manipulate or deceive users into making decisions that are not in their best interests. This includes practices such as hiding important information, making it difficult for users to exercise their rights, or nudging users toward certain choices. The DMA requires gatekeepers to provide clear and accessible information to users and to design their interfaces in a way that respects user autonomy and choice. These provisions are intended to protect consumers from manipulative practices and to ensure that digital services are transparent and user-friendly."
    },
    # Question 17 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA ensure that users have control over their data and privacy?",
        'answer': "The DSA enhances user control over data and privacy by requiring platforms to provide clear and accessible information about how user data is collected, processed, and used. Users must be informed about their rights to access, rectify, and delete their data, as well as their right to object to data processing. The DSA also requires platforms to implement privacy-by-design and privacy-by-default principles, ensuring that users' privacy is protected from the outset. Additionally, platforms must provide users with tools to manage their privacy settings and to control the use of their data for targeted advertising."
    },
    # Question 18 from GDPR
    {
        'law': 'gdpr',
        'question': "What is the GDPR’s approach to international data transfers?",
        'answer': "The GDPR allows international data transfers only if the third country, territory, or international organization ensures an adequate level of data protection, as determined by the European Commission. In the absence of an adequacy decision, transfers are permitted under appropriate safeguards, such as binding corporate rules or standard contractual clauses. In specific circumstances, derogations for specific situations, such as explicit consent of the data subject, may allow transfers. The GDPR aims to ensure that personal data transferred outside the EU is afforded the same level of protection as within the EU."
    },
    # Question 18 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act impact the use of AI in healthcare?",
        'answer': "The AI Act recognizes the potential benefits of AI in healthcare, such as improving diagnosis, treatment, and patient outcomes. However, it also acknowledges the risks associated with the use of AI in this sensitive sector. AI systems used in healthcare, particularly those that involve decision-making or provide recommendations to healthcare professionals, are classified as high-risk and are subject to strict requirements. These include ensuring the accuracy and reliability of AI systems, maintaining human oversight, and safeguarding patient data. The Act also emphasizes the importance of transparency and informed consent in the use of AI in healthcare."
    },
    # Question 18 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA promote transparency in digital advertising?",
        'answer': "The DMA promotes transparency in digital advertising by requiring gatekeepers to provide advertisers and publishers with access to data related to their advertising campaigns, including information on pricing, performance, and targeting criteria. Gatekeepers must also ensure that their advertising services are offered on fair, reasonable, and non-discriminatory terms, and they are prohibited from using non-public data to gain an unfair advantage in the advertising market. These provisions are intended to promote competition and transparency in digital advertising, ensuring that advertisers and publishers have the information they need to make informed decisions."
    },
    # Question 18 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA address the issue of algorithmic transparency and accountability?",
        'answer': "The DSA requires platforms, particularly VLOPs, to provide transparency about how their algorithms work, including the criteria used for content recommendation, ranking, and removal. Platforms must explain the logic behind their algorithms and provide users with options to control how algorithms affect their online experience. The DSA also mandates that platforms conduct regular audits of their algorithms to assess their impact on users and society. These audits must be conducted by independent third parties and must evaluate whether the algorithms are fair, non-discriminatory, and aligned with fundamental rights."
    },
    # Question 19 from GDPR
    {
        'law': 'gdpr',
        'question': "What rights do data subjects have in relation to automated decision-making under the GDPR?",
        'answer': "Under the GDPR, data subjects have the right not to be subject to a decision based solely on automated processing, including profiling, which produces legal effects or similarly significant effects concerning them. Exceptions include situations where automated decision-making is necessary for entering into or performing a contract, authorized by Union or Member State law, or based on explicit consent. In such cases, organizations must implement safeguards to protect the data subject's rights, such as the right to obtain human intervention, express their point of view, and contest the decision."
    },
    # Question 19 from AI Act
    {
        'law': 'ai_act',
        'question': "How does the AI Act address the issue of AI literacy and public awareness?",
        'answer': "The AI Act encourages initiatives to promote AI literacy and public awareness, recognizing that informed and educated citizens are essential for the responsible adoption of AI technologies. The Act calls for the development of educational programs and resources to help individuals understand the capabilities, limitations, and risks associated with AI. It also promotes public consultations and stakeholder engagement to ensure that the perspectives of various groups, including civil society, are considered in the development and deployment of AI systems."
    },
    # Question 19 from DMA
    {
        'law': 'dma',
        'question': "How does the DMA address the issue of access to core platform services by end users?",
        'answer': "The DMA ensures that end users have access to core platform services on fair and non-discriminatory terms. Gatekeepers are prohibited from restricting or degrading the quality of access to their services or from engaging in practices that limit user choice, such as forcing users to install certain apps or use specific services. The DMA also promotes data portability, allowing end users to transfer their data to other services and take advantage of competitive offerings. These provisions are designed to enhance user choice and control over the digital services they use."
    },
    # Question 19 from DSA
    {
        'law': 'dsa',
        'question': "What are the requirements for online platforms to cooperate with regulatory authorities under the DSA?",
        'answer': "The DSA requires online platforms to cooperate with regulatory authorities by providing them with access to data, records, and information necessary for monitoring and enforcement purposes. Platforms must respond promptly to requests from authorities and must facilitate inspections and investigations. The DSA also mandates that platforms provide transparency reports and undergo independent audits to demonstrate compliance with the regulation. Cooperation with authorities is essential for ensuring that platforms meet their obligations and that the DSA's provisions are effectively enforced."
    },
    # Question 20 from GDPR
    {
        'law': 'gdpr',
        'question': "What is the GDPR's stance on the appointment of a Data Protection Officer (DPO) and when is it mandatory?",
        'answer': "The GDPR mandates the appointment of a Data Protection Officer (DPO) in specific cases: when processing is carried out by a public authority or body, except for courts acting in their judicial capacity; when the core activities of the controller or processor consist of processing operations that require regular and systematic monitoring of data subjects on a large scale; or when the core activities consist of processing special categories of data on a large scale. The DPO must have expert knowledge of data protection law and practices and is responsible for advising the organization on GDPR compliance and monitoring its implementation."
    },
    # Question 20 from AI Act
    {
        'law': 'ai_act',
        'question': "What measures does the AI Act include to support the ethical development of AI?",
        'answer': "The AI Act supports the ethical development of AI by encouraging the adoption of voluntary codes of conduct, fostering research on ethical AI, and promoting the development of AI systems that align with European values and fundamental rights. The Act emphasizes the importance of human-centric AI, where AI systems are designed to enhance human capabilities and well-being while respecting human dignity and autonomy. It also supports the creation of regulatory sandboxes to allow developers to experiment with innovative AI solutions in a controlled environment, ensuring that ethical considerations are integrated into the design and deployment of AI technologies."
    },
    # Question 20 from DMA
    {
        'law': 'dma',
        'question': "What role does the European Commission play in enforcing the DMA?",
        'answer': "The European Commission is responsible for enforcing the DMA, including monitoring compliance, conducting investigations, and imposing penalties for non-compliance. The Commission has the authority to impose fines, periodic penalty payments, and structural remedies on gatekeepers that violate the DMA's obligations and prohibitions. The Commission also has the power to initiate market investigations to assess whether new services should be designated as core platform services or whether additional obligations should be imposed on gatekeepers. The enforcement of the DMA is designed to be robust and effective, ensuring that gatekeepers operate in a manner that promotes competition and innovation in digital markets."
    },
    # Question 20 from DSA
    {
        'law': 'dsa',
        'question': "How does the DSA promote the development of codes of conduct for online platforms?",
        'answer': "The DSA encourages the development of codes of conduct for online platforms to address specific issues such as content moderation, algorithmic transparency, and the protection of minors. These codes of conduct are developed in collaboration with industry stakeholders, civil society organizations, and regulatory authorities. The DSA promotes the adoption of these voluntary measures to ensure that platforms operate in a responsible and ethical manner. The codes of conduct provide a framework for best practices and help platforms to align their operations with the DSA's objectives, while also allowing for flexibility and innovation."
    },
]

# Update the laws_info dictionary for GDPR, AI Act, DMA, and DSA
laws_info = {
    'gdpr': {
        'file_path': '/kaggle/input/english-dataset/english_gdpr.html',
        'collection_name': 'chunk-embeddings-index',
        'questions_answers': [qa for qa in integrated_questions_answers if qa['law'] == 'gdpr']
    },
    'ai_act': {
        'file_path': '/kaggle/input/english-dataset/english_AI_act.html',
        'collection_name': 'chunk-embeddings-index',
        'questions_answers': [qa for qa in integrated_questions_answers if qa['law'] == 'ai_act']
    },
    'dma': {
        'file_path': '/kaggle/input/english-dataset/english_dma.html',
        'collection_name': 'DMA_Chunk',
        'questions_answers': [qa for qa in integrated_questions_answers if qa['law'] == 'dma']
    },
    'dsa': {
        'file_path': '/kaggle/input/english-dataset/english_dsa.html',
        'collection_name': 'DSA_Chunk',
        'questions_answers': [qa for qa in integrated_questions_answers if qa['law'] == 'dsa']
    },
}

In [14]:
# Iterate through each law in the laws_info dictionary
for law, law_info in laws_info.items():
    print(f"\nProcessing law: {law.upper()}")
    
    # Iterate through each question-answer pair for the current law
    for qa in law_info['questions_answers']:
        question = qa['question']
        reference_answer = qa['answer']
        
        print(f"Querying for question: {question}")

        # Step 1: Embed the query text (question)
        query_embedding = embed_query(question, model_name)

        # Step 2: Query the Pinecone index using the appropriate namespace (law)
        results = query_pinecone_db(query_embedding, index, namespace=law, top_k=1)

        # Check if results are retrieved successfully
        if results and 'matches' in results and results['matches']:
            # Get the ID of the most relevant chunk
            retrieved_chunk_id = results['matches'][0]['id']
            
            # Retrieve the actual text chunk using the ID
            if law in chunks_dict:
                # Extract the numeric part of the ID to get the correct index
                retrieved_chunk_index = int(retrieved_chunk_id.split('_')[-1])
                retrieved_chunk = chunks_dict[law][retrieved_chunk_index]

                print(f"Retrieved chunk {retrieved_chunk_index} from {law.upper()}:")
                print(retrieved_chunk)

                # Step 3: Compute Cosine Similarity using BERT embeddings
                retrieved_embedding = generate_bert_embedding(retrieved_chunk, tokenizer, model)
                reference_embedding = generate_bert_embedding(reference_answer, tokenizer, model)
                cosine_sim = calculate_cosine_similarity(reference_embedding, retrieved_embedding)
                
                # Step 4: Compute Semantic Similarity using Sentence-Transformers model
                semantic_sim = calculate_semantic_similarity(reference_answer, retrieved_chunk, semantic_model)

                # Store and print the results
                similarities.append({
                    'law': law,
                    'question': question,
                    'retrieved_chunk': retrieved_chunk,
                    'cosine_similarity': cosine_sim,
                    'semantic_similarity': semantic_sim
                })

                print(f"Cosine Similarity with reference answer: {cosine_sim:.4f}")
                print(f"Semantic Similarity with reference answer: {semantic_sim:.4f}")
                print("----\n")
            else:
                print(f"Error: '{law}' not found in chunks_dict.")
        else:
            print(f"No results found for {law.upper()} for the question: {question}")



Processing law: GDPR
Querying for question: What is the fundamental right regarding the processing of personal data as per the Charter of Fundamental Rights of the European Union?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 1 from GDPR:
(2) The principles of, and rules on the protection of natural persons with regard to the processing of their personal data should, whatever their nationality or residence, respect their fundamental rights and freedoms, in particular their right to the protection of personal data. This Regulation is intended to contribute to the accomplishment of an area of freedom, security and justice and of an economic union, to economic and social progress, to the strengthening and the convergence of the economies within the internal market, and to the well-being of natural persons. (3) Directive 95/46/EC of the European Parliament and of the Council (4) seeks to harmonise the protection of fundamental rights and freedoms of natural persons in respect of processing activities and to ensure the free flow of personal data between Member States. (4) The processing of personal data should be designed to serve mankind. The right to the protection of personal data is not an ab

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9755
Semantic Similarity with reference answer: 0.9732
----

Querying for question: How does GDPR aim to balance the right to the protection of personal data with other fundamental rights?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 1 from GDPR:
(2) The principles of, and rules on the protection of natural persons with regard to the processing of their personal data should, whatever their nationality or residence, respect their fundamental rights and freedoms, in particular their right to the protection of personal data. This Regulation is intended to contribute to the accomplishment of an area of freedom, security and justice and of an economic union, to economic and social progress, to the strengthening and the convergence of the economies within the internal market, and to the well-being of natural persons. (3) Directive 95/46/EC of the European Parliament and of the Council (4) seeks to harmonise the protection of fundamental rights and freedoms of natural persons in respect of processing activities and to ensure the free flow of personal data between Member States. (4) The processing of personal data should be designed to serve mankind. The right to the protection of personal data is not an ab

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9781
Semantic Similarity with reference answer: 0.9789
----

Querying for question: What challenges have arisen due to technological developments and globalization in the context of personal data protection?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 2 from GDPR:
(5) The economic and social integration resulting from the functioning of the internal market has led to a substantial increase in cross-border flows of personal data. The exchange of personal data between public and private actors, including natural persons, associations and undertakings across the Union has increased. National authorities in the Member States are being called upon by Union law to cooperate and exchange personal data so as to be able to perform their duties or carry out tasks on behalf of an authority in another Member State. (6) Rapid technological developments and globalisation have brought new challenges for the protection of personal data. The scale of the collection and sharing of personal data has increased significantly. Technology allows both private companies and public authorities to make use of personal data on an unprecedented scale in order to pursue their activities. Natural persons increasingly make personal information avai

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9777
Semantic Similarity with reference answer: 0.9738
----

Querying for question: How does the GDPR address the transfer of personal data to third countries or international organizations?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 2 from GDPR:
(5) The economic and social integration resulting from the functioning of the internal market has led to a substantial increase in cross-border flows of personal data. The exchange of personal data between public and private actors, including natural persons, associations and undertakings across the Union has increased. National authorities in the Member States are being called upon by Union law to cooperate and exchange personal data so as to be able to perform their duties or carry out tasks on behalf of an authority in another Member State. (6) Rapid technological developments and globalisation have brought new challenges for the protection of personal data. The scale of the collection and sharing of personal data has increased significantly. Technology allows both private companies and public authorities to make use of personal data on an unprecedented scale in order to pursue their activities. Natural persons increasingly make personal information avai

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9362
Semantic Similarity with reference answer: 0.8995
----

Querying for question: What specific protections does GDPR offer to children regarding their personal data?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 111 from GDPR:
Member States may provide by law for a lower age for those purposes provided that such lower age is not below 13 years. 2. The controller shall make reasonable efforts to verify in such cases that consent is given or authorised by the holder of parental responsibility over the child, taking into consideration available technology. 3. Paragraph 1 shall not affect the general contract law of Member States such as the rules on the validity, formation or effect of a contract in relation to a child. Article 9 Processing of special categories of personal data 1. Processing of personal data revealing racial or ethnic origin, political opinions, religious or philosophical beliefs, or trade union membership, and the processing of genetic data, biometric data for the purpose of uniquely identifying a natural person, data concerning health or data concerning a natural person's sex life or sexual orientation shall be prohibited. 2.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9063
Semantic Similarity with reference answer: 0.9265
----

Querying for question: How does the GDPR define personal data, and what are some examples?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 96 from GDPR:
(161) For the purpose of consenting to the participation in scientific research activities in clinical trials, the relevant provisions of Regulation (EU) No 536/2014 of the European Parliament and of the Council (15) should apply. (162) Where personal data are processed for statistical purposes, this Regulation should apply to that processing. Union or Member State law should, within the limits of this Regulation, determine statistical content, control of access, specifications for the processing of personal data for statistical purposes and appropriate measures to safeguard the rights and freedoms of the data subject and for ensuring statistical confidentiality. Statistical purposes mean any operation of collection and the processing of personal data necessary for statistical surveys or for the production of statistical results. Those statistical results may further be used for different purposes, including a scientific research purpose. The statistical p

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8844
Semantic Similarity with reference answer: 0.8730
----

Querying for question: What is the legal basis for processing personal data under the GDPR?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 111 from GDPR:
Member States may provide by law for a lower age for those purposes provided that such lower age is not below 13 years. 2. The controller shall make reasonable efforts to verify in such cases that consent is given or authorised by the holder of parental responsibility over the child, taking into consideration available technology. 3. Paragraph 1 shall not affect the general contract law of Member States such as the rules on the validity, formation or effect of a contract in relation to a child. Article 9 Processing of special categories of personal data 1. Processing of personal data revealing racial or ethnic origin, political opinions, religious or philosophical beliefs, or trade union membership, and the processing of genetic data, biometric data for the purpose of uniquely identifying a natural person, data concerning health or data concerning a natural person's sex life or sexual orientation shall be prohibited. 2.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9473
Semantic Similarity with reference answer: 0.9408
----

Querying for question: What are the rights of data subjects under the GDPR?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 44 from GDPR:
(74) The responsibility and liability of the controller for any processing of personal data carried out by the controller or on the controller's behalf should be established. In particular, the controller should be obliged to implement appropriate and effective measures and be able to demonstrate the compliance of processing activities with this Regulation, including the effectiveness of the measures. Those measures should take into account the nature, scope, context and purposes of the processing and the risk to the rights and freedoms of natural persons.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9197
Semantic Similarity with reference answer: 0.8699
----

Querying for question: How does the GDPR address data protection by design and by default?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 47 from GDPR:
Such measures could consist, inter alia, of minimising the processing of personal data, pseudonymising personal data as soon as possible, transparency with regard to the functions and processing of personal data, enabling the data subject to monitor the data processing, enabling the controller to create and improve security features. When developing, designing, selecting and using applications, services and products that are based on the processing of personal data or process personal data to fulfil their task, producers of the products, services and applications should be encouraged to take into account the right to data protection when developing and designing such products, services and applications and, with due regard to the state of the art, to make sure that controllers and processors are able to fulfil their data protection obligations. The principles of data protection by design and by default should also be taken into consideration in the context

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9523
Semantic Similarity with reference answer: 0.9355
----

Querying for question: What is the role of the Data Protection Officer (DPO) under the GDPR?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 80 from GDPR:
The Board should be represented by its Chair. It should replace the Working Party on the Protection of Individuals with Regard to the Processing of Personal Data established by Directive 95/46/EC. It should consist of the head of a supervisory authority of each Member State and the European Data Protection Supervisor or their respective representatives. The Commission should participate in the Board's activities without voting rights and the European Data Protection Supervisor should have specific voting rights. The Board should contribute to the consistent application of this Regulation throughout the Union, including by advising the Commission, in particular on the level of protection in third countries or international organisations, and promoting cooperation of the supervisory authorities throughout the Union. The Board should act independently when performing its tasks. (140) The Board should be assisted by a secretariat provided by the European Data 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9302
Semantic Similarity with reference answer: 0.9055
----

Querying for question: What are the implications of the GDPR for cross-border data processing activities?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 2 from GDPR:
(5) The economic and social integration resulting from the functioning of the internal market has led to a substantial increase in cross-border flows of personal data. The exchange of personal data between public and private actors, including natural persons, associations and undertakings across the Union has increased. National authorities in the Member States are being called upon by Union law to cooperate and exchange personal data so as to be able to perform their duties or carry out tasks on behalf of an authority in another Member State. (6) Rapid technological developments and globalisation have brought new challenges for the protection of personal data. The scale of the collection and sharing of personal data has increased significantly. Technology allows both private companies and public authorities to make use of personal data on an unprecedented scale in order to pursue their activities. Natural persons increasingly make personal information avai

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9373
Semantic Similarity with reference answer: 0.8976
----

Querying for question: How does the GDPR handle data breaches, and what are the obligations of data controllers in such cases?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 47 from GDPR:
Such measures could consist, inter alia, of minimising the processing of personal data, pseudonymising personal data as soon as possible, transparency with regard to the functions and processing of personal data, enabling the data subject to monitor the data processing, enabling the controller to create and improve security features. When developing, designing, selecting and using applications, services and products that are based on the processing of personal data or process personal data to fulfil their task, producers of the products, services and applications should be encouraged to take into account the right to data protection when developing and designing such products, services and applications and, with due regard to the state of the art, to make sure that controllers and processors are able to fulfil their data protection obligations. The principles of data protection by design and by default should also be taken into consideration in the context

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9545
Semantic Similarity with reference answer: 0.9159
----

Querying for question: What are the restrictions on processing special categories of personal data under the GDPR?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 144 from GDPR:
The obligations referred to in paragraphs 1 and 2 shall not apply to an enterprise or an organisation employing fewer than 250 persons unless the processing it carries out is likely to result in a risk to the rights and freedoms of data subjects, the processing is not occasional, or the processing includes special categories of data as referred to in Article 9(1) or personal data relating to criminal convictions and offences referred to in Article 10. Article 31 Cooperation with the supervisory authority The controller and the processor and, where applicable, their representatives, shall cooperate, on request, with the supervisory authority in the performance of its tasks. Section 2
 

Security of personal data

 Article 32 Security of processing 1.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9107
Semantic Similarity with reference answer: 0.8951
----

Querying for question: How does the GDPR regulate automated decision-making and profiling?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 2 from GDPR:
(5) The economic and social integration resulting from the functioning of the internal market has led to a substantial increase in cross-border flows of personal data. The exchange of personal data between public and private actors, including natural persons, associations and undertakings across the Union has increased. National authorities in the Member States are being called upon by Union law to cooperate and exchange personal data so as to be able to perform their duties or carry out tasks on behalf of an authority in another Member State. (6) Rapid technological developments and globalisation have brought new challenges for the protection of personal data. The scale of the collection and sharing of personal data has increased significantly. Technology allows both private companies and public authorities to make use of personal data on an unprecedented scale in order to pursue their activities. Natural persons increasingly make personal information avai

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9146
Semantic Similarity with reference answer: 0.8865
----

Querying for question: What penalties and enforcement actions are provided for under the GDPR?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 88 from GDPR:
(149) Member States should be able to lay down the rules on criminal penalties for infringements of this Regulation, including for infringements of national rules adopted pursuant to and within the limits of this Regulation. Those criminal penalties may also allow for the deprivation of the profits obtained through infringements of this Regulation. However, the imposition of criminal penalties for infringements of such national rules and of administrative penalties should not lead to a breach of the principle of ne bis in idem, as interpreted by the Court of Justice. (150) In order to strengthen and harmonise administrative penalties for infringements of this Regulation, each supervisory authority should have the power to impose administrative fines. This Regulation should indicate infringements and the upper limit and criteria for setting the related administrative fines, which should be determined by the competent supervisory authority in each individual

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9454
Semantic Similarity with reference answer: 0.9276
----

Querying for question: What is the role of the European Data Protection Board (EDPB) under the GDPR?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 80 from GDPR:
The Board should be represented by its Chair. It should replace the Working Party on the Protection of Individuals with Regard to the Processing of Personal Data established by Directive 95/46/EC. It should consist of the head of a supervisory authority of each Member State and the European Data Protection Supervisor or their respective representatives. The Commission should participate in the Board's activities without voting rights and the European Data Protection Supervisor should have specific voting rights. The Board should contribute to the consistent application of this Regulation throughout the Union, including by advising the Commission, in particular on the level of protection in third countries or international organisations, and promoting cooperation of the supervisory authorities throughout the Union. The Board should act independently when performing its tasks. (140) The Board should be assisted by a secretariat provided by the European Data 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9567
Semantic Similarity with reference answer: 0.9358
----

Querying for question: How does the GDPR address the issue of consent in data processing?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 33 from GDPR:
This is of particular relevance in situations where the proliferation of actors and the technological complexity of practice make it difficult for the data subject to know and understand whether, by whom and for what purpose personal data relating to him or her are being collected, such as in the case of online advertising. Given that children merit specific protection, any information and communication, where processing is addressed to a child, should be in such a clear and plain language that the child can easily understand. (59) Modalities should be provided for facilitating the exercise of the data subject's rights under this Regulation, including mechanisms to request and, if applicable, obtain, free of charge, in particular, access to and rectification or erasure of personal data and the exercise of the right to object. The controller should also provide means for requests to be made electronically, especially where personal data are processed by ele

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9507
Semantic Similarity with reference answer: 0.9275
----

Querying for question: What is the GDPR’s approach to international data transfers?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 2 from GDPR:
(5) The economic and social integration resulting from the functioning of the internal market has led to a substantial increase in cross-border flows of personal data. The exchange of personal data between public and private actors, including natural persons, associations and undertakings across the Union has increased. National authorities in the Member States are being called upon by Union law to cooperate and exchange personal data so as to be able to perform their duties or carry out tasks on behalf of an authority in another Member State. (6) Rapid technological developments and globalisation have brought new challenges for the protection of personal data. The scale of the collection and sharing of personal data has increased significantly. Technology allows both private companies and public authorities to make use of personal data on an unprecedented scale in order to pursue their activities. Natural persons increasingly make personal information avai

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9284
Semantic Similarity with reference answer: 0.8960
----

Querying for question: What rights do data subjects have in relation to automated decision-making under the GDPR?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 42 from GDPR:
In any case, such processing should be subject to suitable safeguards, which should include specific information to the data subject and the right to obtain human intervention, to express his or her point of view, to obtain an explanation of the decision reached after such assessment and to challenge the decision. Such measure should not concern a child. In order to ensure fair and transparent processing in respect of the data subject, taking into account the specific circumstances and context in which the personal data are processed, the controller should use appropriate mathematical or statistical procedures for the profiling, implement technical and organisational measures appropriate to ensure, in particular, that factors which result in inaccuracies in personal data are corrected and the risk of errors is minimised, secure personal data in a manner that takes account of the potential risks involved for the interests and rights of the data subject and 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9492
Semantic Similarity with reference answer: 0.9338
----

Querying for question: What is the GDPR's stance on the appointment of a Data Protection Officer (DPO) and when is it mandatory?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 80 from GDPR:
The Board should be represented by its Chair. It should replace the Working Party on the Protection of Individuals with Regard to the Processing of Personal Data established by Directive 95/46/EC. It should consist of the head of a supervisory authority of each Member State and the European Data Protection Supervisor or their respective representatives. The Commission should participate in the Board's activities without voting rights and the European Data Protection Supervisor should have specific voting rights. The Board should contribute to the consistent application of this Regulation throughout the Union, including by advising the Commission, in particular on the level of protection in third countries or international organisations, and promoting cooperation of the supervisory authorities throughout the Union. The Board should act independently when performing its tasks. (140) The Board should be assisted by a secretariat provided by the European Data 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9469
Semantic Similarity with reference answer: 0.9305
----


Processing law: AI_ACT
Querying for question: What are the main objectives of the AI Act concerning the development and use of AI in the European Union?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 231 from AI_ACT:
The participants in the standardisation process shall seek to promote investment and innovation in AI, including through increasing legal certainty, as well as the competitiveness and growth of the Union market, to contribute to strengthening global cooperation on standardisation and taking into account existing international standards in the field of AI that are consistent with Union values, fundamental rights and interests, and to enhance multi-stakeholder governance ensuring a balanced representation of interests and the effective participation of all relevant stakeholders in accordance with Articles 5, 6, and 7 of Regulation (EU) No 1025/2012. Article 41 Common specifications 1.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9343
Semantic Similarity with reference answer: 0.9146
----

Querying for question: How does the AI Act propose to regulate high-risk AI systems?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9260
Semantic Similarity with reference answer: 0.9080
----

Querying for question: What responsibilities does the AI Act place on AI providers to ensure ethical AI practices?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9215
Semantic Similarity with reference answer: 0.9213
----

Querying for question: How does the AI Act address transparency and accountability in AI systems?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9105
Semantic Similarity with reference answer: 0.8735
----

Querying for question: What measures are suggested by the AI Act to protect fundamental rights in the deployment of AI technologies?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9182
Semantic Similarity with reference answer: 0.8802
----

Querying for question: What categories of AI systems are considered high-risk under the AI Act?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9223
Semantic Similarity with reference answer: 0.8556
----

Querying for question: How does the AI Act define 'AI system' and what technologies fall under this definition?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 110 from AI_ACT:
Consequently, the corresponding obligations of this Regulation should be presumed to be fulfilled, unless significant systemic risks not covered by Regulation (EU) 2022/2065 emerge and are identified in such models. Within this framework, providers of very large online platforms and very large online search engines are obliged to assess potential systemic risks stemming from the design, functioning and use of their services, including how the design of algorithmic systems used in the service may contribute to such risks, as well as systemic risks stemming from potential misuses. Those providers are also obliged to take appropriate mitigating measures in observance of fundamental rights. (119) Considering the quick pace of innovation and the technological evolution of digital services in scope of different instruments of Union law in particular having in mind the usage and the perception of their recipients, the AI systems subject to this Regulation may 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9139
Semantic Similarity with reference answer: 0.9172
----

Querying for question: What obligations do users of high-risk AI systems have under the AI Act?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9261
Semantic Similarity with reference answer: 0.9032
----

Querying for question: How does the AI Act address the use of biometric identification systems?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9146
Semantic Similarity with reference answer: 0.8495
----

Querying for question: What are the requirements for conformity assessments under the AI Act?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9298
Semantic Similarity with reference answer: 0.9161
----

Querying for question: What role do national supervisory authorities play under the AI Act?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9206
Semantic Similarity with reference answer: 0.8727
----

Querying for question: How does the AI Act encourage innovation while ensuring safety and compliance?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9244
Semantic Similarity with reference answer: 0.8936
----

Querying for question: How does the AI Act address the transparency of AI systems?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9156
Semantic Similarity with reference answer: 0.8747
----

Querying for question: What are the obligations related to data quality under the AI Act?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9146
Semantic Similarity with reference answer: 0.8763
----

Querying for question: How does the AI Act regulate the use of AI in law enforcement and public safety?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 191 from AI_ACT:
Article 17 Quality management system 1. Providers of high-risk AI systems shall put a quality management system in place that ensures compliance with this Regulation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9154
Semantic Similarity with reference answer: 0.8892
----

Querying for question: How does the AI Act address the issue of bias and discrimination in AI systems?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 26 from AI_ACT:
Social and environmental well-being means that AI systems are developed and used in a sustainable and environmentally friendly manner as well as in a way to benefit all human beings, while monitoring and assessing the long-term impacts on the individual, society and democracy. The application of those principles should be translated, when possible, in the design and use of AI models. They should in any case serve as a basis for the drafting of codes of conduct under this Regulation. All stakeholders, including industry, academia, civil society and standardisation organisations, are encouraged to take into account, as appropriate, the ethical principles for the development of voluntary best practices and standards. (28) Aside from the many beneficial uses of AI, it can also be misused and provide novel and powerful tools for manipulative, exploitative and social control practices. Such practices are particularly harmful and abusive and should be prohibite

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9309
Semantic Similarity with reference answer: 0.9130
----

Querying for question: What is the role of the European Artificial Intelligence Board (EAIB) under the AI Act?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 327 from AI_ACT:
Article 94 Procedural rights of economic operators of the general-purpose AI model Article 18 of Regulation (EU) 2019/1020 shall apply mutatis mutandis to the providers of the general-purpose AI model, without prejudice to more specific procedural rights provided for in this Regulation. CHAPTER X 
CODES OF CONDUCT AND GUIDELINES
 Article 95 Codes of conduct for voluntary application of specific requirements 1. The AI Office and the Member States shall encourage and facilitate the drawing up of codes of conduct, including related governance mechanisms, intended to foster the voluntary application to AI systems, other than high-risk AI systems, of some or all of the requirements set out in Chapter III, Section 2 taking into account the available technical solutions and industry best practices allowing for the application of such requirements. 2.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9314
Semantic Similarity with reference answer: 0.9098
----

Querying for question: How does the AI Act impact the use of AI in healthcare?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 42 from AI_ACT:
(44) There are serious concerns about the scientific basis of AI systems aiming to identify or infer emotions, particularly as expression of emotions vary considerably across cultures and situations, and even within a single individual. Among the key shortcomings of such systems are the limited reliability, the lack of specificity and the limited generalisability. Therefore, AI systems identifying or inferring emotions or intentions of natural persons on the basis of their biometric data may lead to discriminatory outcomes and can be intrusive to the rights and freedoms of the concerned persons. Considering the imbalance of power in the context of work or education, combined with the intrusive nature of these systems, such systems could lead to detrimental or unfavourable treatment of certain natural persons or whole groups thereof. Therefore, the placing on the market, the putting into service, or the use of AI systems intended to be used to detect the 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9463
Semantic Similarity with reference answer: 0.9244
----

Querying for question: How does the AI Act address the issue of AI literacy and public awareness?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 18 from AI_ACT:
Those notions may vary with regard to the relevant context and can include understanding the correct application of technical elements during the AI system’s development phase, the measures to be applied during its use, the suitable ways in which to interpret the AI system’s output, and, in the case of affected persons, the knowledge necessary to understand how decisions taken with the assistance of AI will have an impact on them. In the context of the application this Regulation, AI literacy should provide all relevant actors in the AI value chain with the insights required to ensure the appropriate compliance and its correct enforcement. Furthermore, the wide implementation of AI literacy measures and the introduction of appropriate follow-up actions could contribute to improving working conditions and ultimately sustain the consolidation, and innovation path of trustworthy AI in the Union. The European Artificial Intelligence Board (the ‘Board’) shoul

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9294
Semantic Similarity with reference answer: 0.9159
----

Querying for question: What measures does the AI Act include to support the ethical development of AI?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 24 from AI_ACT:
(26) In order to introduce a proportionate and effective set of binding rules for AI systems, a clearly defined risk-based approach should be followed. That approach should tailor the type and content of such rules to the intensity and scope of the risks that AI systems can generate. It is therefore necessary to prohibit certain unacceptable AI practices, to lay down requirements for high-risk AI systems and obligations for the relevant operators, and to lay down transparency obligations for certain AI systems. (27) While the risk-based approach is the basis for a proportionate and effective set of binding rules, it is important to recall the 2019 Ethics guidelines for trustworthy AI developed by the independent AI HLEG appointed by the Commission. In those guidelines, the AI HLEG developed seven non-binding ethical principles for AI which are intended to help ensure that AI is trustworthy and ethically sound. The seven principles include human agency an

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9538
Semantic Similarity with reference answer: 0.9366
----


Processing law: DMA
Querying for question: What criteria are used to define a 'gatekeeper' under the Digital Markets Act?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DMA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8404
Semantic Similarity with reference answer: 0.8375
----

Querying for question: How does the DMA propose to regulate the behavior of gatekeepers in digital markets?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 1 from DMA:
A majority of Union citizens now uses those services on a daily basis. However, the digital transformation and increased use of those services has also resulted in new risks and challenges for individual recipients of the relevant service, companies and society as a whole. (2) Member States are increasingly introducing, or are considering introducing, national laws on the matters covered by this Regulation, imposing, in particular, diligence requirements for providers of intermediary services as regards the way they should tackle illegal content, online disinformation or other societal risks. Those diverging national laws negatively affect the internal market, which, pursuant to Article 26 of the Treaty on the Functioning of the European Union (TFEU), comprises an area without internal frontiers in which the free movement of goods and services and freedom of establishment are ensured, taking into account the inherently cross-border nature of the internet, wh

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9187
Semantic Similarity with reference answer: 0.9139
----

Querying for question: What are the key obligations imposed on gatekeepers by the DMA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DMA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8867
Semantic Similarity with reference answer: 0.8522
----

Querying for question: How does the DMA aim to prevent unfair practices in the digital market?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DMA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9124
Semantic Similarity with reference answer: 0.8845
----

Querying for question: What enforcement mechanisms are included in the DMA to ensure compliance by gatekeepers?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DMA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9199
Semantic Similarity with reference answer: 0.9001
----

Querying for question: How does the DMA address the issue of self-preferencing by gatekeepers?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 89 from DMA:
The mere fact of participating in and implementing a given code of conduct should not in itself presume compliance with this Regulation. (105) The codes of conduct should facilitate the accessibility of very large online platforms and very large online search engines, in compliance with Union and national law, in order to facilitate their foreseeable use by persons with disabilities. In particular, the codes of conduct could ensure that the information is presented in a perceivable, operable, understandable and robust way and that forms and measures provided pursuant to this Regulation are made available in a manner that is easy to find and accessible to persons with disabilities. (106) The rules on codes of conduct under this Regulation could serve as a basis for already established self-regulatory efforts at Union level, including the Product Safety Pledge, the Memorandum of understanding on the sale of counterfeit goods on the internet, the Code of condu

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9240
Semantic Similarity with reference answer: 0.9159
----

Querying for question: What are the criteria for identifying core platform services under the DMA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 74 from DMA:
In selecting the appropriate mitigation measures, providers can consider, where appropriate, industry best practices, including as established through self-regulatory cooperation, such as codes of conduct, and should take into account the guidelines from the Commission. (90) Providers of very large online platforms and of very large online search engines should ensure that their approach to risk assessment and mitigation is based on the best available information and scientific insights and that they test their assumptions with the groups most impacted by the risks and the measures they take. To this end, they should, where appropriate, conduct their risk assessments and design their risk mitigation measures with the involvement of representatives of the recipients of the service, representatives of groups potentially impacted by their services, independent experts and civil society organisations. They should seek to embed such consultations into their meth

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9149
Semantic Similarity with reference answer: 0.8598
----

Querying for question: How does the DMA promote interoperability between digital services?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 1 from DMA:
A majority of Union citizens now uses those services on a daily basis. However, the digital transformation and increased use of those services has also resulted in new risks and challenges for individual recipients of the relevant service, companies and society as a whole. (2) Member States are increasingly introducing, or are considering introducing, national laws on the matters covered by this Regulation, imposing, in particular, diligence requirements for providers of intermediary services as regards the way they should tackle illegal content, online disinformation or other societal risks. Those diverging national laws negatively affect the internal market, which, pursuant to Article 26 of the Treaty on the Functioning of the European Union (TFEU), comprises an area without internal frontiers in which the free movement of goods and services and freedom of establishment are ensured, taking into account the inherently cross-border nature of the internet, wh

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9106
Semantic Similarity with reference answer: 0.8939
----

Querying for question: What obligations does the DMA impose on gatekeepers regarding data access and portability?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 20 from DMA:
Furthermore, where it is necessary to involve information society services providers, including providers of intermediary services, any requests or orders for such involvement should, as a general rule, be directed to the specific provider that has the technical and operational ability to act against specific items of illegal content, so as to prevent and minimise any possible negative effects on the availability and accessibility of information that is not illegal content. (28) Since 2000, new technologies have emerged that improve the availability, efficiency, speed, reliability, capacity and security of systems for the transmission, ‘findability’ and storage of data online, leading to an increasingly complex online ecosystem. In this regard, it should be recalled that providers of services establishing and facilitating the underlying logical architecture and proper functioning of the internet, including technical auxiliary functions, can also benefit fro

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9481
Semantic Similarity with reference answer: 0.9325
----

Querying for question: How does the DMA address the issue of tying and bundling practices by gatekeepers?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 89 from DMA:
The mere fact of participating in and implementing a given code of conduct should not in itself presume compliance with this Regulation. (105) The codes of conduct should facilitate the accessibility of very large online platforms and very large online search engines, in compliance with Union and national law, in order to facilitate their foreseeable use by persons with disabilities. In particular, the codes of conduct could ensure that the information is presented in a perceivable, operable, understandable and robust way and that forms and measures provided pursuant to this Regulation are made available in a manner that is easy to find and accessible to persons with disabilities. (106) The rules on codes of conduct under this Regulation could serve as a basis for already established self-regulatory efforts at Union level, including the Product Safety Pledge, the Memorandum of understanding on the sale of counterfeit goods on the internet, the Code of condu

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9050
Semantic Similarity with reference answer: 0.9031
----

Querying for question: What are the consequences for gatekeepers that fail to comply with the DMA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 91 from DMA:
Such can be the case, for example, where online platforms are misused for the rapid spread of illegal content or disinformation or where the need arises for rapid dissemination of reliable information. In light of the important role of very large online platforms in disseminating information in our societies and across borders, providers of such platforms should be encouraged in drawing up and applying specific crisis protocols. Such crisis protocols should be activated only for a limited period of time and the measures adopted should also be limited to what is strictly necessary to address the extraordinary circumstance. Those measures should be consistent with this Regulation, and should not amount to a general obligation for the participating providers of very large online platforms and of very large online search engines to monitor the information which they transmit or store, nor actively to seek facts or circumstances indicating illegal content. (109)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9357
Semantic Similarity with reference answer: 0.9162
----

Querying for question: How does the DMA enhance consumer protection in digital markets?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 1 from DMA:
A majority of Union citizens now uses those services on a daily basis. However, the digital transformation and increased use of those services has also resulted in new risks and challenges for individual recipients of the relevant service, companies and society as a whole. (2) Member States are increasingly introducing, or are considering introducing, national laws on the matters covered by this Regulation, imposing, in particular, diligence requirements for providers of intermediary services as regards the way they should tackle illegal content, online disinformation or other societal risks. Those diverging national laws negatively affect the internal market, which, pursuant to Article 26 of the Treaty on the Functioning of the European Union (TFEU), comprises an area without internal frontiers in which the free movement of goods and services and freedom of establishment are ensured, taking into account the inherently cross-border nature of the internet, wh

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9230
Semantic Similarity with reference answer: 0.9253
----

Querying for question: How does the DMA address the issue of access to business users' data by gatekeepers?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 83 from DMA:
They should provide access to such researchers including, where technically possible, in real-time, to the publicly accessible data, for example on aggregated interactions with content from public pages, public groups, or public figures, including impression and engagement data such as the number of reactions, shares, comments from recipients of the service. Providers of very large online platforms or of very large online search engines should be encouraged to cooperate with researchers and provide broader access to data for monitoring societal concerns through voluntary efforts, including through commitments and procedures agreed under codes of conduct or crisis protocols. Those providers and researchers should pay particular attention to the protection of personal data, and ensure that any processing of personal data complies with Regulation (EU) 2016/679. Providers should anonymise or pseudonymise personal data except in those cases that would render imp

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9170
Semantic Similarity with reference answer: 0.9354
----

Querying for question: How does the DMA ensure fair and non-discriminatory access to core platform services?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DMA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8700
Semantic Similarity with reference answer: 0.8789
----

Querying for question: How does the DMA promote innovation and competition in digital markets?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 1 from DMA:
A majority of Union citizens now uses those services on a daily basis. However, the digital transformation and increased use of those services has also resulted in new risks and challenges for individual recipients of the relevant service, companies and society as a whole. (2) Member States are increasingly introducing, or are considering introducing, national laws on the matters covered by this Regulation, imposing, in particular, diligence requirements for providers of intermediary services as regards the way they should tackle illegal content, online disinformation or other societal risks. Those diverging national laws negatively affect the internal market, which, pursuant to Article 26 of the Treaty on the Functioning of the European Union (TFEU), comprises an area without internal frontiers in which the free movement of goods and services and freedom of establishment are ensured, taking into account the inherently cross-border nature of the internet, wh

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9008
Semantic Similarity with reference answer: 0.8975
----

Querying for question: How does the DMA address the issue of mergers and acquisitions by gatekeepers?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 74 from DMA:
In selecting the appropriate mitigation measures, providers can consider, where appropriate, industry best practices, including as established through self-regulatory cooperation, such as codes of conduct, and should take into account the guidelines from the Commission. (90) Providers of very large online platforms and of very large online search engines should ensure that their approach to risk assessment and mitigation is based on the best available information and scientific insights and that they test their assumptions with the groups most impacted by the risks and the measures they take. To this end, they should, where appropriate, conduct their risk assessments and design their risk mitigation measures with the involvement of representatives of the recipients of the service, representatives of groups potentially impacted by their services, independent experts and civil society organisations. They should seek to embed such consultations into their meth

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9430
Semantic Similarity with reference answer: 0.9138
----

Querying for question: How does the DMA address the issue of dark patterns and deceptive design practices by gatekeepers?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 72 from DMA:
In this regard, for example, the Code of conduct on countering illegal hate speech online of 2016 sets a benchmark to process valid notifications for removal of illegal hate speech in less than 24 hours. Providers of very large online platforms, in particular those primarily used for the dissemination to the public of pornographic content, should diligently meet all their obligations under this Regulation in respect of illegal content constituting cyber violence, including illegal pornographic content, especially with regard to ensuring that victims can effectively exercise their rights in relation to content representing non-consensual sharing of intimate or manipulated material through the rapid processing of notices and removal of such content without undue delay. Other types of illegal content may require longer or shorter timelines for processing of notices, which will depend on the facts, circumstances and types of illegal content at hand. Those provi

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9408
Semantic Similarity with reference answer: 0.9244
----

Querying for question: How does the DMA promote transparency in digital advertising?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 89 from DMA:
The mere fact of participating in and implementing a given code of conduct should not in itself presume compliance with this Regulation. (105) The codes of conduct should facilitate the accessibility of very large online platforms and very large online search engines, in compliance with Union and national law, in order to facilitate their foreseeable use by persons with disabilities. In particular, the codes of conduct could ensure that the information is presented in a perceivable, operable, understandable and robust way and that forms and measures provided pursuant to this Regulation are made available in a manner that is easy to find and accessible to persons with disabilities. (106) The rules on codes of conduct under this Regulation could serve as a basis for already established self-regulatory efforts at Union level, including the Product Safety Pledge, the Memorandum of understanding on the sale of counterfeit goods on the internet, the Code of condu

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9176
Semantic Similarity with reference answer: 0.9317
----

Querying for question: How does the DMA address the issue of access to core platform services by end users?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 20 from DMA:
Furthermore, where it is necessary to involve information society services providers, including providers of intermediary services, any requests or orders for such involvement should, as a general rule, be directed to the specific provider that has the technical and operational ability to act against specific items of illegal content, so as to prevent and minimise any possible negative effects on the availability and accessibility of information that is not illegal content. (28) Since 2000, new technologies have emerged that improve the availability, efficiency, speed, reliability, capacity and security of systems for the transmission, ‘findability’ and storage of data online, leading to an increasingly complex online ecosystem. In this regard, it should be recalled that providers of services establishing and facilitating the underlying logical architecture and proper functioning of the internet, including technical auxiliary functions, can also benefit fro

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9411
Semantic Similarity with reference answer: 0.9230
----

Querying for question: What role does the European Commission play in enforcing the DMA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 125 from DMA:
(155) Since the objectives of this Regulation, namely to contribute to the proper functioning of the internal market and to ensure a safe, predictable and trusted online environment in which the fundamental rights enshrined in the Charter are duly protected, cannot be sufficiently achieved by the Member States because they cannot achieve the necessary harmonisation and cooperation by acting alone, but can rather, by reason of territorial and personal scope, be better achieved at the Union level, the Union may adopt measures, in accordance with the principle of subsidiarity as set out in Article 5 of the Treaty on European Union. In accordance with the principle of proportionality as set out in that Article, this Regulation does not go beyond what is necessary in order to achieve those objectives. (156) The European Data Protection Supervisor was consulted in accordance with Article 42(1) of Regulation (EU) 2018/1725 of the European Parliament and of the Co

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9282
Semantic Similarity with reference answer: 0.9096
----


Processing law: DSA
Querying for question: What are the main responsibilities of online platforms under the Digital Services Act?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9074
Semantic Similarity with reference answer: 0.9132
----

Querying for question: How does the DSA aim to protect users from illegal content on digital platforms?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 55 from DSA:
The requirements of this Regulation on the provision of information relating to advertising is without prejudice to the application of the relevant provisions of Regulation (EU) 2016/679, in particular those regarding the right to object, automated individual decision-making, including profiling, and specifically the need to obtain consent of the data subject prior to the processing of personal data for targeted advertising. Similarly, it is without prejudice to the provisions laid down in Directive 2002/58/EC in particular those regarding the storage of information in terminal equipment and the access to information stored therein. Finally, this Regulation complements the application of the Directive 2010/13/EU which imposes measures to enable users to declare audiovisual commercial communications in user-generated videos. It also complements the obligations for traders regarding the disclosure of commercial communications deriving from Directive 2005/29/E

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9241
Semantic Similarity with reference answer: 0.9143
----

Querying for question: What transparency requirements are imposed on online platforms by the DSA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8814
Semantic Similarity with reference answer: 0.8802
----

Querying for question: How does the DSA propose to handle the dissemination of harmful content?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8829
Semantic Similarity with reference answer: 0.8972
----

Querying for question: What measures does the DSA include to protect freedom of expression while combating illegal content?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9124
Semantic Similarity with reference answer: 0.9115
----

Querying for question: How does the DSA address the issue of content moderation on online platforms?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 83 from DSA:
They should provide access to such researchers including, where technically possible, in real-time, to the publicly accessible data, for example on aggregated interactions with content from public pages, public groups, or public figures, including impression and engagement data such as the number of reactions, shares, comments from recipients of the service. Providers of very large online platforms or of very large online search engines should be encouraged to cooperate with researchers and provide broader access to data for monitoring societal concerns through voluntary efforts, including through commitments and procedures agreed under codes of conduct or crisis protocols. Those providers and researchers should pay particular attention to the protection of personal data, and ensure that any processing of personal data complies with Regulation (EU) 2016/679. Providers should anonymise or pseudonymise personal data except in those cases that would render imp

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9234
Semantic Similarity with reference answer: 0.9217
----

Querying for question: What obligations do very large online platforms (VLOPs) have under the DSA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 83 from DSA:
They should provide access to such researchers including, where technically possible, in real-time, to the publicly accessible data, for example on aggregated interactions with content from public pages, public groups, or public figures, including impression and engagement data such as the number of reactions, shares, comments from recipients of the service. Providers of very large online platforms or of very large online search engines should be encouraged to cooperate with researchers and provide broader access to data for monitoring societal concerns through voluntary efforts, including through commitments and procedures agreed under codes of conduct or crisis protocols. Those providers and researchers should pay particular attention to the protection of personal data, and ensure that any processing of personal data complies with Regulation (EU) 2016/679. Providers should anonymise or pseudonymise personal data except in those cases that would render imp

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9622
Semantic Similarity with reference answer: 0.9444
----

Querying for question: How does the DSA enhance the protection of minors online?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8969
Semantic Similarity with reference answer: 0.8796
----

Querying for question: What are the transparency obligations for online platforms regarding their algorithms?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 83 from DSA:
They should provide access to such researchers including, where technically possible, in real-time, to the publicly accessible data, for example on aggregated interactions with content from public pages, public groups, or public figures, including impression and engagement data such as the number of reactions, shares, comments from recipients of the service. Providers of very large online platforms or of very large online search engines should be encouraged to cooperate with researchers and provide broader access to data for monitoring societal concerns through voluntary efforts, including through commitments and procedures agreed under codes of conduct or crisis protocols. Those providers and researchers should pay particular attention to the protection of personal data, and ensure that any processing of personal data complies with Regulation (EU) 2016/679. Providers should anonymise or pseudonymise personal data except in those cases that would render imp

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9321
Semantic Similarity with reference answer: 0.9248
----

Querying for question: How does the DSA address the issue of disinformation and fake news on digital platforms?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 80 from DSA:
Very large online platforms or very large online search engines should ensure public access to repositories of advertisements presented on their online interfaces to facilitate supervision and research into emerging risks brought about by the distribution of advertising online, for example in relation to illegal advertisements or manipulative techniques and disinformation with a real and foreseeable negative impact on public health, public security, civil discourse, political participation and equality. Repositories should include the content of advertisements, including the name of the product, service or brand and the subject matter of the advertisement, and related data on the advertiser, and, if different, the natural or legal person who paid for the advertisement, and the delivery of the advertisement, in particular where targeted advertising is concerned. This information should include both information about targeting criteria and delivery criteria, 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9277
Semantic Similarity with reference answer: 0.9052
----

Querying for question: What role do trusted flaggers play under the DSA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 47 from DSA:
Such trusted flagger status should only be awarded to entities, and not individuals, that have demonstrated, among other things, that they have particular expertise and competence in tackling illegal content and that they work in a diligent, accurate and objective manner. Such entities can be public in nature, such as, for terrorist content, internet referral units of national law enforcement authorities or of the European Union Agency for Law Enforcement Cooperation (‘Europol’) or they can be non-governmental organisations and private or semi-public bodies such as the organisations part of the INHOPE network of hotlines for reporting child sexual abuse material and organisations committed to notifying illegal racist and xenophobic expressions online. To avoid diminishing the added value of such mechanism, the overall number of trusted flaggers awarded in accordance with this Regulation should be limited. In particular, industry associations representing th

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9399
Semantic Similarity with reference answer: 0.9192
----

Querying for question: How does the DSA promote the accountability of online platforms?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8724
Semantic Similarity with reference answer: 0.8807
----

Querying for question: What are the penalties for non-compliance with the DSA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.8943
Semantic Similarity with reference answer: 0.9033
----

Querying for question: How does the DSA address the issue of illegal goods, services, and content online?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9050
Semantic Similarity with reference answer: 0.9024
----

Querying for question: How does the DSA support the rights of consumers in the digital marketplace?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 1 from DSA:
A majority of Union citizens now uses those services on a daily basis. However, the digital transformation and increased use of those services has also resulted in new risks and challenges for individual recipients of the relevant service, companies and society as a whole. (2) Member States are increasingly introducing, or are considering introducing, national laws on the matters covered by this Regulation, imposing, in particular, diligence requirements for providers of intermediary services as regards the way they should tackle illegal content, online disinformation or other societal risks. Those diverging national laws negatively affect the internal market, which, pursuant to Article 26 of the Treaty on the Functioning of the European Union (TFEU), comprises an area without internal frontiers in which the free movement of goods and services and freedom of establishment are ensured, taking into account the inherently cross-border nature of the internet, wh

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9251
Semantic Similarity with reference answer: 0.9231
----

Querying for question: How does the DSA handle the issue of online harassment and abuse?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 68 from DSA:
(81) A second category concerns the actual or foreseeable impact of the service on the exercise of fundamental rights, as protected by the Charter, including but not limited to human dignity, freedom of expression and of information, including media freedom and pluralism, the right to private life, data protection, the right to non-discrimination, the rights of the child and consumer protection. Such risks may arise, for example, in relation to the design of the algorithmic systems used by the very large online platform or by the very large online search engine or the misuse of their service through the submission of abusive notices or other methods for silencing speech or hampering competition. When assessing risks to the rights of the child, providers of very large online platforms and of very large online search engines should consider for example how easy it is for minors to understand the design and functioning of the service, as well as how minors can

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9257
Semantic Similarity with reference answer: 0.8959
----

Querying for question: How does the DSA ensure that users have control over their data and privacy?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 83 from DSA:
They should provide access to such researchers including, where technically possible, in real-time, to the publicly accessible data, for example on aggregated interactions with content from public pages, public groups, or public figures, including impression and engagement data such as the number of reactions, shares, comments from recipients of the service. Providers of very large online platforms or of very large online search engines should be encouraged to cooperate with researchers and provide broader access to data for monitoring societal concerns through voluntary efforts, including through commitments and procedures agreed under codes of conduct or crisis protocols. Those providers and researchers should pay particular attention to the protection of personal data, and ensure that any processing of personal data complies with Regulation (EU) 2016/679. Providers should anonymise or pseudonymise personal data except in those cases that would render imp

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9024
Semantic Similarity with reference answer: 0.9128
----

Querying for question: How does the DSA address the issue of algorithmic transparency and accountability?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 74 from DSA:
In selecting the appropriate mitigation measures, providers can consider, where appropriate, industry best practices, including as established through self-regulatory cooperation, such as codes of conduct, and should take into account the guidelines from the Commission. (90) Providers of very large online platforms and of very large online search engines should ensure that their approach to risk assessment and mitigation is based on the best available information and scientific insights and that they test their assumptions with the groups most impacted by the risks and the measures they take. To this end, they should, where appropriate, conduct their risk assessments and design their risk mitigation measures with the involvement of representatives of the recipients of the service, representatives of groups potentially impacted by their services, independent experts and civil society organisations. They should seek to embed such consultations into their meth

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9276
Semantic Similarity with reference answer: 0.9234
----

Querying for question: What are the requirements for online platforms to cooperate with regulatory authorities under the DSA?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 181 from DSA:
Providers of very large online platforms and of very large online search engines shall put in place reasonable, proportionate and effective mitigation measures, tailored to the specific systemic risks identified pursuant to Article 34, with particular consideration to the impacts of such measures on fundamental rights.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9143
Semantic Similarity with reference answer: 0.8987
----

Querying for question: How does the DSA promote the development of codes of conduct for online platforms?




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk 89 from DSA:
The mere fact of participating in and implementing a given code of conduct should not in itself presume compliance with this Regulation. (105) The codes of conduct should facilitate the accessibility of very large online platforms and very large online search engines, in compliance with Union and national law, in order to facilitate their foreseeable use by persons with disabilities. In particular, the codes of conduct could ensure that the information is presented in a perceivable, operable, understandable and robust way and that forms and measures provided pursuant to this Regulation are made available in a manner that is easy to find and accessible to persons with disabilities. (106) The rules on codes of conduct under this Regulation could serve as a basis for already established self-regulatory efforts at Union level, including the Product Safety Pledge, the Memorandum of understanding on the sale of counterfeit goods on the internet, the Code of condu

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity with reference answer: 0.9268
Semantic Similarity with reference answer: 0.9182
----



In [15]:
from collections import defaultdict
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load the summarization pipeline
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum",device=0)

# Load models for embedding generation
cosine_model = SentenceTransformer('BAAI/bge-large-en')  # Replace with your cosine similarity model
semantic_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # Replace with your semantic similarity model

# Function to summarize text with retry logic
def summarize_text_huggingface_with_retry(text, max_length=350, min_length=100, max_retries=3):
    for attempt in range(max_retries):
        try:
            summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
            return summary[0]['summary_text']
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            else:
                print("Max retries reached. Returning None.")
                return None

# Function to compute cosine similarity
def calculate_cosine_similarity(embedding1, embedding2):
    return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

# Add summaries, embeddings, and similarities to laws_info
for law, info in laws_info.items():
    print(f"Processing {law.upper()}...")

    for qa in info['questions_answers']:
        question = qa['question']
        retrieved_text = qa['answer']
        
        # Step 1: Generate summary
        summary = summarize_text_huggingface_with_retry(retrieved_text)
        qa['summary'] = summary  # Add summary to the question-answer entry
        
        # Step 2: Generate embeddings for answer and summary
        if summary:
            answer_embedding = cosine_model.encode(retrieved_text, convert_to_tensor=False)
            summary_embedding = cosine_model.encode(summary, convert_to_tensor=False)
            qa['answer_embedding'] = answer_embedding
            qa['summary_embedding'] = summary_embedding
            
            # Step 3: Calculate cosine similarity
            cosine_similarity = calculate_cosine_similarity(answer_embedding, summary_embedding)
            qa['cosine_similarity'] = cosine_similarity
            
            # Step 4: Calculate semantic similarity
            semantic_similarity = util.pytorch_cos_sim(
                semantic_model.encode(retrieved_text, convert_to_tensor=True),
                semantic_model.encode(summary, convert_to_tensor=True)
            ).item()
            qa['semantic_similarity'] = semantic_similarity
            
            print(f"Question: {question}")
            print(f"Summary: {summary}")
            print(f"Cosine Similarity: {cosine_similarity:.4f}")
            print(f"Semantic Similarity: {semantic_similarity:.4f}")
        else:
            print(f"No summary generated for question: {question}")

# Compute and store average similarities for each law
averages = {}
for law, info in laws_info.items():
    total_cosine = 0
    total_semantic = 0
    count = 0

    for qa in info['questions_answers']:
        if 'cosine_similarity' in qa and 'semantic_similarity' in qa:
            total_cosine += qa['cosine_similarity']
            total_semantic += qa['semantic_similarity']
            count += 1
    
    if count > 0:
        avg_cosine = total_cosine / count
        avg_semantic = total_semantic / count
        averages[law] = {
            'average_cosine_similarity': avg_cosine,
            'average_semantic_similarity': avg_semantic
        }
        print(f"\nAverage Cosine Similarity for {law.upper()}: {avg_cosine:.4f}")
        print(f"Average Semantic Similarity for {law.upper()}: {avg_semantic:.4f}")
    else:
        print(f"\nNo valid similarity scores for {law.upper()}.")


config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Your max_length is set to 350, but your input_length is only 134. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)


Processing GDPR...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


Question: What is the fundamental right regarding the processing of personal data as per the Charter of Fundamental Rights of the European Union?
Summary: The protection of natural persons in relation to the processing of personal data is a fundamental right under Article 8 of the Charter of Fundamental Rights of the European Union and Article 16 of the Treaty on the Functioning of the EU. This Regulation is intended to contribute to the accomplishment of an area of freedom, security, and justice and of an economic union, to economic and social progress, to the strengthening and convergence of the economies within the internal market, and to the well-being of natural people.
Cosine Similarity: 0.9806
Semantic Similarity: 0.9551


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Question: How does GDPR aim to balance the right to the protection of personal data with other fundamental rights?
Summary: This Regulation respects all fundamental rights. The right to the protection of personal data must be balanced against other fundamental rights, in accordance with the principle of proportionality. It observes the freedoms and principles recognized in the Charter as enshrined in the Treaties, such as respect for private and family life, home and communications, protection of data and freedom of thought, conscience and religion. It also observes freedom of expression and information, freedom to conduct a business, the right to an effective remedy and to a fair trial.
Cosine Similarity: 0.9858
Semantic Similarity: 0.9075


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question: What challenges have arisen due to technological developments and globalization in the context of personal data protection?
Summary: Technology allows private companies and public authorities to make use of personal data on an unprecedented scale. Technology has transformed both the economy and social life. The Union should facilitate the free flow of data within the Union and the transfer to third countries and international organizations, while ensuring a high level of the protection of the personal data, as it has done in the past.    “Technological developments and globalization have brought new challenges for the Protection of Personal Data”. 
Cosine Similarity: 0.9712
Semantic Similarity: 0.7804


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


Question: How does the GDPR address the transfer of personal data to third countries or international organizations?
Summary: The transfer of personal data to third countries or international organizations is allowed only where the conditions laid down in this Regulation are met. This Regulation is without prejudice to international agreements concluded between the Union and third countries regulating the transfer of data. It is also necessary to have appropriate safeguards for the data subjects. The level of protection of natural persons guaranteed by this Regulation is not undermined. The transfers to the third countries and international organizations may only be carried out in full compliance with this Regulation.  
Cosine Similarity: 0.9930
Semantic Similarity: 0.9833


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


Question: What specific protections does GDPR offer to children regarding their personal data?
Summary: Children merit specific protection with regard to their personal data as they may be less aware of the risks, consequences, safeguards, and rights in relation to the processing of personal data. The consent of the holder of parental responsibility should not be necessary in the context of preventive or counselling services offered directly to a child. It should not apply to the use of data of children for the purposes of marketing or creating personality or user profiles and the collection of data with regards to children when using services offered to children.
Cosine Similarity: 0.9883
Semantic Similarity: 0.9677


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 134. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)


Question: How does the GDPR define personal data, and what are some examples?
Summary: Personal data under the GDPR is defined as any information relating to an identified or identifiable natural person (‘data subject’). Examples include a person’s name, identification number, location data, online identifier, or one or more factors specific to the physical, physiological, genetic, mental, economic, cultural, or social identity of that natural person. The definition is broad, capturing various forms of data that could be used to directly or indirectly identify an individual. 
Cosine Similarity: 1.0000
Semantic Similarity: 1.0000


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


Question: What is the legal basis for processing personal data under the GDPR?
Summary: The GDPR outlines several legal bases for processing personal data. The data subject has given consent to the processing. The processing is necessary for the purposes of the legitimate interests pursued by the controller or a third party, except where such interests are overridden by the interests or fundamental rights and freedoms of the data subject. It is also necessary for compliance with a legal obligation or for the performance of a task carried out in the public interest or in the exercise of official authority. It's necessary to protect the vital interests of a data subject or another natural person.
Cosine Similarity: 0.9776
Semantic Similarity: 0.9753


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


Question: What are the rights of data subjects under the GDPR?
Summary: The GDPR grants data subjects several rights, including the right to be informed, access, rectification, erasure, and portability. These rights empower individuals to have control over their personal data and ensure transparency and accountability in data processing. The rights in relation to automated decision-making and profiling are also included in the GDPR. The right to erasure (‘right to be forgotten’) is not included in GDPR, but it can still be exercised by individuals.
Cosine Similarity: 0.9772
Semantic Similarity: 0.9561


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


Question: How does the GDPR address data protection by design and by default?
Summary: The GDPR requires data controllers to implement data protection by design and by default. Data protection measures must be integrated into the processing activities from the outset. The controller must take appropriate technical and organizational measures, such as pseudonymization, to ensure that, by default, personal data is not made accessible to an indefinite number of people without the individual's consent. The only personal data necessary for each specific purpose of the processing is processed, and it must be processed only for that purpose.   
Cosine Similarity: 0.9975
Semantic Similarity: 0.9958


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Your max_length is set to 350, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)


Question: What is the role of the Data Protection Officer (DPO) under the GDPR?
Summary: The Data Protection Officer (DPO) is responsible for overseeing data protection strategies and ensuring compliance with GDPR requirements. The DPO’s responsibilities include advising the organization on GDPR obligations, monitoring compliance, providing training to staff, conducting audits and serving as the contact point for supervisory authorities and data subjects. DPOs must be appointed by public authorities and by organizations that engage in regular and systematic monitoring of data subjects on a large scale or process special categories of data.
Cosine Similarity: 0.9977
Semantic Similarity: 0.9943


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


Question: What are the implications of the GDPR for cross-border data processing activities?
Summary: The GDPR establishes a framework for cross-border data processing activities to ensure that data protection is consistent across the EU. Organizations that process personal data across multiple EU member states must designate a lead supervisory authority, which acts as the single point of contact for overseeing compliance. The GDPR also facilitates cooperation between supervisory authorities through mechanisms such as the consistency mechanism and the European Data Protection Board (ED PBAs).    .  .   for confidential support call the National Data Protection Hotline on 1-800-273-8255.
Cosine Similarity: 0.9766
Semantic Similarity: 0.9892


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


Question: How does the GDPR handle data breaches, and what are the obligations of data controllers in such cases?
Summary: Under the GDPR, data controllers are required to report data breaches to the relevant supervisory authority within 72 hours of becoming aware of the breach. If the breach poses a high risk to the affected individuals, the data controller must also inform the data subjects without undue delay. The GDPR mandates that organizations implement appropriate technical and organizational measures to prevent data breaches and mitigate their impact under the circumstances of a data breach, as well as to prevent its impact on the rights and freedoms of individuals.
Cosine Similarity: 0.9968
Semantic Similarity: 0.9910


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Question: What are the restrictions on processing special categories of personal data under the GDPR?
Summary: The GDPR imposes stricter rules on processing special categories of personal data, such as data revealing racial or ethnic origin, political opinions, religious or philosophical beliefs, trade union membership, genetic data, biometric data, health data, and data concerning a person’s sex life or sexual orientation. Processing of such data is prohibited unless specific conditions are met. These conditions include obtaining explicit consent from the data subject, fulfilling legal obligations in the field of employment and social security, or protecting the vital interests of the data subjects.
Cosine Similarity: 0.9989
Semantic Similarity: 0.9989


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


Question: How does the GDPR regulate automated decision-making and profiling?
Summary: The GDPR places restrictions on automated decision-making, including profiling. Data subjects have the right to contest automated decisions and seek human intervention. Organizations must ensure that data subjects are informed about the existence of automated decisions, the logic involved, and the potential consequences. The processing is permitted only in specific situations, such as when it is necessary for entering into or performing a contract, authorized by Union or Member State law, or based on the data subject’s explicit consent, according to the GDPR.
Cosine Similarity: 0.9900
Semantic Similarity: 0.9364


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


Question: What penalties and enforcement actions are provided for under the GDPR?
Summary: The GDPR provides for substantial penalties and enforcement actions to ensure compliance. Supervisory authorities have the power to impose administrative fines of up to 20 million euros or 4% of the total worldwide annual turnover of the preceding financial year. Penalties are determined based on factors such as the nature, gravity, and duration of the infringement, the intentional or negligent character of the infringment, and the measures taken by the organization to mitigate the damage. The penalties can be higher for the most serious violations.
Cosine Similarity: 0.9983
Semantic Similarity: 0.9968


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


Question: What is the role of the European Data Protection Board (EDPB) under the GDPR?
Summary: The European Data Protection Board (EDPB) was established by the GDPR to ensure the consistent application of data protection rules across the EU. Its responsibilities include issuing guidelines, recommendations and best practices on the interpretation and application of GDPR. It also resolves disputes between supervisory authorities and advises the European Commission on data protection matters. EDPB is composed of representatives of the national data protection authorities and the EU Data Protection Supervisor (EDPS). It was established as a result of the EU's Data Protection Directive.
Cosine Similarity: 0.9894
Semantic Similarity: 0.9584


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 109. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


Question: How does the GDPR address the issue of consent in data processing?
Summary: Under the GDPR, consent must be freely given, specific, informed, and unambiguous. Organizations must ensure that consent is obtained through a clear affirmative action, such as ticking a box on a website. The data subject must be informed of their right to withdraw consent at any time. For children below the age of 16, parental consent is required for processing their data. The consent should be as easy as giving the consent.    i   in this case, the consent was given by the data subject.
Cosine Similarity: 0.9915
Semantic Similarity: 0.9858


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


Question: What is the GDPR’s approach to international data transfers?
Summary: The GDPR allows international data transfers only if the third country, territory, or international organization ensures an adequate level of data protection. In the absence of an adequacy decision, transfers are permitted under appropriate safeguards. In specific circumstances, derogations for specific situations, such as explicit consent of the data subject, may allow transfers. The GDPR aims to ensure that personal data transferred outside the EU is afforded the same level of protection as within the EU. It can only be transferred under binding corporate rules or standard contractual clauses.
Cosine Similarity: 0.9945
Semantic Similarity: 0.9919


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


Question: What rights do data subjects have in relation to automated decision-making under the GDPR?
Summary: Under the GDPR, data subjects have the right not to be subject to a decision based solely on automated processing. Exceptions include situations where automated decision-making is necessary for entering into or performing a contract, authorized by Union or Member State law, or based on explicit consent. Organizations must implement safeguards to protect the data subject's rights, such as the right to obtain human intervention, express their point of view, and contest the decision. The right to get involved in the decision is also protected under GDPR.
Cosine Similarity: 0.9712
Semantic Similarity: 0.9819


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 82. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)


Question: What is the GDPR's stance on the appointment of a Data Protection Officer (DPO) and when is it mandatory?
Summary: The GDPR mandates the appointment of a Data Protection Officer (DPO) in specific cases. The DPO must have expert knowledge of data protection law and practices and is responsible for advising the organization on GDPR compliance and monitoring its implementation. DPOs are appointed when processing is carried out by a public authority or body, except for courts acting in their judicial capacity, or when the core activities of the controller or processor require regular and systematic monitoring of data subjects on a large scale.
Cosine Similarity: 0.9843
Semantic Similarity: 0.9714
Processing AI_ACT...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


Question: What are the main objectives of the AI Act concerning the development and use of AI in the European Union?
Summary: The AI Act aims to ensure that AI systems placed on the market and used in the Union are safe, respect existing law on fundamental rights and Union values, and do not undermine fundamental rights. It also aims to enhance transparency, accountability, and trust in AI while promoting innovation and competitiveness. It aims to establish a legal framework that addresses the risks posed by AI, in particular high-risk AI systems, and aims to improve transparency and accountability in the AI industry. It is aimed at promoting innovation, competitiveness and transparency.
Cosine Similarity: 0.9904
Semantic Similarity: 0.9910


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)


Question: How does the AI Act propose to regulate high-risk AI systems?
Summary: The AI Act classifies AI systems based on the risk they pose. High-risk AI systems include those used in critical infrastructure, education, employment, essential public and private services, law enforcement, and migration. These systems must comply with requirements related to risk management, data governance, technical documentation, record-keeping, transparency, provision of information to users, human oversight, accuracy, and robustness. Providers of these systems must establish a quality management system and ensure continuous monitoring and post-market surveillance.
Cosine Similarity: 0.9945
Semantic Similarity: 0.9936


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)


Question: What responsibilities does the AI Act place on AI providers to ensure ethical AI practices?
Summary: Providers of high-risk AI systems are responsible for ensuring their systems comply with the requirements set out in the Act. This includes the obligation to conduct a conformity assessment before placing the system on the market, ensure the system undergoes proper testing, provide clear instructions and information to users, implement human oversight measures, and monitor the system throughout its lifecycle. Providers must also report serious incidents and malfunctions to the authorities, as well as report them to the law enforcement agencies.
Cosine Similarity: 0.9965
Semantic Similarity: 0.9971


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 114. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


Question: How does the AI Act address transparency and accountability in AI systems?
Summary: The AI Act requires high-risk AI systems to be transparent and provide clear information about their purpose, capabilities, and limitations. It also requires them to be designed with features that ensure accountability, including auditability, traceability of decisions, and the ability to provide explanations for decisions made by the AI. Users should be able to understand how decisions are made by AI systems and what data is being processed, according to the Act. It is also necessary to provide them with explanations for their decisions.
Cosine Similarity: 0.9930
Semantic Similarity: 0.9714


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


Question: What measures are suggested by the AI Act to protect fundamental rights in the deployment of AI technologies?
Summary: The Act requires AI systems to be designed and used in a manner consistent with respect for human dignity, privacy, non-discrimination, and other fundamental rights. It also requires embedding human oversight mechanisms, ensuring that AI systems do not lead to biased or discriminatory outcomes, and providing avenues for individuals to contest decisions made by AI systems that affect them significantly. It promotes the development of codes of conduct and voluntary measures by providers to ensure that AI is used ethically and in alignment with societal values.
Cosine Similarity: 0.9690
Semantic Similarity: 0.9100


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question: What categories of AI systems are considered high-risk under the AI Act?
Summary: High-risk AI systems under the AI Act include those used in critical infrastructure, educational and vocational training, employment and worker management, access to essential private and public services, law enforcement, migration, asylum, and border control management. These systems are subject to stringent requirements due to the significant risks they pose to fundamental rights and safety.    i   in this context, we are talking about AI systems that pose a significant risk to human rights and the safety of human beings. 
Cosine Similarity: 0.9713
Semantic Similarity: 0.9795


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question: How does the AI Act define 'AI system' and what technologies fall under this definition?
Summary: The AI Act defines an AI system as software developed with one or more of the techniques and approaches listed in the Act. The definition is broad and includes a variety of AI technologies, from simple algorithms to complex machine learning models. For a given set of human-defined objectives, AI systems can generate outputs such as content, predictions, recommendations, or decisions influencing the environments they interact with.    .   ,   in the AI Act, AI System is defined as software that is developed with some of the methods listed in it.
Cosine Similarity: 0.9848
Semantic Similarity: 0.9770


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 102. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


Question: What obligations do users of high-risk AI systems have under the AI Act?
Summary: Users of high-risk AI systems are required to operate the systems in accordance with the instructions provided by the AI system provider and promptly report any serious incidents or malfunctions to the provider and the competent authorities. Users are responsible for implementing measures to mitigate risks to fundamental rights and safety. Users must keep logs generated by AI system, ensure that human oversight is maintained, and ensure that the AI System is used only for its intended purpose.       .   i  
Cosine Similarity: 0.9932
Semantic Similarity: 0.9941


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


Question: How does the AI Act address the use of biometric identification systems?
Summary: The AI Act imposes strict regulations on the use of biometric identification systems in public spaces for law enforcement purposes. The use of real-time remote biometric ID systems in publicly accessible spaces is generally prohibited, with exceptions granted under specific conditions, such as preventing a terrorist attack, locating a missing child, or identifying a suspect of a serious crime. Even in these cases, the use must be authorized by judicial or other independent authorities, and subject to strict safeguards to protect fundamental rights.
Cosine Similarity: 0.9986
Semantic Similarity: 0.9878


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 81. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


Question: What are the requirements for conformity assessments under the AI Act?
Summary: High-risk AI systems must undergo a conformity assessment before they can be placed on the market or put into service. The assessment can be conducted by the provider or by a notified body depending on the nature of the AI system. The conformity assessment must be documented and the system must bear a CE marking to indicate compliance with the regulation.      Â   .   i   for confidential support call the National AI Council on 1-800-273-8255. 
Cosine Similarity: 0.9622
Semantic Similarity: 0.9534


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 94. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


Question: What role do national supervisory authorities play under the AI Act?
Summary: National supervisory authorities are responsible for overseeing the implementation and enforcement of the AI Act within their respective jurisdictions. They are tasked with monitoring the compliance of AI systems with the Act's requirements, conducting inspections and investigations, and taking enforcement actions where necessary. They also play a key role in coordinating with other national authorities and the European Commission to ensure a harmonized approach to AI regulation across the EU. For confidential support call the Samaritans on 08457 90 90 90, visit a local Samaritans branch or click here.
Cosine Similarity: 0.9676
Semantic Similarity: 0.9204


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


Question: How does the AI Act encourage innovation while ensuring safety and compliance?
Summary: The AI Act provides regulatory sandboxes where AI developers can test their systems under the supervision of competent authorities without facing the full regulatory requirements. The Act also promotes the adoption of voluntary codes of conduct for non-high-risk AI systems to demonstrate their commitment to ethical AI practices. It allows for experimentation and development of innovative AI solutions while ensuring that safety, ethical, and legal standards are maintained. It also allows for the development of new AI solutions without the need to face the full regulation requirements.
Cosine Similarity: 0.9750
Semantic Similarity: 0.9452


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question: How does the AI Act address the transparency of AI systems?
Summary: The AI Act requires high-risk AI systems to be designed and developed with transparency in mind. This includes providing clear and accessible information to users about the AI system’s purpose, capabilities, limitations and how it functions. Users must be informed when they are interacting with an AI system, especially in cases where the AI is used to make decisions with significant impacts on individuals. The transparency requirements are aimed at ensuring that users and affected individuals understand how and why decisions are made by AI systems.
Cosine Similarity: 0.9974
Semantic Similarity: 0.9765


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question: What are the obligations related to data quality under the AI Act?
Summary: The AI Act requires high-risk AI systems to be trained, tested and validated using high-quality datasets that are relevant, representative, free of errors and complete. The data must be carefully selected to avoid biases that could lead to discriminatory outcomes. Providers must ensure that the data governance framework includes measures to assess and mitigate risks related to data quality, such as using diverse and representative datasets, validating the accuracy and reliability of data, and regularly updating datasets to reflect changes over time.
Cosine Similarity: 0.9992
Semantic Similarity: 0.9978


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


Question: How does the AI Act regulate the use of AI in law enforcement and public safety?
Summary: The AI Act imposes strict regulations on the use of AI systems in law enforcement and public safety, particularly those used for predictive policing, biometric identification and surveillance. Law enforcement agencies must conduct a detailed risk assessment and implement safeguards to ensure that the use is necessary, proportionate, and respectful of human rights. The AI Act considers high-risk AI systems to be subject to rigorous scrutiny to ensure they do not infringe on fundamental rights, such as privacy and non-discrimination. 
Cosine Similarity: 0.9952
Semantic Similarity: 0.9857


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


Question: How does the AI Act address the issue of bias and discrimination in AI systems?
Summary: The AI Act requires AI systems to be designed and developed in a manner that prevents, identifies, and mitigates biases that could lead to discriminatory outcomes. Providers must take measures to ensure that AI systems do not produce results that unfairly disadvantage individuals or groups based on protected characteristics such as race, gender, or religion. This includes using diverse datasets, conducting bias audits, and implementing corrective measures to address any identified biases. The Act also emphasizes the importance of human oversight in preventing and addressing bias.
Cosine Similarity: 0.9955
Semantic Similarity: 0.9725


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)


Question: What is the role of the European Artificial Intelligence Board (EAIB) under the AI Act?
Summary: The European Artificial Intelligence Board (EAIB) was established under the AI Act to facilitate cooperation and coordination among national supervisory authorities and the European Commission. The EAIB is responsible for issuing guidelines, recommendations, and best practices on the implementation of the Act. It also provides advice to the EU Commission on AI-related matters. EAIB also plays a role in resolving disputes between national authorities and ensuring consistency in the interpretation and enforcement of the act.       .  
Cosine Similarity: 0.9976
Semantic Similarity: 0.9951


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


Question: How does the AI Act impact the use of AI in healthcare?
Summary: The AI Act recognizes the potential benefits of AI in healthcare, but also the risks associated with the use of AI. AI systems used in healthcare are classified as high-risk and subject to strict requirements. These include ensuring accuracy and reliability of AI systems, maintaining human oversight, and safeguarding patient data. The Act also emphasizes the importance of transparency and informed consent in the using AI in the healthcare sector, as well as the need to be transparent and informed about the process of consenting to its use.
Cosine Similarity: 0.9924
Semantic Similarity: 0.9671


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Question: How does the AI Act address the issue of AI literacy and public awareness?
Summary: The AI Act encourages initiatives to promote AI literacy and public awareness. It also promotes public consultations and stakeholder engagement to ensure that the perspectives of various groups, including civil society, are considered in the development and deployment of AI systems. The Act calls for the development of educational programs and resources to help individuals understand the capabilities, limitations, and risks associated with AI.       .   and it also promotes the public consultation and engagement on AI systems on behalf of civil society.
Cosine Similarity: 0.9835
Semantic Similarity: 0.9793


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 81. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


Question: What measures does the AI Act include to support the ethical development of AI?
Summary: The AI Act supports the ethical development of AI. It encourages the adoption of voluntary codes of conduct, fostering research on ethical AI and promoting the development of systems that align with European values and fundamental rights. It also supports the creation of regulatory sandboxes to allow developers to experiment with innovative AI solutions in a controlled environment. It ensures that ethical considerations are integrated into the design and deployment of AI technologies. It emphasizes the importance of human-centric AI where AI systems enhance human capabilities and well-being while respecting human dignity and autonomy.
Cosine Similarity: 0.9917
Semantic Similarity: 0.9937
Processing DMA...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


Question: What criteria are used to define a 'gatekeeper' under the Digital Markets Act?
Summary: A gatekeeper is defined as a provider of core platform services that has a significant impact on the internal market and serves as an important gateway for business users to reach end users. The criteria include having a strong economic position, a large number of users, and control over an ecosystem that is difficult for other companies to contest. The gatekeeper has to enjoy an entrenched and durable position in the market and has to have a significant economic position and a significant influence on the ecosystem.  
Cosine Similarity: 0.9589
Semantic Similarity: 0.8011


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


Question: How does the DMA propose to regulate the behavior of gatekeepers in digital markets?
Summary: The DMA imposes obligations on gatekeepers to prevent them from engaging in unfair practices that harm competition and consumers. Gatekeepers are also required to provide data portability, offer fair terms to business users, and ensure transparency in their operations. The DMA also prohibits gatekeepers from favoring their own services over those of competitors (self-preferencing) and allows them to allow interoperability with third-party services (interoperability). It also prohibits them from unfairly limiting access to their platforms.
Cosine Similarity: 0.9928
Semantic Similarity: 0.9880


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


Question: What are the key obligations imposed on gatekeepers by the DMA?
Summary: The key obligations for gatekeepers under the DMA include prohibitions on combining personal data from different sources without user consent, restrictions on pre-installing software or apps, and requirements to allow business users access to data generated on their platform. Gatekeepers must also ensure that their platforms are open and interoperable with third-party services and they are prohibited from using non-public data from their business users to compete against them.      Â   .  
Cosine Similarity: 0.9985
Semantic Similarity: 0.9994


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 94. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


Question: How does the DMA aim to prevent unfair practices in the digital market?
Summary: The DMA aims to prevent unfair practices by setting out clear rules for gatekeepers. The European Commission is empowered to investigate and sanction gatekeepers that do not comply with these rules. The rules prohibit self-preferencing, restrictions on unfair terms and conditions for business users and requirements for transparency in how gatekeepers operate. The DMA also ensures that gatekeepers cannot use their dominant position to stifle competition or innovation by smaller firms. The EU Commission can investigate and sanctions gatekeepers who violate these rules and does not comply.
Cosine Similarity: 0.9930
Semantic Similarity: 0.9578


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


Question: What enforcement mechanisms are included in the DMA to ensure compliance by gatekeepers?
Summary: The DMA includes robust enforcement mechanisms. The European Commission can impose fines of up to 10% of the gatekeeper’s total worldwide annual turnover for non-compliance. The Commission can also impose additional penalties, including structural remedies, such as the divestiture of businesses. The DMA allows for periodic penalty payments to ensure that gatekeepers comply with the obligations and prohibitions set out in the DMA.    .   , a European Commission spokesperson confirmed that the Commission is not aware of this information. 
Cosine Similarity: 0.9850
Semantic Similarity: 0.9655


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


Question: How does the DMA address the issue of self-preferencing by gatekeepers?
Summary: The DMA prohibits gatekeepers from favoring their own products or services over those of competitors on their platforms. The aim is to ensure a level playing field in digital markets based on merit rather than market power of the gatekeeper. The prohibition on self-preferencing is one of the key obligations imposed on gatekeepers to prevent anti-competitive behavior, as explained by the DMA's regulations.       in the regulation, the gatekeepers can't favor their products over others. 
Cosine Similarity: 0.9821
Semantic Similarity: 0.9612


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


Question: What are the criteria for identifying core platform services under the DMA?
Summary: Core platform services under the DMA include a range of digital services that serve as important gateways for business users to reach end users. These services include online intermediation services, such as app stores and marketplaces, online search engines, social networking services, video-sharing platform services, number-independent interpersonal communication services, operating systems, cloud computing services, and advertising services. A service is considered a core platform service if it has a significant impact on the internal market and is an essential gateway for business user to access end users, according to DMA.
Cosine Similarity: 0.9993
Semantic Similarity: 0.9993


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question: How does the DMA promote interoperability between digital services?
Summary: The DMA requires gatekeepers to ensure that their core platform services can interact with third-party services. The goal is to prevent gatekeepers from locking in users and business users to their platforms and to enable competition by allowing new entrants and smaller competitors to offer complementary or competing services. Interoperability is seen as a key measure to promote innovation and consumer choice in digital markets and is seen to promote competition in digital market. The necessary technical interfaces and documentation must be available to allow for interoperability.
Cosine Similarity: 0.9903
Semantic Similarity: 0.9313


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


Question: What obligations does the DMA impose on gatekeepers regarding data access and portability?
Summary: The DMA imposes obligations on gatekeepers to provide business users and end users with access to the data generated through their interactions on the platform. This includes providing data in a structured, commonly used, and machine-readable format to facilitate data portability. Gatekeepers are also required to allow business users access to data that is necessary for the development and improvement of their own products and services. These obligations are intended to prevent gatekeepers from using their control over data to stifle competition and innovation.
Cosine Similarity: 0.9999
Semantic Similarity: 1.0000


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 114. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


Question: How does the DMA address the issue of tying and bundling practices by gatekeepers?
Summary: The DMA prohibits gatekeepers from tying and bundling practices that require users to purchase or use additional services as a condition for accessing the gatekeeper's core platform service. The prohibition is intended to prevent gatekeepers leveraging their market power to extend their dominance into other markets and to ensure that users have the freedom to choose the services they want to use. For example, a gatekeeper cannot require a user to install or use a specific app or service as a precondition for using their platform.
Cosine Similarity: 0.9928
Semantic Similarity: 0.9888


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)


Question: What are the consequences for gatekeepers that fail to comply with the DMA?
Summary: The DMA is designed to prevent gatekeepers from engaging in anti-competitive behavior. Gatekeepers that fail to comply with the obligations and prohibitions set out in the DMA face fines of up to 10% of their total worldwide annual turnover and structural remedies, such as the divestiture of parts of the business. The DMA also provides for periodic penalty payments to ensure that gatekeepers comply with obligations on an ongoing basis. The European Commission can impose additional measures in cases of repeated non-compliance.
Cosine Similarity: 0.9695
Semantic Similarity: 0.9348


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


Question: How does the DMA enhance consumer protection in digital markets?
Summary: The DMA promotes transparency in how gatekeepers operate. It also promotes interoperability and data portability. It aims to improve the quality and affordability of digital services for consumers by fostering competition and promoting choice and control over the digital services they use. It ensures that gatekeepers do not engage in practices that harm consumers, such as self-preferencing, unfair terms and conditions, or limiting access to data. It requires them to provide clear and accessible information about their practices.
Cosine Similarity: 0.9831
Semantic Similarity: 0.9220


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Question: How does the DMA address the issue of access to business users' data by gatekeepers?
Summary: The DMA imposes obligations on gatekeepers to provide business users with access to the data they generate through their interactions on the platform. This includes access to aggregated and anonymized data, as well as data that is essential for the development and improvement of the business user's products and services. The DMA also prohibits gatekeepers from using non-public data from business users to compete against them. It ensures that gatekeepers don't exploit their access to data to gain an unfair competitive advantage.
Cosine Similarity: 0.9982
Semantic Similarity: 0.9994


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Question: How does the DMA ensure fair and non-discriminatory access to core platform services?
Summary: The DMA requires gatekeepers to ensure that their core platform services are offered on fair, reasonable and non-discriminatory terms. It also requires them to provide transparency in how they operate, including clear and accessible information about the terms and conditions for using their services. The measures are intended to prevent gatekeepers from abusing their market power and to ensure a level playing field in digital markets. The DMA is also trying to prevent unfair terms or conditions being imposed on business users.
Cosine Similarity: 0.9892
Semantic Similarity: 0.9744


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 107. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


Question: How does the DMA promote innovation and competition in digital markets?
Summary: The DMA promotes innovation and competition by preventing gatekeepers from engaging in practices that stifle competition. The DMA also promotes interoperability and data portability. These measures are designed to foster a dynamic and competitive digital market that benefits consumers and businesses alike.       .   for more information visit: www.dma.org/dma-online-competition.uk.  For more information on the DMA's policies, visit: http://www.dMA.org/.
Cosine Similarity: 0.9655
Semantic Similarity: 0.9476


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


Question: How does the DMA address the issue of mergers and acquisitions by gatekeepers?
Summary: The DMA requires gatekeepers to inform the European Commission of any intended mergers, acquisitions, or concentrations involving other providers of core platform services or digital services. This notification requirement allows the Commission to assess whether the proposed transaction would undermine the objectives of the DMA. The DMA's provisions on mergers and acquisitions are intended to prevent gatekeepers from consolidating their dominance through strategic acquisitions and to ensure that competition remains robust in digital markets. The Commission will consider the notification if it is necessary.
Cosine Similarity: 0.9947
Semantic Similarity: 0.9939


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Question: How does the DMA address the issue of dark patterns and deceptive design practices by gatekeepers?
Summary: The DMA prohibits gatekeepers from using dark patterns and deceptive design practices that manipulate or deceive users into making decisions that are not in their best interests. The DMA requires gatekeepers to provide clear and accessible information to users and to design their interfaces in a way that respects user autonomy and choice. The provisions are intended to protect consumers from manipulative practices and to ensure that digital services are transparent and user-friendly.       .   and to make sure they are user friendly. 
Cosine Similarity: 0.9859
Semantic Similarity: 0.9826


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


Question: How does the DMA promote transparency in digital advertising?
Summary: The DMA promotes transparency in digital advertising by requiring gatekeepers to provide advertisers and publishers with access to data related to their advertising campaigns. Gatekeepers must also ensure that their advertising services are offered on fair, reasonable, and non-discriminatory terms and they are prohibited from using non-public data to gain an unfair advantage in the advertising market. The provisions are intended to promote competition and transparency in the digital advertising market to ensure that advertisers have the information they need to make informed decisions.
Cosine Similarity: 0.9972
Semantic Similarity: 0.9929


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


Question: How does the DMA address the issue of access to core platform services by end users?
Summary: The DMA ensures that end users have access to core platform services on fair and non-discriminatory terms. Gatekeepers are prohibited from restricting or degrading the quality of access to their services or from engaging in practices that limit user choice. The DMA also promotes data portability, allowing end users to transfer their data to other services and take advantage of competitive offerings. These provisions are designed to enhance user choice and control over the digital services they use and to promote competition in the market.
Cosine Similarity: 0.9942
Semantic Similarity: 0.9953


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 92. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)


Question: What role does the European Commission play in enforcing the DMA?
Summary: The European Commission is responsible for enforcing the DMA. The Commission has the authority to impose fines, periodic penalty payments and structural remedies on gatekeepers that violate DMA's obligations and prohibitions. It also has the power to initiate market investigations to assess whether new services should be designated as core platform services or if additional obligations should be imposed. The DMA is designed to be robust and effective, ensuring that gatekeepers operate in a manner that promotes competition and innovation in digital markets. 
Cosine Similarity: 0.9888
Semantic Similarity: 0.9813
Processing DSA...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question: What are the main responsibilities of online platforms under the Digital Services Act?
Summary: Online platforms are responsible for taking effective measures to mitigate risks related to illegal content, ensure the safety of users and protect fundamental rights. Platforms must implement mechanisms for reporting and removing illegal content and provide users with clear terms and conditions. They are also required to assess and mitigate systemic risks, such as the spread of disinformation and harmful content, and deal with complaints and appeals. DSA requires platforms that reach a significant number of users to take effective measures.      
Cosine Similarity: 0.9945
Semantic Similarity: 0.9505


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Question: How does the DSA aim to protect users from illegal content on digital platforms?
Summary: The DSA aims to protect users from illegal content. Platforms must act expeditiously to remove or disable access to illegal content upon receiving a notice. They must cooperate with law enforcement and provide transparency reports on their content moderation activities. They also must take proactive measures to prevent the spread of illegal content and ensure that their algorithms do not promote harmful or illegal content, according to the DSA's requirements.       .   dsa.org.uk.
Cosine Similarity: 0.9698
Semantic Similarity: 0.9686


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


Question: What transparency requirements are imposed on online platforms by the DSA?
Summary: The DSA imposes extensive transparency requirements on online platforms. Platforms must publish transparency reports detailing the number of content removal actions and the outcomes of user appeals. They must also disclose how their content moderation systems and recommendation algorithms work. Users must be informed about the terms and conditions governing the use of the platform and any changes made to these terms. The platforms must also provide clear information about the advertising they serve, including the identity of advertisers and the targeting criteria used. The DSA requires platforms to publish these reports.
Cosine Similarity: 0.9926
Semantic Similarity: 0.9796


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)


Question: How does the DSA propose to handle the dissemination of harmful content?
Summary: The DSA proposes to handle the dissemination of harmful content by requiring platforms to assess the risks and take appropriate measures to mitigate these risks. Platforms must implement safeguards to ensure that their algorithms do not promote harmful content and provide users with tools to control the content they are exposed to. The DSA encourages platforms to cooperate with trusted flaggers and fact-checkers to identify and address harmful content more effectively. In cases where platforms fail to mitigate risks adequately, they may be subject to regulatory action, including fines and other penalties.
Cosine Similarity: 0.9954
Semantic Similarity: 0.9838


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


Question: What measures does the DSA include to protect freedom of expression while combating illegal content?
Summary: The DSA includes measures to protect freedom of expression. Platforms must provide users with clear explanations when content is removed or access is restricted. The DSA also requires platforms to ensure that content moderation processes are fair and transparent, with safeguards in place to prevent the arbitrary removal of content. DSA encourages platforms to develop codes of conduct in collaboration with stakeholders to balance the need to combat illegal content with the protection of free speech. The code of conduct should be in line with the DSA's requirements.
Cosine Similarity: 0.9810
Semantic Similarity: 0.9787


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 130. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=65)


Question: How does the DSA address the issue of content moderation on online platforms?
Summary: The DSA requires online platforms to implement content moderation policies that are transparent, consistent and aligned with fundamental rights. Platforms must establish clear terms and conditions for content moderation and provide users with detailed information on how content is assessed, removed, or restricted. The DSA also mandates that platforms implement mechanisms for users to appeal content moderation decisions. These measures aim to create a fair and accountable content moderation system that respects freedom of expression while combating illegal content.    .   i  
Cosine Similarity: 0.9881
Semantic Similarity: 0.9959


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


Question: What obligations do very large online platforms (VLOPs) have under the DSA?
Summary: VLOPs with more than 45 million users in the EU have additional obligations under the DSA. They must conduct annual risk assessments to identify and mitigate systemic risks. They are also required to provide greater transparency in their content recommendation algorithms, offer users more control over the content they see, and cooperate with authorities to prevent and address systemic risks, among other things. The obligations are intended to ensure that VLOP's operate in a manner that is safe, transparent, and respectful of fundamental rights.
Cosine Similarity: 0.9647
Semantic Similarity: 0.9363


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


Question: How does the DSA enhance the protection of minors online?
Summary: The DSA includes provisions to enhance the protection of minors online. Platforms must implement measures to ensure that their services are safe for minors, including age-appropriate content moderation, parental controls and restrictions on targeted advertising to minors. They must also provide clear and accessible information to minors and their parents about the risks associated with online activities and how to protect themselves. The measures are designed to create a safer online environment for children and to empower them and their guardians to make informed decisions. 
Cosine Similarity: 0.9925
Semantic Similarity: 0.9858


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 110. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


Question: What are the transparency obligations for online platforms regarding their algorithms?
Summary: The DSA imposes transparency obligations on online platforms to provide clear and accessible information about how their algorithms work. Platforms must explain the criteria and logic behind their algorithms. VLOPs have additional obligations to conduct algorithmic audits and to allow independent researchers to assess the impact of their algorithms on society. These transparency measures are intended to increase accountability and trust in the digital ecosystem and increase trust in online platforms.    .   and the DSA's transparency obligations. 
Cosine Similarity: 0.9888
Semantic Similarity: 0.9814


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


Question: How does the DSA address the issue of disinformation and fake news on digital platforms?
Summary: The DSA requires platforms to take proactive measures to combat the spread of disinformation and fake news. This includes implementing mechanisms to detect, assess, and mitigate the risks associated with disinformation, collaborating with independent fact-checkers, and providing users with accurate information and context. Platforms must also ensure that their content moderation and recommendation systems do not amplify or promote disinformation. The DSA promotes transparency by requiring platforms to report on their efforts to combat disinformation and to provide users with tools to identify and report false information.
Cosine Similarity: 0.9845
Semantic Similarity: 0.9643


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


Question: What role do trusted flaggers play under the DSA?
Summary: The DSA recognizes the role of trusted flaggers as important partners in content moderation. Trusted flaggers are granted priority in the notice-and-action mechanisms, so their reports are processed more quickly and with higher accuracy. Platforms must ensure that the flaggers' reports are handled by experienced moderators and receive feedback on the actions taken. The designation is intended to improve the efficiency and effectiveness of content moderation, particularly in combating illegal content and harmful activities online.  . 
Cosine Similarity: 0.9904
Semantic Similarity: 0.9753


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Question: How does the DSA promote the accountability of online platforms?
Summary: The DSA promotes accountability by imposing rigorous reporting and transparency requirements on online platforms. VLOPs are required to undergo independent audits of their content moderation and risk management practices. The audits are intended to assess the platform's compliance with the DSA and to identify areas for improvement. The DSA aims to build trust in the digital environment and ensure that platforms act responsibly and are transparent and accountable.       .   and to ensure that they act responsibly. 
Cosine Similarity: 0.9752
Semantic Similarity: 0.8987


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


Question: What are the penalties for non-compliance with the DSA?
Summary: The DSA provides for substantial penalties for non-compliance, including fines of up to 6% of the platform's total worldwide annual turnover. The enforcement of the DSA is overseen by national regulatory authorities, which have the power to investigate and sanction platforms that violate the regulation. The penalties are designed to ensure that platforms take their obligations seriously and that the provisions of DSA's provisions are effectively implemented. In cases of repeated or severe non- compliance, additional measures can be taken, such as temporary suspension of services.
Cosine Similarity: 0.9934
Semantic Similarity: 0.9872


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Question: How does the DSA address the issue of illegal goods, services, and content online?
Summary: The DSA requires platforms to implement measures to detect and remove illegal goods, services, and content from their services. Platforms must also provide users with clear mechanisms to report illegal goods and services. They must act expeditiously to remove or disable access to such content. The DSA's provisions are designed to protect consumers and ensure that online marketplaces operate in a safe and lawful manner. It is intended to ensure that sellers and service providers on their platforms are properly identified and comply with applicable laws and regulations.
Cosine Similarity: 0.9970
Semantic Similarity: 0.9904


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 107. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


Question: How does the DSA support the rights of consumers in the digital marketplace?
Summary: The DSA's consumer protection provisions are designed to create a safe and transparent digital marketplace. Online platforms must disclose information about the identity of sellers, the terms and conditions of transactions, and the nature of the goods and services offered. Consumers must also be informed about their rights, including the right to withdraw from a transaction, the right of a refund, and to access effective dispute resolution mechanisms.       .   for confidential support call the Samaritans on 08457 90 90 90, visit a local Samaritans branch or click here for details.
Cosine Similarity: 0.9574
Semantic Similarity: 0.8163


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)


Question: How does the DSA handle the issue of online harassment and abuse?
Summary: The DSA requires platforms to implement measures to combat online harassment and abuse. Platforms must act swiftly to remove or disable access to content that constitutes harassment or abuse, and they must provide support to victims. The DSA also encourages platforms to collaborate with law enforcement and civil society organizations to address online harassment. The measures are intended to protect users from harm and to promote a respectful and inclusive digital space for all users. It is also intended to develop best practices for creating a safe online environment.
Cosine Similarity: 0.9941
Semantic Similarity: 0.9908


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


Question: How does the DSA ensure that users have control over their data and privacy?
Summary: The DSA requires platforms to provide clear and accessible information about how user data is collected, processed and used. Platforms must implement privacy by design and privacy-by-default principles. They must also provide users with tools to manage their privacy settings and to control the use of their data for targeted advertising. The DSA enhances user control over data and privacy by requiring platforms to protect users' privacy from the outset and to inform them about their rights to access, rectify, and delete their data.
Cosine Similarity: 0.9918
Semantic Similarity: 0.8917


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 92. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)


Question: How does the DSA address the issue of algorithmic transparency and accountability?
Summary: The DSA requires platforms to provide transparency about how their algorithms work. Platforms must explain the logic behind their algorithms and provide users with options to control how algorithms affect their online experience. They must also conduct regular audits of their algorithms to assess their impact on users and society. The audits must be conducted by independent third parties and evaluate whether the algorithms are fair, non-discriminatory, and aligned with fundamental rights. The DSA also requires VLOPs to explain the criteria used for content recommendation, ranking, and removal.
Cosine Similarity: 0.9961
Semantic Similarity: 0.9776


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Your max_length is set to 350, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


Question: What are the requirements for online platforms to cooperate with regulatory authorities under the DSA?
Summary: The DSA requires online platforms to cooperate with regulatory authorities. Platforms must respond promptly to requests from authorities and facilitate inspections and investigations. They must also provide transparency reports and undergo independent audits to demonstrate compliance with the regulation. Cooperation with authorities is essential for ensuring that platforms meet their obligations and that the DSA's provisions are effectively enforced, as stated in DSA regulations.    .   and the platforms must provide access to data, records, and information necessary for monitoring and enforcement purposes. 
Cosine Similarity: 0.9952
Semantic Similarity: 0.9889


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Question: How does the DSA promote the development of codes of conduct for online platforms?
Summary: The DSA encourages the development of codes of conduct for online platforms to address issues such as content moderation, algorithmic transparency, and the protection of minors. The DSA promotes the adoption of these voluntary measures to ensure that platforms operate in a responsible and ethical manner. The codes provide a framework for best practices and help platforms to align their operations with the DSA's objectives. They also allow for flexibility and innovation. The code was developed in collaboration with industry stakeholders, civil society organizations, and regulatory authorities.
Cosine Similarity: 0.9921
Semantic Similarity: 0.9828

Average Cosine Similarity for GDPR: 0.9880
Average Semantic Similarity for GDPR: 0.9659

Average Cosine Similarity for AI_ACT: 0.9874
Average Semantic Similarity for AI_ACT: 0.9744

Average Cosine Similarity for DMA: 0.9879
Average Semantic Si

In [17]:
import time

# Function to measure query latency and calculate averages
def measure_and_average_query_latency(laws_info, embedding_model, top_k=1):
    latency_results = {
        'gdpr': [],
        'ai_act': [],
        'dma': [],
        'dsa': []
    }
    all_latencies = []

    for law, info in laws_info.items():
        class_name = info['collection_name']
        print(f"\nMeasuring query latency for {law.upper()} collection:")
        
        for qa in info['questions_answers']:
            query = qa['question']

            # Generate query embedding
            query_embedding = embedding_model.encode([query], convert_to_tensor=False)[0]

            # Record start time
            start_time = time.time()

            # Query Weaviate to get the most relevant chunk
            results = search_weaviate(query_embedding, class_name, top_k)

            # Record end time
            end_time = time.time()

            # Calculate latency
            latency = end_time - start_time
            latency_results[law].append(latency)
            all_latencies.append(latency)

            print(f"Query: {query}")
            print(f"Latency: {latency:.4f} seconds")
            print("----\n")
    
    # Calculate and print average latency for each law
    for law in latency_results:
        if latency_results[law]:  # Check if the list is not empty
            avg_latency = sum(latency_results[law]) / len(latency_results[law])
            print(f"{law.upper()} Average Query Latency: {avg_latency:.4f} seconds")
        else:
            print(f"{law.upper()} has no recorded latencies.")

    # Calculate and print the overall average latency across all laws
    if all_latencies:
        overall_avg_latency = sum(all_latencies) / len(all_latencies)
        print(f"\nOverall Average Query Latency: {overall_avg_latency:.4f} seconds")
    else:
        print("No latencies recorded across all laws.")

# Run the latency measurement and averaging function
measure_and_average_query_latency(laws_info, embedding_model, top_k=1)


NameError: name 'embedding_model' is not defined

# working with the other 80 (csv)

In [38]:
# Function to load questions from a CSV file
def load_questions_from_csv(file_path):
    df = pd.read_csv(file_path)
    return df['Question'].tolist()  # Extract 'Question' column as a list

# Function to generate embeddings
def generate_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()  # Use [CLS] token embedding
    return embedding.tolist()  # Convert to a list for Pinecone

# Function to query Pinecone for the most relevant chunk
def query_pinecone(question, index, namespace, top_k=3):
    # Generate embedding for the question
    question_embedding = generate_bert_embedding(question, text_tokenizer, embedding_model)
    
    # Query Pinecone
    results = index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True
    )
    
    # Display results
    if results and "matches" in results:
        for match in results["matches"]:
            print(f"Score: {match['score']:.4f}")
            print(f"Text: {match['metadata'].get('text', 'No text found')}\n")
    else:
        print("No matches found.")

# Test querying GDPR namespace
question = "What are the key principles of data protection?"
query_pinecone(question, index, namespace="gdpr", top_k=3)

# Main function to process CSV and retrieve relevant chunks
def process_csv_and_retrieve_chunks(file_path, namespace):
    # Step 1: Load questions from CSV
    questions = load_questions_from_csv(file_path)

    print(f"Processing questions from {file_path} in namespace '{namespace}'...\n")

    # Step 2: Process each question
    for i, question in enumerate(questions):
        print(f"Question {i + 1}: {question}")
        
        # Generate question embedding
        question_embedding = generate_bert_embedding(question, tokenizer, model)
        
        # Query Pinecone
        results = query_pinecone(question_embedding, index, namespace, top_k=1)
        
        if results:
            print(f"Top Match for Question {i + 1}:")
            for match in results:
                print(f"  - Score: {match['score']:.4f}")
                # Safely handle cases where 'metadata' is None
                if match['metadata'] is not None:
                    print(f"  - Text: {match['metadata'].get('text', 'N/A')}")
                else:
                    print("  - Metadata: None")
        else:
            print("  No relevant matches found.\n")
        print("----\n")


# Example Usage
# Provide the CSV file paths and corresponding namespaces
csv_files_and_namespaces = {
    "gdpr": "/kaggle/input/english-dataset/gdpr_test_data (1) (1).csv",
    "ai_act": "/kaggle/input/english-dataset/ai_test_data (1) (1).csv",
    "dma": "/kaggle/input/english-dataset/digital_marketing_test_data (1) (1).csv",
    "dsa": "/kaggle/input/english-dataset/digital_services_test_data (1) (1).csv"
}

for namespace, csv_file in csv_files_and_namespaces.items():
    process_csv_and_retrieve_chunks(csv_file, namespace)

Score: 0.8729
Text: (157) By coupling information from registries, researchers can obtain new knowledge of great value with regard to widespread medical conditions such as cardiovascular disease, cancer and depression. On the basis of registries, research results can be enhanced, as they draw on a larger population. Within social science, research on the basis of registries enables researchers to obtain essential knowledge about the long-term correlation of a number of social conditions such as unemployment and education with other life conditions. Research results obtained through registries provide solid, high-quality knowledge which can provide the basis for the formulation and implementation of knowledge-based policy, improve the quality of life for a number of people and improve the efficiency of social services. In order to facilitate scientific research, personal data can be processed for scientific research purposes, subject to appropriate conditions and safeguards set out in U

IndexError: index out of range in self

In [45]:
# Function to upsert chunks into Pinecone with metadata
def upsert_chunks_to_pinecone(index, namespace, chunks):
    vectors = []
    for i, chunk in enumerate(chunks):
        embedding = generate_bert_embedding(chunk, text_tokenizer, embedding_model)
        vectors.append({
            "id": f"{namespace}_id_{i}",
            "values": embedding,
            "metadata": {"text": chunk}  # Attach text as metadata
        })
    index.upsert(vectors=vectors, namespace=namespace)
    print(f"Upserted {len(vectors)} vectors to namespace '{namespace}'.")
# Debug metadata
for match in results["matches"]:
    if match['metadata'] is None:
        print(f"No metadata found for vector ID: {match['id']}")
    else:
        print(f"Retrieved text: {match['metadata'].get('text', 'No text found')}")


Retrieved text: Data protection by design ensures privacy safeguards are built into systems.
Retrieved text: The supervisory authority monitors compliance with GDPR provisions.


In [43]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pinecone

summarizer = pipeline("summarization", model="t5-base", device=0)

# Load the BERT tokenizer and model for cosine similarity
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

sbert_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device=0)

# Function to load questions and answers from CSV
def load_questions_answers_from_csv(file_path):
    df = pd.read_csv(file_path)
    qa_pairs = [{'question': row['Question'], 'answer': row['Answer']} for _, row in df.iterrows()]
    return qa_pairs

# Load data for laws
laws_info = {
    'gdpr': {
        'questions_answers': load_questions_answers_from_csv('/kaggle/input/english-dataset/gdpr_test_data (1) (1).csv')
    },
    'ai_act': {
        'questions_answers': load_questions_answers_from_csv('/kaggle/input/english-dataset/ai_test_data (1) (1).csv')
    },
    'dma': {
        'questions_answers': load_questions_answers_from_csv('/kaggle/input/english-dataset/digital_marketing_test_data (1) (1).csv')
    },
    'dsa': {
        'questions_answers': load_questions_answers_from_csv('/kaggle/input/english-dataset/digital_services_test_data (1) (1).csv')
    }
}

# Generate BERT embedding
def generate_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()  # CLS token
    return embedding.tolist()  # Convert to Python list


# Summarize text
def summarize_text(text, max_length=350, min_length=100):
    if not isinstance(text, str) or not text.strip():
        print(f"Invalid text for summarization: {text}")
        return None
    try:
        summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        print(f"Summarization failed: {e}")
        return None


# Query Pinecone
def query_pinecone(question, index, namespace, top_k=3):
    # Generate embedding for the question
    question_embedding = generate_bert_embedding(question, text_tokenizer, embedding_model)
    
    # Query Pinecone
    results = index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True
    )
    
    # Display results
    if results and "matches" in results:
        for match in results["matches"]:
            print(f"Score: {match['score']:.4f}")
            print(f"Text: {match['metadata'].get('text', 'No text found')}\n")
    else:
        print("No matches found.")

# Test querying GDPR namespace
question = "What are the key principles of data protection?"
query_pinecone(question, index, namespace="gdpr", top_k=3)


# Calculate cosine similarity
def calculate_cosine_similarity(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

# Calculate semantic similarity
def calculate_semantic_similarity(reference_text, summary_text, model):
    embeddings = model.encode([reference_text, summary_text])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

# Embed, summarize, and compare
def embed_summarize_and_compare_all_laws(laws_info, index, top_k=1):
    similarities = {law: {'cosine': [], 'semantic': []} for law in laws_info}

    for law, info in laws_info.items():
        print(f"\nProcessing {law.upper()} collection:")

        for qa in info['questions_answers']:
            question = qa.get('question', "")
            reference_answer = qa.get('answer', "")

            # Skip invalid questions or answers
            if not isinstance(question, str) or not question.strip():
                print(f"Skipping invalid question: {question}")
                continue
            if not isinstance(reference_answer, str) or not reference_answer.strip():
                print(f"Skipping invalid reference answer: {reference_answer}")
                continue

            # Generate question embedding
            question_embedding = generate_bert_embedding(question, tokenizer, model)

            # Retrieve relevant chunks from Pinecone
            results = query_pinecone(question, index, namespace=law, top_k=top_k)
            if results:
                retrieved_text = results[0]['metadata'].get('text', None)
                if not retrieved_text:
                    print(f"No text found in metadata for question: {question}")
                    continue

                # Summarize the retrieved chunk
                summary = summarize_text(retrieved_text)
                if not summary:
                    print(f"Failed to summarize retrieved text for question: {question}")
                    continue

                # Generate embeddings for comparison
                reference_embedding = generate_bert_embedding(reference_answer, tokenizer, model)
                summary_embedding = generate_bert_embedding(summary, tokenizer, model)

                # Calculate similarities
                cosine_sim = calculate_cosine_similarity(reference_embedding, summary_embedding)
                semantic_sim = calculate_semantic_similarity(reference_answer, summary, sbert_model)

                # Store similarities
                similarities[law]['cosine'].append(cosine_sim)
                similarities[law]['semantic'].append(semantic_sim)

                # Print results
                print(f"Question: {question}")
                print(f"Summary: {summary}")
                print(f"Cosine Similarity: {cosine_sim:.4f}")
                print(f"Semantic Similarity: {semantic_sim:.4f}")
                print("----\n")
            else:
                print(f"No relevant chunks found for question: {question} in {law.upper()}.")

    return similarities


# Calculate and print averages
def calculate_and_print_averages(similarities):
    print("\nCalculated Averages:")
    for law, similarity_data in similarities.items():
        if similarity_data['cosine']:
            avg_cosine = sum(similarity_data['cosine']) / len(similarity_data['cosine'])
            print(f"{law.upper()} Average Cosine Similarity: {avg_cosine:.4f}")
        else:
            print(f"No valid cosine similarities found for {law.upper()}.")

        if similarity_data['semantic']:
            avg_semantic = sum(similarity_data['semantic']) / len(similarity_data['semantic'])
            print(f"{law.upper()} Average Semantic Similarity: {avg_semantic:.4f}")
        else:
            print(f"No valid semantic similarities found for {law.upper()}.")

# Example Run
similarities = embed_summarize_and_compare_all_laws(laws_info, index, top_k=1)
calculate_and_print_averages(similarities)



Score: 0.8729
Text: (157) By coupling information from registries, researchers can obtain new knowledge of great value with regard to widespread medical conditions such as cardiovascular disease, cancer and depression. On the basis of registries, research results can be enhanced, as they draw on a larger population. Within social science, research on the basis of registries enables researchers to obtain essential knowledge about the long-term correlation of a number of social conditions such as unemployment and education with other life conditions. Research results obtained through registries provide solid, high-quality knowledge which can provide the basis for the formulation and implementation of knowledge-based policy, improve the quality of life for a number of people and improve the efficiency of social services. In order to facilitate scientific research, personal data can be processed for scientific research purposes, subject to appropriate conditions and safeguards set out in U

# Load and reading the data from the AI_act

# Load and reading the data from the DMA

# Load and reading the data from the DSA

# Splitting the text in chunks using hierarchical chunking based on headers of html and create embeddings from the chunks of GDPR

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceBgeEmbeddings



nltk.download('punkt')


tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz')

def chunk_text_based_on_tokens(text, max_tokens=300):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:

            chunks.append(" ".join(current_chunk))

            current_chunk = [sentence]
            current_length = sentence_length


    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def extract_sections_articles_chapters(soup):
    sections = []
    current_section = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section:
                sections.append(" ".join(current_section))
                current_section = []
            current_section.append(element.get_text())
        else:
            current_section.append(element.get_text())
    if current_section:
        sections.append(" ".join(current_section))
    return sections


with open('/kaggle/input/italian-gdpr/Italian_gdpr.html', 'r', encoding='utf-8') as file:
    html_content = file.read()


soup = BeautifulSoup(html_content, 'html.parser')


sections = extract_sections_articles_chapters(soup)


all_chunks = []
for section in sections:
    all_chunks.extend(chunk_text_based_on_tokens(section))


model_name = "dbmdz/bert-base-italian-xxl-cased"
encode_kwargs = {'normalize_embeddings': True}

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)


embeddings = model_norm.embed_documents(all_chunks)


print(f"Number of chunks: {len(all_chunks)}")
print(f"Sample Embedding: {embeddings[0]}")


for i, chunk in enumerate(all_chunks[:]):
    print(f"Chunk {i+1}:\n{chunk}\n")


# Splitting the text in chunks using hierarchical chunking based on headers of html and create embeddings from the chunks of AI_act

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceBgeEmbeddings



nltk.download('punkt')


tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz')

def chunk_text_based_on_tokens(text, max_tokens=300):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:

            chunks.append(" ".join(current_chunk))

            current_chunk = [sentence]
            current_length = sentence_length


    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def extract_sections_articles_chapters(soup):
    sections = []
    current_section = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section:
                sections.append(" ".join(current_section))
                current_section = []
            current_section.append(element.get_text())
        else:
            current_section.append(element.get_text())
    if current_section:
        sections.append(" ".join(current_section))
    return sections


with open('/kaggle/input/italian-gdpr/Italian_gdpr.html', 'r', encoding='utf-8') as file:
    html_content = file.read()


soup = BeautifulSoup(html_content, 'html.parser')


sections = extract_sections_articles_chapters(soup)


all_chunks = []
for section in sections:
    all_chunks.extend(chunk_text_based_on_tokens(section))


model_name = "dbmdz/bert-base-italian-xxl-cased"
encode_kwargs = {'normalize_embeddings': True}

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)


embeddings = model_norm.embed_documents(all_chunks)


print(f"Number of chunks: {len(all_chunks)}")
print(f"Sample Embedding: {embeddings[0]}")


for i, chunk in enumerate(all_chunks[:]):
    print(f"Chunk {i+1}:\n{chunk}\n")


# Splitting the text in chunks using hierarchical chunking based on headers of html and create embeddings from the chunks of DMA

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceBgeEmbeddings



nltk.download('punkt')


tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz')

def chunk_text_based_on_tokens(text, max_tokens=300):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:

            chunks.append(" ".join(current_chunk))

            current_chunk = [sentence]
            current_length = sentence_length


    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def extract_sections_articles_chapters(soup):
    sections = []
    current_section = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section:
                sections.append(" ".join(current_section))
                current_section = []
            current_section.append(element.get_text())
        else:
            current_section.append(element.get_text())
    if current_section:
        sections.append(" ".join(current_section))
    return sections


with open('/kaggle/input/italian-gdpr/Italian_gdpr.html', 'r', encoding='utf-8') as file:
    html_content = file.read()


soup = BeautifulSoup(html_content, 'html.parser')


sections = extract_sections_articles_chapters(soup)


all_chunks = []
for section in sections:
    all_chunks.extend(chunk_text_based_on_tokens(section))


model_name = "dbmdz/bert-base-italian-xxl-cased"
encode_kwargs = {'normalize_embeddings': True}

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)


embeddings = model_norm.embed_documents(all_chunks)


print(f"Number of chunks: {len(all_chunks)}")
print(f"Sample Embedding: {embeddings[0]}")


for i, chunk in enumerate(all_chunks[:]):
    print(f"Chunk {i+1}:\n{chunk}\n")


# Splitting the text in chunks using hierarchical chunking based on headers of html and create embeddings from the chunks of DSA

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceBgeEmbeddings



nltk.download('punkt')


tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz')

def chunk_text_based_on_tokens(text, max_tokens=300):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:

            chunks.append(" ".join(current_chunk))

            current_chunk = [sentence]
            current_length = sentence_length


    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def extract_sections_articles_chapters(soup):
    sections = []
    current_section = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section:
                sections.append(" ".join(current_section))
                current_section = []
            current_section.append(element.get_text())
        else:
            current_section.append(element.get_text())
    if current_section:
        sections.append(" ".join(current_section))
    return sections


with open('/kaggle/input/italian-gdpr/Italian_gdpr.html', 'r', encoding='utf-8') as file:
    html_content = file.read()


soup = BeautifulSoup(html_content, 'html.parser')


sections = extract_sections_articles_chapters(soup)


all_chunks = []
for section in sections:
    all_chunks.extend(chunk_text_based_on_tokens(section))


model_name = "dbmdz/bert-base-italian-xxl-cased"
encode_kwargs = {'normalize_embeddings': True}

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)


embeddings = model_norm.embed_documents(all_chunks)


print(f"Number of chunks: {len(all_chunks)}")
print(f"Sample Embedding: {embeddings[0]}")


for i, chunk in enumerate(all_chunks[:]):
    print(f"Chunk {i+1}:\n{chunk}\n")


the same with bge 768 dimensions

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceBgeEmbeddings



nltk.download('punkt')


tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz')

def chunk_text_based_on_tokens(text, max_tokens=1150):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:

            chunks.append(" ".join(current_chunk))

            current_chunk = [sentence]
            current_length = sentence_length


    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def extract_sections_articles_chapters(soup):
    sections = []
    current_section = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section:
                sections.append(" ".join(current_section))
                current_section = []
            current_section.append(element.get_text())
        else:
            current_section.append(element.get_text())
    if current_section:
        sections.append(" ".join(current_section))
    return sections


with open('/kaggle/input/italian-gdpr/Italian_gdpr.html', 'r', encoding='utf-8') as file:
    html_content = file.read()


soup = BeautifulSoup(html_content, 'html.parser')


sections = extract_sections_articles_chapters(soup)


all_chunks_768 = []
for section in sections:
    all_chunks_768.extend(chunk_text_based_on_tokens(section))


model_name = "dbmdz/bert-base-italian-xxl-cased"
encode_kwargs = {'normalize_embeddings': True}

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)


embeddings_768 = model_norm.embed_documents(all_chunks_768)


print(f"Number of chunks: {len(all_chunks_768)}")
print(f"Sample Embedding: {embeddings_768[0]}")


for i, chunk in enumerate(all_chunks_768[:]):
    print(f"Chunk {i+1}:\n{chunk}\n")


In [None]:
embeddings_768[8]

In [None]:

if len(embeddings) >= 85:
    chunk_85_embedding = embeddings[100]
    print(f"Embedding for chunk 85: {chunk_85_embedding}")
else:
    print(f"Expected at least 85 chunks, but got {len(embeddings)}")






Working with chroma db as my vector database to store the embeddings and then later retrieve them

In [None]:
!pip install chromadb

In [None]:
import chromadb
chroma_client = chromadb.Client()

do the load collection, instead of delete it, also load the embeddings in chroma db somewhere

In [None]:
collection_name = "embeddings_gdpr_collection_ivf_cosine"


try:
    chroma_client.delete_collection(name=collection_name)
    print(f"Collection {collection_name} deleted successfully.")
except Exception as e:
    print(f"Error deleting collection: {e}")


try:
    collection = chroma_client.create_collection(name=collection_name)
    print(f"Collection {collection_name} created successfully.")
except Exception as e:
    print(f"Error creating collection: {e}")


In [None]:
for i, embedding in enumerate(embeddings):
    collection.add(
        documents=[all_chunks[i]],
        ids=[f"id_{i}"],
        embeddings=[embedding]
    )

cretae embeddings for my query so i can compare it later with the other embeddings stored inside the chrom db

In [None]:
def embed_query(query, model_name):
    query_embedding = model_name.embed_documents([query])
    return query_embedding[0]

In [None]:
def query_chroma_db(query_embedding, collection, top_k=10):
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    return results

Setting the question and getting an answer _(the answer is just raw retrieved from the vector db not from an llm)


In [None]:
user_query = "Quale specifico atto dell'Unione disciplina il trattamento dei dati personali da parte delle autorità competenti per fini di prevenzione, indagine, accertamento e perseguimento di reati o esecuzione di sanzioni penali?"
query_embedding = embed_query(user_query, model_norm)

results = query_chroma_db(query_embedding, collection, top_k=1)

# Combine the retrieved documents into a single context
retrieved_context = " ".join([" ".join(doc) if isinstance(doc, list) else doc for doc in results['documents']])

for result in results['documents']:
    print(result)


qdrant vector database_gdpr

In [None]:
KyBs9rmK8ST2WXJXA5-3AYkLpOeq3mkkcBF53l6RGWN9kdJavYzrzQ

In [None]:
!pip install qdrant-client

In [None]:
from qdrant_client import QdrantClient

# Initialize Qdrant client with URL and API key
qdrant_client = QdrantClient(
    url="https://bab7324d-c9d1-4a0a-a8e7-905ee028b571.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key="hp5BRHchXUfGajedmjkSROQB-7nndLTvRJRf1AmxaKN6u4EP1O3wwA"
)

# Print existing collections
print(qdrant_client.get_collections())


In [None]:
from qdrant_client import QdrantClient, models

# Delete the existing collection if it exists
collection_name = "chunk_embeddings"
try:
    qdrant_client.delete_collection(collection_name=collection_name)
    print(f"Collection {collection_name} deleted successfully.")
except Exception as e:
    print(f"Error deleting collection: {e}")

# Create a new collection with the given name
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=len(embeddings[0]), distance=models.Distance.COSINE),
)

# Prepare the points to be inserted
points = []
for idx, (chunk, embedding) in enumerate(zip(all_chunks, embeddings)):
    point = models.PointStruct(id=idx, vector=embedding, payload={"chunk": chunk})
    points.append(point)

# Insert the points into the collection
qdrant_client.upsert(collection_name=collection_name, points=points)


In [None]:
user_query = "Quale specifico atto dell'Unione disciplina il trattamento dei dati personali da parte delle autorità competenti per fini di prevenzione, indagine, accertamento e perseguimento di reati o esecuzione di sanzioni penali?"

# Generate embedding for the query
query_embedding = model_norm.embed_documents([user_query])[0]

# Perform the search with the query embedding in Qdrant
search_results = qdrant_client.search(
    collection_name=collection_name,
    query_vector=query_embedding,
    limit=5  # Number of closest points to retrieve
)


In [None]:
# Process the search results
retrieved_contexts = []
for result in search_results:
    chunk = result.payload['chunk'] if 'chunk' in result.payload else "No chunk found"
    retrieved_contexts.append(chunk)
    print(f"ID: {result.id}, Score: {result.score}, Chunk: {chunk}")

# Combine the retrieved documents into a single context
retrieved_context = " ".join(retrieved_contexts)
print(f"Retrieved Context: {retrieved_context}")


pinecone database rror

70412735-1b72-47e7-bdea-0b838d28750f
us-east-1

In [None]:
!pip install pinecone-client


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the model and tokenizer
model_name = "dbmdz/bert-base-italian-xxl-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Generate embeddings for a sample input
sample_text = "Questo è un testo di esempio."
inputs = tokenizer(sample_text, return_tensors='pt', truncation=True, padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# The shape of the last hidden state
embedding_shape = outputs.last_hidden_state.shape
embedding_dimension = embedding_shape[-1]

print(f"Embedding dimension: {embedding_dimension}")


In [None]:
import pinecone

# Initialize Pinecone with your details
api_key = "70412735-1b72-47e7-bdea-0b838d28750f"  # Replace with your actual Pinecone API key
environment = "us-east-1"  # Replace with your Pinecone environment

# Create an instance of the Pinecone client
pc = pinecone.Pinecone(api_key=api_key)

# Define the index name and dimension
index_name = "chunk-embeddings-index"
dimension = 768  # Ensure this matches the dimension of your embeddings

# Delete the existing index if it exists
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)


In [None]:
# Create the index with the correct dimension
pc.create_index(
    name=index_name, 
    dimension=dimension, 
    metric='cosine', 
    spec=pinecone.ServerlessSpec(cloud='aws', region='us-east-1')
)

# Connect to the new index
index = pc.Index(index_name)


In [None]:
# Ensure embeddings and all_chunks are already defined
points = []
for idx, (chunk, embedding) in enumerate(zip(all_chunks, embeddings)):
    point = {
        "id": str(idx),
        "values": embedding,
        "metadata": {"chunk": chunk}
    }
    points.append(point)

# Upsert points into the Pinecone index
index.upsert(vectors=points)
print(f"Successfully upserted {len(points)} points into index '{index_name}'.")


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the model and tokenizer for the Italian language
model_name = "dbmdz/bert-base-italian-xxl-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to generate embeddings
def generate_embedding(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return embeddings

# Generate and normalize the embedding for the user query
user_query = "Quale specifico atto dell'Unione disciplina il trattamento dei dati personali da parte delle autorità competenti per fini di prevenzione, indagine, accertamento e perseguimento di reati o esecuzione di sanzioni penali?"
user_query_embedding = generate_embedding(user_query, model, tokenizer)

# Perform a query in Pinecone
k = 5  # Number of nearest neighbors to retrieve
results = index.query(
    vector=user_query_embedding.tolist(), 
    top_k=k, 
    include_values=True, 
    include_metadata=True
)

# Display results
for match in results['matches']:
    print(f"ID: {match['id']}, Score: {match['score']}, Chunk: {match['metadata']['chunk']}")


elastic search

In [None]:
!pip install elasticsearch torch transformers tqdm


In [None]:
Z1ljcUxaRUI4M0diaC02VHVLODM6QXlDN1A5cTlSeXFrWjFKa2ItZUhtZw==
https://6ebd858e9e464039a1f0921f6eeea823.us-central1.gcp.cloud.es.io:443



In [None]:
from elasticsearch import Elasticsearch

# Replace these with your actual Cloud ID, username, and password
cloud_id = "ca7ae35902e64e87ab231c43e81f00aa:dXMtY2VudHJhbDEuZ2NwLmNsb3VkLmVzLmlvJDZlYmQ4NThlOWU0NjQwMzlhMWYwOTIxZjZlZWVhODIzJGZhMjk3YTIwYmUyNTRhNjNhMTM5YTNjYjYxMmJlOGQ5"
es_username = "elastic"
es_password = "F1tXvRlBaVoZgrZWiw9HKh7x"

# Connect to the Elasticsearch instance using Cloud ID and credentials
es = Elasticsearch(
    cloud_id=cloud_id,
    basic_auth=(es_username, es_password)
)

# Test the connection
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch.")

In [None]:
from tqdm import tqdm

# Name of the index you created
index_name = 'gdpr_italian'

# Assuming 'embeddings' is a list of vectors and 'all_chunks' is a list of text chunks
for i, (chunk, embedding) in tqdm(enumerate(zip(all_chunks, embeddings))):
    document = {
        "chunk": chunk,
        "embedding": embedding
    }
    es.index(index=index_name, id=i, body=document)

print("Data indexed successfully.")


In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "dbmdz/bert-base-italian-xxl-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().tolist()
    return embeddings

query_text = "Quale specifico atto dell'Unione disciplina il trattamento dei dati personali da parte delle autorità competenti per fini di prevenzione, indagine, accertamento e perseguimento di reati o esecuzione di sanzioni penali?"
query_embedding = encode_text(query_text)


In [None]:
search_query = {
    "size": 5,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_embedding}
            }
        }
    }
}

response = es.search(index=index_name, body=search_query)
results = response['hits']['hits']

for result in results:
    print(result['_source']['chunk'])


milvus db vector db

In [None]:
!pip install pymilvus==2.4.3


In [None]:
from pymilvus import MilvusClient

# Initialize Milvus client with the provided URI and token
client = MilvusClient(uri="https://in03-175f9dee0a08214.api.gcp-us-west1.zillizcloud.com", token="dde62198674a71bbbf3f9886f4c146382d5f1683699b9418fe909f22a78e33ccb26ea6eb4490d0ab51154ac66c5f764f5e381d6e")

# Describe the existing collection
collection_name = "gdpr.it"
collection_info = client.describe_collection(collection_name=collection_name)
print(collection_info)


In [None]:
ENDPOINT = "https://in03-175f9dee0a08214.api.gcp-us-west1.zillizcloud.com"
TOKEN = "dde62198674a71bbbf3f9886f4c146382d5f1683699b9418fe909f22a78e33ccb26ea6eb4490d0ab51154ac66c5f764f5e381d6e"


In [None]:
from pymilvus import connections

ENDPOINT="https://in03-175f9dee0a08214.api.gcp-us-west1.zillizcloud.com"
TOKEN = "dde62198674a71bbbf3f9886f4c146382d5f1683699b9418fe909f22a78e33ccb26ea6eb4490d0ab51154ac66c5f764f5e381d6e"

connections.connect(
   uri=ENDPOINT,
   token=TOKEN)

In [None]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection

# Replace with your Zilliz Cloud instance details
uri = "https://in03-175f9dee0a08214.api.gcp-us-west1.zillizcloud.com"
token = "92203da45ce7de07d914861864d56644f25b6ec1288246c70ba41c31c9619068cf92c2513c3a1b03d5f37c68cc6162317522ad5c"

# Connect to the Milvus instance
connections.connect("default", uri=uri, token=token)


trial by  documentation

In [None]:
!git clone https://github.com/zilliztech/cloud-vectordb-examples.git


In [None]:
%cd cloud-vectordb-examples/python


In [None]:
!pip install pymilvus==2.4.3


In [None]:
!git clone https://github.com/zilliztech/cloud-vectordb-examples.git
%cd cloud-vectordb-examples/python



In [None]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Initialize Milvus connection
connections.connect(
    alias="default", 
    uri="https://in03-175f9dee0a08214.api.gcp-us-west1.zillizcloud.com", 
    token="dde62198674a71bbbf3f9886f4c146382d5f1683699b9418fe909f22a78e33ccb26ea6eb4490d0ab51154ac66c5f764f5e381d6e"
)


In [None]:
from pymilvus import Collection, connections, FieldSchema, CollectionSchema, DataType, utility

# Connect to Milvus
connections.connect("default", uri=uri, token=token)

# Define the schema with the current max_length (5000)
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=768),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=6000),  # Current max_length
    FieldSchema(name="subject", dtype=DataType.VARCHAR, max_length=20)
]
schema = CollectionSchema(fields, "Embeddings collection")

# Check for existing collections and delete if necessary
existing_collections = utility.list_collections()
print(f"Existing collections: {existing_collections}")

collection_name = "embeddings_collection"

if collection_name in existing_collections:
    # Delete the existing collection
    print(f"Deleting collection: {collection_name}")
    collection = Collection(name=collection_name)
    collection.drop()
    print(f"Collection {collection_name} deleted.")

# Create a new collection
collection = Collection(name=collection_name, schema=schema)
print("New collection created.")

# Ensure embeddings_768 and all_chunks_768 have the same length
assert len(embeddings_768) == len(all_chunks_768), "The number of embeddings must match the number of text chunks."

# Truncate text entries that exceed the max_length
max_text_length = 5000
truncated_chunks = [text[:max_text_length] for text in all_chunks_768]

# Prepare the data for insertion
data = [
    {
        "vector": embeddings_768[i],
        "text": truncated_chunks[i],
        "subject": "subject"
    }
    for i in range(len(embeddings_768))
]

# Insert the data into the collection in chunks to avoid hitting limits
chunk_size = 100
for i in range(0, len(data), chunk_size):
    collection.insert(data[i:i + chunk_size])
    print(f"Inserted batch {i//chunk_size + 1}")

# Flush to ensure all data is written
collection.flush()

# Create an index on the vector field
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}
collection.create_index("vector", index_params)
print("Index created successfully")

# Load the collection to memory
collection.load()
print("Collection loaded successfully")


In [None]:
import numpy as np

def normalize_embeddings(embeddings):
    norm_embeddings = []
    for emb in embeddings:
        norm = np.linalg.norm(emb)
        if norm == 0:
            norm_embeddings.append(emb)
        else:
            norm_embeddings.append(emb / norm)
    return norm_embeddings

# Normalize your embeddings
normalized_embeddings_768 = normalize_embeddings(embeddings_768)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5")

# Function to generate embeddings
def generate_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy().tolist()[0]

# Define the query text
query = "What legal actions can a natural or legal person take if they are directly and individually concerned by a decision of the Board, according to Article 263 TFEU, and what are the limitations of the right to an effective judicial remedy against supervisory authority decisions?"

# Encode the query text
query_embedding = generate_embedding(query, tokenizer, model)

uri = "https://in03-175f9dee0a08214.api.gcp-us-west1.zillizcloud.com"  # Replace with your actual Zilliz Cloud URI
token = "abdd71f92a975d6e0947c438255e43160ff3e6b293d993adfbe0320cb64f918035d163b88388fea579c83345f4ab5a801385c67b"

# Ensure to disconnect existing connections
if connections.has_connection("default"):
    connections.disconnect("default")

# Connect to Milvus on Zilliz Cloud
connections.connect("default", uri=uri, token=token)

# Define the collection name
collection_name = "embeddings_collection"

# Check if the collection exists
if collection_name in utility.list_collections():
    collection = Collection(name=collection_name)
else:
    # Define the schema for the collection
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=6000)  # Adjust max_length as needed
    ]
    schema = CollectionSchema(fields, "embeddings collection")

    # Create the collection
    collection = Collection(name=collection_name, schema=schema)

search_params = {"metric_type": "L2", "params": {"nprobe": 10}}

# Perform the similarity search
results = collection.search(
    data=[query_embedding],
    anns_field="vector",
    param=search_params,
    limit=1,
    output_fields=['text']
)

# Print the search results
for result in results[0]:
    print(f"ID: {result.id}, Text: {result.entity.get('text')}, Distance: {result.distance}")


In [None]:
import json

# Check if embeddings_768 and all_chunks variables are already defined
try:
    len(embeddings_768)
    len(all_chunks)
except NameError:
    raise ValueError("Ensure embeddings_768 and all_chunks are loaded into these variables.")

# Ensure the lengths match
assert len(embeddings_768) == len(all_chunks_768), "The lengths of embeddings_768 and all_chunks must be equal."

# Prepare the data in the required format
data = [{"vector": embeddings_768[i], "text": all_chunks[i], "subject": "example_subject"} for i in range(len(embeddings_768))]

# Define the file name
file_name = "formatted_embeddings_768.json"

# Write the formatted data to the file
with open(file_name, 'w') as f:
    json.dump(data, f)

# Check if the file is saved correctly
with open(file_name, 'r') as f:
    data = json.load(f)
    print(f"Number of embeddings saved: {len(data)}")

# Output the file path for download
file_path = f"./{file_name}"
print(f"File saved at: {file_path}")


In [None]:
import json

# Assuming embeddings_768 is already defined in your notebook
formatted_data = [{"vector": embedding} for embedding in embeddings_768]

with open('/kaggle/working/formatted_embeddings_768.json', 'w') as f:
    json.dump(formatted_data, f)

print("File saved at: /kaggle/working/formatted_embeddings_768.json")


In [None]:
import requests
import json

# Ensure embeddings and chunks have the same length
assert len(embeddings_768) == len(all_chunks_768), "The number of embeddings must match the number of text chunks."

# Prepare the data for insertion
data = [
    {
        "vector": embeddings_768[i],
        "text": all_chunks_768[i],
        "subject": "subject"
    }
    for i in range(len(embeddings_768))
]

# The endpoint and token for your Zilliz Cloud instance
base_url = "https://in03-175f9dee0a08214.api.gcp-us-west1.zillizcloud.com"
endpoint_path = "/api/v1/entities"  # Adjust this path as necessary based on Zilliz Cloud documentation
token = "92203da45ce7de07d914861864d56644f25b6ec1288246c70ba41c31c9619068cf92c2513c3a1b03d5f37c68cc6162317522ad5c"

# The request payload
payload = {
    "collectionName": "demo_collection",
    "data": data
}

# Headers for the request
headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}

# Construct the full URL
url = f"{base_url}{endpoint_path}"

# Make the request to insert data
response = requests.post(url, headers=headers, data=json.dumps(payload))

# Print the response
print(response.status_code)
print(response.json())


In [None]:
!git clone https://github.com/zilliztech/cloud-vectordb-examples.git

In [None]:
!pip3 install pymilvus==2.4.3

without normalization

In [None]:
# Example user query embedding (make sure to generate this using your model)
user_query_embedding = np.array([query_embedding]).astype('float32')

# Search the index
k = 5  # Number of nearest neighbors to retrieve
distances, indices = index.search(user_query_embedding, k)

# Display results
for i, idx in enumerate(indices[0]):
    print(f"ID: {idx}, Distance: {distances[0][i]}, Chunk: {all_chunks[idx]}")


END OF 1ST SECTION (CHUNK EMBEDDINGS VECTORD QUERY-ANSWER)

paliiiii

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceBgeEmbeddings



nltk.download('punkt')


tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz')

def chunk_text_based_on_tokens(text, max_tokens=700):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:

            chunks.append(" ".join(current_chunk))

            current_chunk = [sentence]
            current_length = sentence_length


    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def extract_sections_articles_chapters(soup):
    sections = []
    current_section = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section:
                sections.append(" ".join(current_section))
                current_section = []
            current_section.append(element.get_text())
        else:
            current_section.append(element.get_text())
    if current_section:
        sections.append(" ".join(current_section))
    return sections


with open('/kaggle/input/gdpr-chromdb/gdprrr.html', 'r', encoding='utf-8') as file:
    html_content = file.read()


soup = BeautifulSoup(html_content, 'html.parser')


sections = extract_sections_articles_chapters(soup)


all_chunks_0 = []
for section in sections:
    all_chunks_0.extend(chunk_text_based_on_tokens(section))


model_name = "BAAI/bge-large-en"
encode_kwargs = {'normalize_embeddings': True}

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)


embeddings_0 = model_norm.embed_documents(all_chunks_0)


print(f"Number of chunks: {len(all_chunks_0)}")
print(f"Sample Embedding: {embeddings_0[0]}")


for i, chunk in enumerate(all_chunks_0[:]):
    print(f"Chunk {i+1}:\n{chunk}\n")


In [None]:
collection_name_100_chunks = "embeddings_gdpr_for100"


try:
    chroma_client.delete_collection(name=collection_name_100_chunks)
    print(f"Collection {collection_name_100_chunks} deleted successfully.")
except Exception as e:
    print(f"Error deleting collection: {e}")


try:
    collection_for_100_chunks = chroma_client.create_collection(name=collection_name_100_chunks)
    print(f"collection_for_100_chunks {collection_name_100_chunks} created successfully.")
except Exception as e:
    print(f"Error creating collection_for_100_chunks: {e}")


In [None]:
for i, embedding in enumerate(embeddings_0):
    collection_name_100_chunks.add(
        documents=[all_chunks_0[i]],
        ids=[f"id_{i}"],
        embeddings=[embedding]
    )
def embed_query(query, model_name):
    query_embedding = model_name.embed_documents([query])
    return query_embedding[0]
def query_chroma_db(query_embedding, collection, top_k=10):
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    return results


trial from github

In [None]:
!pip install streamlit PyPDF2 langchain google-generativeai faiss-cpu transformers


In [None]:
!pip install transformers streamlit requests

In [None]:
# Create and write the Hugging Face API key to the .env file
with open('.env', 'w') as f:
    f.write('HUGGINGFACE_API_KEY="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"')


In [None]:
import os
from dotenv import load_dotenv

# Load the API key from the .env file
load_dotenv()
hf_api_key = os.getenv("HUGGINGFACE_API_KEY")

# Verify if the API key is loaded correctly
if not hf_api_key:
    raise ValueError("Hugging Face API key is missing. Please set it in the .env file.")

print("API key loaded successfully!")


In [None]:
import json
import pickle

# Assuming all_chunks_0 and embeddings_0 are generated and exist as variables in your notebook

# Save chunks to a JSON file
with open('all_chunks_0.json', 'w') as f:
    json.dump(all_chunks_0, f)

# Save embeddings to a pickle file
with open('embeddings_0.pkl', 'wb') as f:
    pickle.dump(embeddings_0, f)

print("Chunks and embeddings saved successfully!")


In [None]:
import json
import pickle

# Load your chunks from the JSON file
with open('all_chunks_0.json', 'r') as f:
    all_chunks_0 = json.load(f)

# Load your embeddings from the pickle file
with open('embeddings_0.pkl', 'rb') as f:
    embeddings_0 = pickle.load(f)

# Verify the loaded data
print(f"Loaded {len(all_chunks_0)} chunks.")
print(f"Loaded {len(embeddings_0)} embeddings.")


In [None]:
import torch
from transformers import pipeline

# Function to get a question generation model from Hugging Face and use GPU if available
def get_question_generation_model():
    model_name = "valhalla/t5-base-e2e-qg"
    device = 0 if torch.cuda.is_available() else -1
    question_generator = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
    return question_generator

# Function to generate questions for each chunk using Hugging Face API
def generate_questions_for_chunk(chunk, question_generator):
    # Generate questions using the model
    generated = question_generator(chunk, max_length=50, num_return_sequences=1)
    questions = [output['generated_text'] for output in generated]
    return questions

# Initialize the question generation model
question_generator = get_question_generation_model()

# Process chunks in smaller batches
batch_size = 10  # Process 10 chunks at a time
all_questions = []

for start_idx in range(0, len(all_chunks_0), batch_size):
    end_idx = start_idx + batch_size
    chunk_batch = all_chunks_0[start_idx:end_idx]
    for chunk in chunk_batch:
        questions = generate_questions_for_chunk(chunk, question_generator)
        all_questions.extend(questions)
    print(f"Processed chunks {start_idx + 1} to {end_idx}")

# Print the first 20 questions
print("First 20 questions generated:")
for i, question in enumerate(all_questions[:20]):
    print(f"Question {i+1}: {question}")


trial inside trial

In [None]:
import torch
from transformers import pipeline

# Function to get a question generation model from Hugging Face and use GPU if available
def get_question_generation_model():
    model_name = "valhalla/t5-base-e2e-qg"
    device = 0 if torch.cuda.is_available() else -1
    question_generator = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
    return question_generator

# Initialize the question generation model
question_generator = get_question_generation_model()

print(f"Question generation model initialized successfully on {'GPU' if torch.cuda.is_available() else 'CPU'}!")


In [None]:
# Function to generate questions for each chunk using Hugging Face API
def generate_questions_for_chunk(chunk, question_generator):
    # Ensure the chunk length is within the model's maximum length
    max_input_length = 512
    inputs = chunk[:max_input_length]
    # Generate questions using the model
    generated = question_generator(inputs, max_length=50, num_return_sequences=1)
    # Extract only the first question before the <sep> separator
    first_question = generated[0]['generated_text'].split('<sep>')[0].strip() if generated else None
    return first_question

# Initialize the question generation model
question_generator = get_question_generation_model()

# Process chunks in smaller batches
batch_size = 10  # Process 10 chunks at a time
all_questions = []

for start_idx in range(0, len(all_chunks_0), batch_size):
    end_idx = start_idx + batch_size
    chunk_batch = all_chunks_0[start_idx:end_idx]
    for chunk in chunk_batch:
        question = generate_questions_for_chunk(chunk, question_generator)
        if question:
            all_questions.append(question)
    print(f"Processed chunks {start_idx + 1} to {end_idx}")

# Print the first 20 questions
print("First 20 questions generated:")
for i, question in enumerate(all_questions[:20]):
    print(f"Question {i+1}: {question}")

RAGAS TRIAL

ragas and gans for questions(wuth openai)

In [None]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
client = chromadb.Client()

In [None]:
collection_100_questions = client.get_collection(name=collection_name_100_chunks)

collection_name_100_chunks = 'embeddings_gdpr_for100'  # Replace with your actual collection name
collection_100_questions = client.get_collection(name=collection_name_100_chunks)


tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
question_generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)



In [None]:
import numpy as np

# Ensure the dimension is 1024
assert np.array(sample_embedding).shape[0] == 1024, "Embedding dimension does not match the expected 1024 dimensions."


In [None]:

# Fetch all chunks from the collection
chunks_1 = collection_100_questions.get()["documents"]
num_chunks = len(chunks_1)
print(f"Number of chunks: {num_chunks}")


# Generate questions and update the ChromaDB collection
for i, chunk in enumerate(chunks_1):
    # Generate a question based on the chunk
    question = question_generator(chunk, max_length=100, num_return_sequences=1)[0]['generated_text']
    print(question)
    question_embedding = embedding_model.encode(question).tolist()
    print(f"Sample question_embedding dimension: {np.array(question_embedding).shape[0]}")
    # Update the collection with the generated question
    collection_100_questions.update(ids=[str(i)], documents=[chunk], metadatas=[{"question": question}])
    print("i am here")
    # Print the question for verification
    print(f"Chunk {i + 1} Question: {question}")

print("Questions generated and stored in ChromaDB collection_100_questions successfully.")

In [None]:
user_query = "What types of powers should the supervisory authorities have in each Member State under this Regulation?"
query_embedding = embed_query(user_query, model_norm)

results = query_chroma_db(query_embedding, collection, top_k=1)

# Combine the retrieved documents into a single context
retrieved_context = " ".join([" ".join(doc) if isinstance(doc, list) else doc for doc in results['documents']])

for result in results['documents']:
    print(result)

In [None]:
questions = []
for idx in selected_chunks_indices:
    chunk = all_chunks[idx]
    question = generate_question(chunk, max_input_length=256, max_output_length=50)
    questions.append((idx, chunk, question))

# Print the first 5 questions to check the output
for i in range(5):
    idx, chunk, question = questions[i]
    print(f"Chunk {idx+1}:\n{chunk}\nQuestion: {question}\n")

# Function to create embeddings
def create_embeddings(text_list, tokenizer, model, device):
    inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to GPU
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

# Create embeddings for each chunk and question
chunks_embeddings_for100 = create_embeddings([chunk for _, chunk, _ in questions], embedding_tokenizer, embedding_model, device)
question_embeddings_for100 = create_embeddings([question for _, _, question in questions], embedding_tokenizer, embedding_model, device)

# Initialize ChromaDB client and create collection
chroma_client = chromadb.Client()

collection_name = "embeddings_gdpr_italian_80chunks_100questions"
try:
    chroma_client.delete_collection(name=collection_name)
except Exception as e:
    print(f"Error deleting collection: {e}")

collection = chroma_client.create_collection(name=collection_name)

# Add chunk embeddings to the collection
for i, (chunk, embedding) in enumerate(zip([chunk for _, chunk, _ in questions], chunks_embeddings_for100)):
    try:
        collection.add(
            documents=[chunk],
            ids=[f"chunk_{i+1}"],
            embeddings=embedding.tolist()  # Ensure it's converted to a list
        )
    except Exception as e:
        print(f"Error adding chunk {i+1}: {e}")

# Add question embeddings to the collection
for i, question_embedding in enumerate(question_embeddings_for100):
    try:
        _, chunk, question = questions[i]
        collection.add(
            documents=[question],
            ids=[f"question_{i+1}"],
            embeddings=question_embedding.tolist()  # Ensure it's converted to a list
        )
    except Exception as e:
        print(f"Error adding question {i+1}: {e}")

In [None]:
# Function to embed query
def embed_query(query, tokenizer, model, device):
    query_embedding = create_embeddings([query], tokenizer, model, device)
    return query_embedding[0]

# Function to query ChromaDB
def query_chroma_db(query_embedding, collection, top_k=5):
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    return results

# Function to generate an answer
def generate_answer(context, query, gen_model, gen_tokenizer, device):
    input_text = f"Context: {context}\n\nGiven the above context, please answer the following question:\n\n{query}\n\nAnswer:"
    inputs = gen_tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to GPU
    outputs = gen_model.generate(inputs['input_ids'], max_length=500, num_beams=5, early_stopping=True)
    answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Functions to evaluate cosine and semantic similarity
def evaluate_cosine_similarity(reference_answer, generated_answer):
    vectorizer = TfidfVectorizer().fit_transform([reference_answer, generated_answer])
    vectors = vectorizer.toarray()
    cosine_sim = cosine_similarity(vectors)
    return cosine_sim[0, 1]

def evaluate_semantic_similarity(reference_answer, generated_answer):
    inputs = embedding_tokenizer(reference_answer, return_tensors='pt', truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to GPU
    reference_embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1)
    
    inputs = embedding_tokenizer(generated_answer, return_tensors='pt', truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to GPU
    generated_embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1)
    
    similarity = torch.nn.functional.cosine_similarity(reference_embedding, generated_embedding).item()
    return similarity

# Evaluate all questions
cosine_similarities = []
semantic_similarities = []

for idx, chunk, question in questions:
    print(f"Processing question {idx+1}/100")
    try:
        # Embed the query
        query_embedding = embed_query(question, embedding_tokenizer, embedding_model, device)
        
        # Query the collection
        results = query_chroma_db(query_embedding, collection, top_k=1)
        
        # Combine the retrieved documents into a single context
        retrieved_context = " ".join([" ".join(doc) if isinstance(doc, list) else doc for doc

In [None]:
# Function to embed query
def embed_query(query, model_name):
    query_embedding = model_name.embed_documents([query])
    return query_embedding[0]

# Function to query ChromaDB
def query_chroma_db(query_embedding, collection, top_k=5):
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    return results

# Function to generate an answer
def generate_answer(context, query, gen_model, gen_tokenizer):
    input_text = f"Context: {context}\n\nGiven the above context, please answer the following question:\n\n{query}\n\nAnswer:"
    inputs = gen_tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to GPU
    outputs = gen_model.generate(inputs['input_ids'], max_length=500, num_beams=5, early_stopping=True)
    answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Functions to evaluate cosine and semantic similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_cosine_similarity(reference_answer, generated_answer):
    vectorizer = TfidfVectorizer().fit_transform([reference_answer, generated_answer])
    vectors = vectorizer.toarray()
    cosine_sim = cosine_similarity(vectors)
    return cosine_sim[0, 1]

def evaluate_semantic_similarity(reference_answer, generated_answer):
    inputs = embedding_tokenizer(reference_answer, return_tensors='pt', truncation=True, padding=True)
    reference_embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1)

    inputs = embedding_tokenizer(generated_answer, return_tensors='pt', truncation=True, padding=True)
    generated_embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1)

    similarity = torch.nn.functional.cosine_similarity(reference_embedding, generated_embedding).item()
    return similarity

# Evaluate all questions
cosine_similarities = []
semantic_similarities = []

for idx, chunk, question in questions:
    try:
        print(f"Processing question {idx+1}/{len(questions)}")

        # Embed the query
        query_embedding = embed_query(question, model_norm)

        # Query the collection
        results = query_chroma_db(query_embedding, collection, top_k=1)

        # Combine the retrieved documents into a single context
        retrieved_context = " ".join([" ".join(doc) if isinstance(doc, list) else doc for doc in results['documents']])

        # Generate the answer based on the retrieved context
        generated_answer = generate_answer(retrieved_context, question, gen_model, gen_tokenizer)

        # Evaluate similarities
        cosine_sim = evaluate_cosine_similarity(chunk, generated_answer)
        semantic_sim = evaluate_semantic_similarity(chunk, generated_answer)

        cosine_similarities.append(cosine_sim)
        semantic_similarities.append(semantic_sim)

    except Exception as e:
        print(f"Error processing question {idx+1}: {e}")

# Print the average similarities
average_cosine_similarity = np.mean(cosine_similarities)
average_semantic_similarity = np.mean(semantic_similarities)

print(f"Average Cosine Similarity: {average_cosine_similarity:.4f}")
print(f"Average Semantic Similarity: {average_semantic_similarity:.4f}")


Using TF-IDF tocheck the answer  (METRICS)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_chunks)
query_tfidf = tfidf_vectorizer.transform([user_query])


tfidf_scores = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
top_tfidf_indices = tfidf_scores.argsort()[-50:][::-1]
top_tfidf_documents = [all_chunks[i] for i in top_tfidf_indices]


model = SentenceTransformer('BAAI/bge-large-en')
query_embedding = model.encode(user_query)
document_embeddings = model.encode(top_tfidf_documents)

cosine_scores = cosine_similarity([query_embedding], document_embeddings).flatten()
top_indices = cosine_scores.argsort()[-10:][::-1]


top_documents = [top_tfidf_documents[i] for i in top_indices]

most_relevant_document = top_documents[0]

print("\nGenerated Answer:")
print(most_relevant_document)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import nltk
nltk.download('punkt')

def evaluate_cosine_similarity(reference_answer, generated_answer):
    vectorizer = TfidfVectorizer().fit_transform([reference_answer, generated_answer])
    vectors = vectorizer.toarray()
    cosine_sim = cosine_similarity(vectors)
    return cosine_sim[0, 1]


def evaluate_semantic_similarity(reference_answer, generated_answer):
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    model = AutoModel.from_pretrained('bert-base-uncased')
    
    inputs = tokenizer(reference_answer, return_tensors='pt', truncation=True, padding=True)
    reference_embedding = model(**inputs).last_hidden_state.mean(dim=1)
    
    inputs = tokenizer(generated_answer, return_tensors='pt', truncation=True, padding=True)
    generated_embedding = model(**inputs).last_hidden_state.mean(dim=1)
    
    similarity = torch.nn.functional.cosine_similarity(reference_embedding, generated_embedding).item()
    return similarity


reference_answer = """		
	
Any natural or legal person has the right to bring an action for annulment of decisions of the Board before the Court of Justice under the conditions provided for in Article 263 TFEU. As addressees of such decisions, the supervisory authorities concerned which wish to challenge them have to bring action within two months of being notified of them, in accordance with Article 263 TFEU. Where decisions of the Board are of direct and individual concern to a controller, processor or complainant, the latter may bring an action for annulment against those decisions within two months of their publication on the website of the Board, in accordance with Article 263 TFEU. Without prejudice to this right under Article 263 TFEU, each natural or legal person should have an effective judicial remedy before the competent national court against a decision of a supervisory authority which produces legal effects concerning that person. Such a decision concerns in particular the exercise of investigative, corrective and authorisation powers by the supervisory authority or the dismissal or rejection of complaints. However, the right to an effective judicial remedy does not encompass measures taken by supervisory authorities which are not legally binding, such as opinions issued by or advice provided by the supervisory authority. Proceedings against a supervisory authority should be brought before the courts of the Member State where the supervisory authority is established and should be conducted in accordance with that Member State's procedural law. Those courts should exercise full jurisdiction, which should include jurisdiction to examine all questions of fact and law relevant to the dispute before them.

Where a complaint has been rejected or dismissed by a supervisory authority, the complainant may bring proceedings before the courts in the same Member State. In the context of judicial remedies relating to the application of this Regulation, national courts which consider a decision on the question necessary to enable them to give judgment, may, or in the case provided for in Article 267 TFEU, must, request the Court of Justice to give a preliminary ruling on the interpretation of Union law, including this Regulation. Furthermore, where a decision of a supervisory authority implementing a decision of the Board is challenged before a national court and the validity of the decision of the Board is at issue, that national court does not have the power to declare the Board's decision invalid but must refer the question of validity to the Court of Justice in accordance with Article 267 TFEU as interpreted by the Court of Justice, where it considers the decision invalid. However, a national court may not refer a question on the validity of the decision of the Board at the request of a natural or legal person which had the opportunity to bring an action for annulment of that decision, in particular if it was directly and individually concerned by that decision, but had not done so within the period laid down in Article 263 TFEU.
"""
generated_answer = """
"As addressees of such decisions, the supervisory authorities concerned which wish to challenge them have to bring action within two months of being notified of them, in accordance with Article\xa0263 TFEU. Where decisions of the Board are of direct and individual concern to a controller, processor or complainant, the latter may bring an action for annulment against those decisions within two months of their publication on the website of the Board, in accordance with Article\xa0263\xa0TFEU. Without prejudice to this right under Article\xa0263\xa0TFEU, each natural or legal person should have an effective judicial remedy before the competent national court against a decision of a supervisory authority which produces legal effects concerning that person. Such a decision concerns in particular the exercise of investigative, corrective and authorisation powers by the supervisory authority or the dismissal or rejection of complaints. However, the right to an effective judicial remedy does not encompass measures taken by supervisory authorities which are not legally binding, such as opinions issued by or advice provided by the supervisory authority. Proceedings against a supervisory authority should be brought before the courts of the Member State where the supervisory authority is established and should be conducted in accordance with that Member\xa0State's procedural law. Those courts should exercise full jurisdiction, which should include jurisdiction to examine all questions of fact and law relevant to the dispute before them.
"""
cosine_sim = evaluate_cosine_similarity(reference_answer, generated_answer)
print(f"Cosine Similarity: {cosine_sim:.4f}")

semantic_similarity = evaluate_semantic_similarity(reference_answer, generated_answer)
print(f"Semantic Similarity: {semantic_similarity:.4f}")


lets see the cosine similarity and semantic similarity between the generated aswer and our query(I take the first(long)text as the generated aswer)

In [None]:
!pip install scikit-learn
!pip install transformers


START OF WEVIATE

In [None]:
!pip install weaviate-client --upgrade


In [None]:
import os
import weaviate
from weaviate.auth import AuthApiKey
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.embeddings import HuggingFaceBgeEmbeddings
import numpy as np

In [None]:
cluster_url = "https://4b69vejurbynh5hzoffizw.c0.europe-west3.gcp.weaviate.cloud"
auth_api_key = "xEKmLO1SSHhUXQjeTmPsUG3uKHMA6wh8mCEs"
openai_api_key = "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"
client = weaviate.Client(
    url=cluster_url,
    auth_client_secret=AuthApiKey(api_key=auth_api_key),
    additional_headers={
        "X-OpenAI-Api-Key": openai_api_key  
    }
)

In [None]:
try:
    client.schema.delete_class("GDPR_Chunk")
    print("Deleted existing class GDPR_Chunk.")
except Exception as e:
    print(f"Class 'GDPR_Chunk' may not exist or couldn't be deleted: {e}")


In [None]:
schema = {
    "classes": [
        {
            "class": "GDPR_Chunk",
            "description": "A class to store GDPR text chunks with embeddings",
            "vectorizer": "none", 
            "properties": [
                {
                    "name": "text",
                    "dataType": ["text"],
                    "description": "The text chunk"
                },
                {
                    "name": "embedding",
                    "dataType": ["number[]"],
                    "description": "The embedding of the text chunk"
                }
            ]
        }
    ]
} 

try:
    client.schema.create(schema)
    print("Schema created successfully.")
except Exception as e:
    print(f"Schema creation error: {e}")


In [None]:
all_chunks = all_chunks
model_name = "dbmdz/bert-base-italian-xxl-cased"
encode_kwargs = {'normalize_embeddings': True}
model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)
embeddings = model_norm.embed_documents(all_chunks)

for i, (text_chunk, embedding) in enumerate(zip(all_chunks, embeddings)):
    data_object = {
        "text": text_chunk,
        "embedding": embedding
    }
    try:
        client.data_object.create(
            data_object=data_object,
            class_name="GDPR_Chunk",
            vector=embedding  
        )
    except Exception as e:
        print(f"Failed to add object {i}: {e}")

In [None]:
tokenizer_bge = AutoTokenizer.from_pretrained('dbmdz/bert-base-italian-xxl-cased')
model_bge = AutoModel.from_pretrained('dbmdz/bert-base-italian-xxl-cased')

def generate_query_embedding(query_text):
    inputs = tokenizer_bge(query_text, return_tensors='pt')
    with torch.no_grad():
        outputs = model_bge(**inputs)
        query_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().tolist()
    return query_embedding

def search_weaviate(query_embedding, client, top_k=5):
    result = client.query.get("GDPR_Chunk", ["text", "_additional {distance}"]) \
        .with_near_vector({"vector": query_embedding}) \
        .with_limit(top_k) \
        .do()
    
    print("Raw response from Weaviate:", result)
    
    if 'data' in result and 'Get' in result['data'] and 'GDPR_Chunk' in result['data']['Get']:
        return [res['text'] for res in result['data']['Get']['GDPR_Chunk']]
    else:
        return []
    

In [None]:
query_text = "Come dovrebbero gli Stati membri conciliare le norme sulla libertà di espressione e di informazione con il diritto alla protezione dei dati personali ai sensi del presente regolamento?"
query_embedding = generate_query_embedding(query_text)

search_results = search_weaviate(query_embedding, client)

print(f"Search results: {search_results}") 




In [None]:
reference_answer = """		
Gli Stati membri dovrebbero prevedere garanzie adeguate per il trattamento di dati personali per finalità di archiviazione nel pubblico interesse, per finalità di ricerca scientifica o storica o per finalità statistiche. Gli Stati membri dovrebbero essere autorizzati a fornire, a specifiche condizioni e fatte salve adeguate garanzie per gli interessati, specifiche e deroghe relative ai requisiti in materia di informazione e ai diritti alla rettifica, alla cancellazione, all'oblio, alla limitazione del trattamento, alla portabilità dei dati personali, nonché al diritto di opporsi in caso di trattamento di dati personali per finalità di archiviazione nel pubblico interesse, per finalità di ricerca scientifica o storica o per finalità statistiche.", "Per accertare la ragionevole probabilità di utilizzo dei mezzi per identificare la persona fisica, si dovrebbe prendere in considerazione l'insieme dei fattori obiettivi, tra cui i costi e il tempo necessario per l'identificazione, tenendo conto sia delle tecnologie disponibili al momento del trattamento, sia degli sviluppi tecnologici. I principi di protezione dei dati non dovrebbero pertanto applicarsi a informazioni anonime, vale a dire informazioni che non si riferiscono a una persona fisica identificata o identificabile o a dati personali resi sufficientemente anonimi da impedire o da non consentire più l'identificazione dell'interessato. Il presente regolamento non si applica pertanto al trattamento di tali informazioni anonime, anche per finalità statistiche o di ricerca. (27) Il presente regolamento non si applica ai dati personali delle persone decedute. Gli Stati membri possono prevedere norme riguardanti il trattamento dei dati personali delle persone decedute.", "La direttiva 2003/98/CE del Parlamento europeo e del Consiglio\xa0(14) non pregiudica in alcun modo il livello di tutela delle persone fisiche con riguardo al trattamento dei dati personali ai sensi delle disposizioni di diritto dell'Unione e degli Stati membri e non modifica, in particolare, gli obblighi e i diritti previsti dal presente regolamento. Nello specifico, tale direttiva non dovrebbe applicarsi ai documenti il cui accesso è escluso o limitato in virtù dei regimi di accesso per motivi di protezione dei dati personali, e a parti di documenti accessibili in virtù di tali regimi che contengono dati personali il cui riutilizzo è stato previsto per legge come incompatibile con la normativa in materia di tutela delle persone fisiche con riguardo al trattamento dei dati personali.", "Le autorità pubbliche o gli organismi pubblici o privati che tengono registri di interesse pubblico dovrebbero essere servizi che, in virtù del diritto dell'Unione o degli Stati membri, hanno l'obbligo legale di acquisire, conservare, valutare, organizzare, descrivere, comunicare, promuovere, diffondere e fornire accesso a registri con un valore a lungo termine per l'interesse pubblico generale. Gli Stati membri dovrebbero inoltre essere autorizzati a prevedere il trattamento ulteriore dei dati personali per finalità di archiviazione, per esempio al fine di fornire specifiche informazioni connesse al comportamento politico sotto precedenti regimi statali totalitari, a genocidi, crimini contro l'umanità, in particolare l'Olocausto, o crimini di guerra. (159) Qualora i dati personali siano trattati per finalità di ricerca scientifica, il presente regolamento dovrebbe applicarsi anche a tale trattamento.", "Tali meccanismi comprendono verifiche sulla protezione dei dati e metodi per assicurare provvedimenti correttivi intesi a proteggere i diritti dell'interessato."""
generated_answer = """
Gli Stati membri dovrebbero prevedere garanzie adeguate per il trattamento di dati personali per finalità di archiviazione nel pubblico interesse, per finalità di ricerca scientifica o storica o per finalità statistiche. Gli Stati membri dovrebbero essere autorizzati a fornire, a specifiche condizioni e fatte salve adeguate garanzie per gli interessati, specifiche e deroghe relative ai requisiti in materia di informazione e ai diritti alla rettifica, alla cancellazione, all'oblio, alla limitazione del trattamento, alla portabilità dei dati personali, nonché al diritto di opporsi in caso di trattamento di dati personali per finalità di archiviazione nel pubblico interesse, per finalità di ricerca scientifica o storica o per finalità statistiche.", "Per accertare la ragionevole probabilità di utilizzo dei mezzi per identificare la persona fisica, si dovrebbe prendere in considerazione l'insieme dei fattori obiettivi, tra cui i costi e il tempo necessario per l'identificazione, tenendo conto sia delle tecnologie disponibili al momento del trattamento, sia degli sviluppi tecnologici. I principi di protezione dei dati non dovrebbero pertanto applicarsi a informazioni anonime, vale a dire informazioni che non si riferiscono a una persona fisica identificata o identificabile o a dati personali resi sufficientemente anonimi da impedire o da non consentire più l'identificazione dell'interessato. Il presente regolamento non si applica pertanto al trattamento di tali informazioni anonime, anche per finalità statistiche o di ricerca. (27) Il presente regolamento non si applica ai dati personali delle persone decedute. Gli Stati membri possono prevedere norme riguardanti il trattamento dei dati personali delle persone decedute.", "La direttiva 2003/98/CE del Parlamento europeo e del Consiglio\xa0(14) non pregiudica in alcun modo il livello di tutela delle persone fisiche con riguardo al trattamento dei dati personali ai sensi delle disposizioni di diritto dell'Unione e degli Stati membri e non modifica, in particolare, gli obblighi e i diritti previsti dal presente regolamento. Nello specifico, tale direttiva non dovrebbe applicarsi ai documenti il cui accesso è escluso o limitato in virtù dei regimi di accesso per motivi di protezione dei dati personali, e a parti di documenti accessibili in virtù di tali regimi che contengono dati personali il cui riutilizzo è stato previsto per legge come incompatibile con la normativa in materia di tutela delle persone fisiche con riguardo al trattamento dei dati personali.", "Le autorità pubbliche o gli organismi pubblici o privati che tengono registri di interesse pubblico dovrebbero essere servizi che, in virtù del diritto dell'Unione o degli Stati membri, hanno l'obbligo legale di acquisire, conservare, valutare, organizzare, descrivere, comunicare, promuovere, diffondere e fornire accesso a registri con un valore a lungo termine per l'interesse pubblico generale. Gli Stati membri dovrebbero inoltre essere autorizzati a prevedere il trattamento ulteriore dei dati personali per finalità di archiviazione, per esempio al fine di fornire specifiche informazioni connesse al comportamento politico sotto precedenti regimi statali totalitari, a genocidi, crimini contro l'umanità, in particolare l'Olocausto, o crimini di guerra. (159) Qualora i dati personali siano trattati per finalità di ricerca scientifica, il presente regolamento dovrebbe applicarsi anche a tale trattamento.", "Tali meccanismi comprendono verifiche sulla protezione dei dati e metodi per assicurare provvedimenti correttivi intesi a proteggere i diritti dell'interessato."""

cosine_sim = evaluate_cosine_similarity(reference_answer, generated_answer)
print(f"Cosine Similarity: {cosine_sim:.4f}")

semantic_similarity = evaluate_semantic_similarity(reference_answer, generated_answer)
print(f"Semantic Similarity: {semantic_similarity:.4f}")

In [None]:
def verify_and_print_embeddings(client):
    result = client.query.get("GDPR_Chunk", ["text", "embedding"]).with_limit(10).do()
    if 'data' in result and 'Get' in result['data'] and 'GDPR_Chunk' in result['data']['Get']:
        for item in result['data']['Get']['GDPR_Chunk']:
            print("Text:", item['text'])
            print("Embedding (first 5 values):", np.array(item['embedding'][:5]))  # Print first 5 dimensions of the embedding for brevity
    else:
        print("No data found in class GDPR_Chunk.")

making 100

In [None]:
import chromadb

# Initialize ChromaDB client
chroma_client = chromadb.Client()

collection_name = "embeddings_gdpr_italian_100chunks"

# Check if the collection exists and delete it if it does
try:
    existing_collections = chroma_client.list_collections()
    if any(col.name == collection_name for col in existing_collections):
        chroma_client.delete_collection(name=collection_name)
        print(f"Collection {collection_name} deleted successfully.")
except Exception as e:
    print(f"Error checking/deleting collection: {e}")

# Create a new collection
try:
    collection = chroma_client.create_collection(name=collection_name)
    print(f"Collection {collection_name} created successfully.")
except Exception as e:
    print(f"Error creating collection: {e}")

In [None]:
import nltk
from bs4 import BeautifulSoup

nltk.download('punkt')

def chunk_text_based_on_tokens(text, max_chunks=100):
    sentences = nltk.sent_tokenize(text)
    total_sentences = len(sentences)
    chunk_size = max(1, total_sentences // max_chunks)
    chunks = [" ".join(sentences[i:i + chunk_size]) for i in range(0, total_sentences, chunk_size)]
    return chunks[:max_chunks]  # Ensure we only have the desired number of chunks

def extract_sections_articles_chapters(soup):
    sections = []
    current_section = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section:
                sections.append(" ".join(current_section))
                current_section = []
            current_section.append(element.get_text())
        else:
            current_section.append(element.get_text())
    if current_section:
        sections.append(" ".join(current_section))
    return sections

# Path to your HTML file
html_file_path = '/kaggle/input/gdpr-chromdb/gdprrr.html'

with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

sections = extract_sections_articles_chapters(soup)

all_chunks = []
for section in sections:
    all_chunks.extend(chunk_text_based_on_tokens(section))

# Ensure we only have 100 chunks
all_chunks = all_chunks[:100]

print(f"Number of chunks: {len(all_chunks)}")

In [None]:
!pip install transformers


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load a model fine-tuned for question generation
model_name = "mrm8488/t5-base-finetuned-question-generation-ap"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to GPU if available

def generate_question_finetuned(chunk, prompt="generate question:", max_input_length=512, max_output_length=50):
    input_text = f"{prompt} {chunk}"
    inputs = tokenizer.encode(input_text, return_tensors='pt', max_length=max_input_length, truncation=True)
    inputs = inputs.to(device)
    outputs = model.generate(inputs, max_length=max_output_length, num_beams=5, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question.strip()

# Generate 100 questions based on the chunks
questions = []
for idx, chunk in enumerate(all_chunks):
    question = generate_question_finetuned(chunk, max_input_length=256, max_output_length=100)
    questions.append((f"Question {idx+1}: {question}", f"Chunk {idx+1}: {chunk}"))

print(f"Generated {len(questions)} questions.")

# Print only the first 5 questions and their corresponding chunks
for i in range(5):
    print(f"{questions[i][0]}")
    print(f"{questions[i][1]}")
    print()


In [None]:
from transformers import AutoTokenizer, AutoModel

# Use BERT model for creating embeddings
embedding_model_name = "BAAI/bge-large-en"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_model.to(device)  # Move model to GPU if available

def create_embeddings(text_list, tokenizer, model, device):
    inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to GPU
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

# Create embeddings for each chunk
chunks_embeddings = create_embeddings(all_chunks, embedding_tokenizer, embedding_model, device)

# Add chunk embeddings to the collection
for i, (chunk, embedding) in enumerate(zip(all_chunks, chunks_embeddings)):
    try:
        collection.add(
            documents=[chunk],
            ids=[f"chunk_{i+1}"],
            embeddings=[embedding.tolist()]  # Ensure it's converted to a list
        )
    except Exception as e:
        print(f"Error adding chunk {i+1}: {e}")

print("Chunk embeddings added to the collection.")


In [None]:
# Create embeddings for the questions
questions_only = [q[0].split(": ")[1] for q in questions]
questions_embeddings = create_embeddings(questions_only, embedding_tokenizer, embedding_model, device)

# Retrieve the most relevant chunk for each question from ChromaDB
def retrieve_relevant_chunk(question_embedding, collection):
    results = collection.query(
        query_embeddings=[question_embedding.tolist()],
        n_results=1
    )
    return results['documents'][0][0]

# Generate answers by retrieving the most relevant chunks for each question
answers = []
for question_embedding in questions_embeddings:
    relevant_chunk = retrieve_relevant_chunk(question_embedding, collection)
    answers.append(relevant_chunk)

print(f"Generated {len(answers)} answers.")
# Print the first 5 questions, their corresponding chunks, and the retrieved answers
for i in range(5):
    print(f"Question {i+1}: {questions[i][0]}")
    print(f"Chunk {i+1}: {questions[i][1]}")
    print(f"Answer {i+1}: {answers[i]}")
    print()



In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the BERT model and tokenizer
embedding_model_name = "BAAI/bge-large-en"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model.to(device)


In [None]:
def create_embeddings(text_list, tokenizer, model, device):
    if not text_list or not all(text_list):
        raise ValueError("Text list is empty or contains None values.")
    
    inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to GPU
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings


In [None]:
# Ensure we have questions and their embeddings
questions_only = [q[0].split(": ")[1] for q in questions]
questions_embeddings = create_embeddings(questions_only, embedding_tokenizer, embedding_model, device)

# Function to retrieve the most relevant chunk for each question from ChromaDB
def retrieve_relevant_chunk(question_embedding, collection):
    results = collection.query(
        query_embeddings=[question_embedding.tolist()],
        n_results=1
    )
    return results['documents'][0][0] if results['documents'] else None

# Retrieve the most relevant chunks for each question
relevant_chunks = []
for idx, question_embedding in enumerate(questions_embeddings):
    relevant_chunk = retrieve_relevant_chunk(question_embedding, collection)
    if relevant_chunk:
        relevant_chunks.append(relevant_chunk)
    else:
        print(f"Warning: No relevant chunk found for question {idx+1}")

print(f"Retrieved {len(relevant_chunks)} relevant chunks out of {len(questions_embeddings)} questions.")

# Validate and print data before creating embeddings
chunks_only = [q[1].split(": ")[1] for q in questions if q[1].split(": ")[1]]
if not all(chunks_only):
    raise ValueError("One or more chunks are empty.")
print(f"First chunk: {chunks_only[0]}")

answers_only = [chunk for chunk in relevant_chunks if chunk]
if not all(answers_only):
    raise ValueError("One or more answers are empty.")
print(f"First answer: {answers_only[0]}")


In [None]:
import chromadb

# Initialize ChromaDB client
chroma_client = chromadb.Client()

collection_name = "embeddings_gdpr_italian_100chunks"

# Check if the collection exists and delete it if it does
try:
    existing_collections = chroma_client.list_collections()
    if any(col.name == collection_name for col in existing_collections):
        chroma_client.delete_collection(name=collection_name)
        print(f"Collection {collection_name} deleted successfully.")
except Exception as e:
    print(f"Error checking/deleting collection: {e}")

# Create a new collection
try:
    collection = chroma_client.create_collection(name=collection_name)
    print(f"Collection {collection_name} created successfully.")
except Exception as e:
    pri


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the BERT model and tokenizer
embedding_model_name = "bert-base-uncased"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model.to(device)

# Function to create embeddings
def create_embeddings(text_list, tokenizer, model, device):
    inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move tensors to GPU
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

# Create embeddings for the chunks
chunks_only = [q[1].split(": ")[1] for q in questions]
chunks_embeddings = create_embeddings(chunks_only, embedding_tokenizer, embedding_model, device)

# Add chunk embeddings to the collection
for i, (chunk, embedding) in enumerate(zip(chunks_only, chunks_embeddings)):
    try:
        collection.add(
            documents=[chunk],
            ids=[f"chunk_{i+1}"],
            embeddings=[embedding.tolist()]  # Ensure it's converted to a list
        )
    except Exception as e:
        print(f"Error adding chunk {i+1}: {e}")

print("Chunk embeddings added to the collection.")


In [None]:
# Create embeddings for the questions
questions_only = [q[0].split(": ")[1] for q in questions]
questions_embeddings = create_embeddings(questions_only, embedding_tokenizer, embedding_model, device)

# Function to retrieve the most relevant chunk for each question from ChromaDB
def retrieve_relevant_chunk(question_embedding, collection):
    results = collection.query(
        query_embeddings=[question_embedding.tolist()],
        n_results=1
    )
    return results['documents'][0][0] if results['documents'] else None

# Retrieve the most relevant chunks for each question
relevant_chunks = []
for idx, question_embedding in enumerate(questions_embeddings):
    relevant_chunk = retrieve_relevant_chunk(question_embedding, collection)
    if relevant_chunk:
        relevant_chunks.append(relevant_chunk)
    else:
        print(f"Warning: No relevant chunk found for question {idx+1}")

print(f"Retrieved {len(relevant_chunks)} relevant chunks out of {len(questions_embeddings)} questions.")

# Validate and print data before creating embeddings
chunks_only = [q[1].split(": ")[1] for q in questions if q[1].split(": ")[1]]
if not all(chunks_only):
    raise ValueError("One or more chunks are empty.")
print(f"First chunk: {chunks_only[0]}")

answers_only = [chunk for chunk in relevant_chunks if chunk]
if not all(answers_only):
    raise ValueError("One or more answers are empty.")
print(f"First answer: {answers_only[0]}")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Create embeddings for the generated answers
answers_embeddings = create_embeddings(answers_only, embedding_tokenizer, embedding_model, device)

# Create embeddings for the reference chunks
reference_chunks_embeddings = create_embeddings(chunks_only, embedding_tokenizer, embedding_model, device)

# Calculate cosine similarity
def calculate_cosine_similarity(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

cosine_similarities = []
semantic_similarities = []
for answer_embedding, chunk_embedding in zip(answers_embeddings, reference_chunks_embeddings):
    cosine_sim = calculate_cosine_similarity(answer_embedding, chunk_embedding)
    cosine_similarities.append(cosine_sim)
    # Semantic similarity calculation
    semantic_sim = cosine_sim  # For simplicity, using cosine similarity as semantic similarity here
    semantic_similarities.append(semantic_sim)

print("Cosine and Semantic Similarities calculated.")

# Print the first 5 results for inspection
for i in range(5):
    print(f"Question {i+1}: {questions[i][0]}")
    print(f"Chunk {i+1}: {chunks_only[i]}")
    print(f"Answer {i+1}: {answers_only[i]}")
    print(f"Cosine Similarity {i+1}: {cosine_similarities[i]:.4f}")
    print(f"Semantic Similarity {i+1}: {semantic_similarities[i]:.4f}")
    print()
