In [52]:
import google.generativeai as genai
import os
import pandas as pd
import tiktoken
import numpy as np
genai.configure(api_key=os.environ["API_KEY"])

In [18]:
def get_embeddings( title, article, task_type="retrieval_document" ):
    """Embed a piece of text with the ``models/text-embedding-004`` model.

    Args:
        title: Title sent together with the text. It is forwarded only when
            ``task_type`` is the string ``"retrieval_document"``, because the
            embedding API accepts a title for document embeddings only.
        article: The text to embed.
        task_type: Task type handed to ``genai.embed_content``. Defaults to
            ``"retrieval_document"`` (the value that used to be hard-coded),
            so existing two-argument calls behave as before. Pass
            ``"retrieval_query"`` when embedding a user question.

    Returns:
        The value stored under ``'embedding'`` in the API response.
    """
    request = {
        "model": "models/text-embedding-004",
        "content": article,
        "task_type": task_type,
    }
    # A title is only meaningful (and only accepted) for document embeddings.
    if isinstance(task_type, str) and task_type.lower() == "retrieval_document":
        request["title"] = title
    result = genai.embed_content(**request)
    return result['embedding']

In [3]:
df = pd.read_csv('./data/gdpr_cased_articles_with_recitals.csv')

In [4]:
df.head()

Unnamed: 0,article_id,article_title,article_text,article_recitals
0,article1,Subject-matter and objectives,This Regulation lays down rules relating to th...,1.2345678910111212e+16
1,article2,Material scope,This Regulation applies to the processing of p...,1415.0
2,article2,Material scope,This Regulation does not apply to the processi...,16.0
3,article2,Material scope,This Regulation does not apply to the processi...,
4,article2,Material scope,This Regulation does not apply to the processi...,18.0


In [None]:
get_embeddings(df['article_title'][0],df['article_text'][0])

In [15]:
print(df['article_title'][0])

Subject-matter and objectives


In [25]:
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    """Count the tokens that the named tiktoken encoding produces for ``string``.

    Args:
        string: Text to measure. Any falsy value (``""``, ``None``) counts as 0
            tokens and is never passed to the tokenizer.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        Number of tokens in the encoded text.
    """
    if string:
        tokenizer = tiktoken.get_encoding(encoding_name)
        return len(tokenizer.encode(string))
    return 0

In [27]:
# One row per article: [title, text, token count, embedding].
# NOTE(review): `list` shadows the built-in; the name is kept because the
# cell that builds `df_new` reads it.
list = []
titles, texts = df['article_title'], df['article_text']
for i in range(len(df.index)):
    title = titles[i]
    text = texts[i]
    # Token count is computed before the (remote) embedding call.
    list.append([title, text, num_tokens_from_string(text), get_embeddings(title, text)])

In [31]:
# Turn the collected rows into a frame. `list` here is the notebook variable
# holding the rows (it shadows the built-in), not the built-in type.
df_new = pd.DataFrame(list, columns=['article_title', 'article_text', 'article_tokens', 'article_text_embeddings'])

In [32]:
df_new.head()

Unnamed: 0,article_title,article_text,article_tokens,article_text_embeddings
0,Subject-matter and objectives,This Regulation lays down rules relating to th...,86,"[-0.07772678, 0.0103528565, 0.0082582515, -0.0..."
1,Material scope,This Regulation applies to the processing of p...,45,"[-0.05222351, 0.053586897, -0.03747398, -0.035..."
2,Material scope,This Regulation does not apply to the processi...,27,"[-0.040194023, 0.017073363, -0.057278626, -0.0..."
3,Material scope,This Regulation does not apply to the processi...,37,"[-0.043380607, 0.019432526, -0.039794073, -0.0..."
4,Material scope,This Regulation does not apply to the processi...,27,"[-0.057984274, 0.0064719967, -0.05763235, -0.0..."


In [50]:
# df_new was built row by row from df, so both frames share the same index.
# Joining on the index attaches the token counts and embeddings exactly once
# per row; merging on (article_title, article_text) instead multiplies rows
# whenever two rows share the same title and text.
x = df.join(df_new[['article_tokens', 'article_text_embeddings']])

In [51]:
x.head()

Unnamed: 0,article_id,article_title,article_text,article_recitals,article_tokens,article_text_embeddings
0,article1,Subject-matter and objectives,This Regulation lays down rules relating to th...,1.2345678910111212e+16,86,"[-0.07772678, 0.0103528565, 0.0082582515, -0.0..."
1,article2,Material scope,This Regulation applies to the processing of p...,1415.0,45,"[-0.05222351, 0.053586897, -0.03747398, -0.035..."
2,article2,Material scope,This Regulation does not apply to the processi...,16.0,27,"[-0.040194023, 0.017073363, -0.057278626, -0.0..."
3,article2,Material scope,This Regulation does not apply to the processi...,,37,"[-0.043380607, 0.019432526, -0.039794073, -0.0..."
4,article2,Material scope,This Regulation does not apply to the processi...,18.0,27,"[-0.057984274, 0.0064719967, -0.05763235, -0.0..."


In [55]:
# index=False: do not write the row index as an extra unnamed column
# (consistent with the other CSV exports in this notebook).
x.to_csv('./data/gdpr_cased_articles_with_recitals_embeddings.csv', index=False)

In [53]:
def get_dot_product(text_embedding, question_embedding=""):
    """Return the dot product of two embedding vectors.

    Args:
        text_embedding: Sequence of numbers (e.g. a stored article embedding).
        question_embedding: Sequence of numbers to compare against. The ``""``
            default is kept only so the signature stays unchanged; it is not a
            usable value and is rejected.

    Returns:
        The result of ``np.dot(text_embedding, question_embedding)``.

    Raises:
        TypeError: If ``question_embedding`` is omitted or is a string.
    """
    # Fail early with a readable message instead of letting a string reach np.dot.
    if isinstance(question_embedding, str):
        raise TypeError(
            "question_embedding must be a sequence of numbers, got a string"
        )
    return np.dot(text_embedding, question_embedding)

In [95]:
# Embed the user's question so it can be compared with the article embeddings.
# NOTE(review): get_embeddings sends task_type="retrieval_document" with a
# title, so the question is embedded as if it were a document titled
# "user input" — confirm whether a query-oriented task type is wanted here.
question_embedding = get_embeddings("user input", "What are my rights as an Individual in EU")

In [96]:
# Score every article embedding against the question embedding.
# The column is called 'distance' but holds the dot product, so larger values
# mean a closer match.
x['distance'] = x['article_text_embeddings'].apply(get_dot_product, question_embedding=question_embedding)

In [97]:
x.sort_values(['distance'], ascending=False)

Unnamed: 0,article_id,article_title,article_text,article_recitals,article_tokens,article_text_embeddings,distance
586,article77,Right to lodge a complaint with a supervisory ...,Without prejudice to any other administrative ...,141,70,"[-0.04318639, 0.009823787, 0.0013123561, -0.02...",0.760135
164,article22,"Automated individual decision-making, includin...",The data subject shall have the right not to b...,7172,39,"[-0.04324376, 0.015624178, 0.016327191, -0.046...",0.746726
588,article78,Right to an effective judicial remedy against ...,Without prejudice to any other administrative ...,143,40,"[-0.045312826, -0.0052126944, 0.026995184, -0....",0.735505
130,article15,Right of access by the data subject,The data subject shall have the right to obtai...,6364,58,"[-0.047044102, 0.01906889, 0.0070038955, -0.04...",0.733498
366,article47,Binding corporate rules,The binding corporate rules shall specify at l...,110,98,"[-0.04559353, 0.005038787, 0.028027346, -0.033...",0.725485
...,...,...,...,...,...,...,...
31,article4,Definitions,'group of undertakings' means a controlling un...,37,18,"[-0.026804209, 0.029797085, 0.0009110155, -0.0...",0.488253
598,article81,Suspension of proceedings,Where those proceedings are pending at first i...,144,51,"[0.016459905, 0.02905598, 0.016495313, -0.0320...",0.485155
567,article73,Chair,The term of office of the Chair and of the dep...,,21,"[0.018717434, 0.05230803, 0.0047608837, -0.004...",0.479319
576,article75,Secretariat,"The secretariat shall provide analytical, admi...",140,15,"[0.036584362, 0.042329982, 0.016415294, 0.0004...",0.477369


In [68]:
x.loc[x["article_title"] == "Right to erasure ('right to be forgotten')"]

Unnamed: 0,article_id,article_title,article_text,article_recitals,article_tokens,article_text_embeddings,distance
136,article17,Right to erasure ('right to be forgotten'),The data subject shall have the right to obtai...,6566,68,"[-0.032365043, -0.008397134, 0.0020770333, -0....",0.80358
137,article17,Right to erasure ('right to be forgotten'),The data subject shall have the right to obtai...,6566,158,"[-0.033225175, -0.0029193629, -0.0044079283, -...",0.805873
138,article17,Right to erasure ('right to be forgotten'),The data subject shall have the right to obtai...,6566,89,"[-0.0455578, -0.013683858, 0.0030171564, -0.02...",0.797698
139,article17,Right to erasure ('right to be forgotten'),The data subject shall have the right to obtai...,6566,56,"[-0.042438716, -0.00021942731, -0.0029840148, ...",0.796488
140,article17,Right to erasure ('right to be forgotten'),The data subject shall have the right to obtai...,6566,73,"[-0.03386438, -0.0027421627, 0.002576739, -0.0...",0.80194
141,article17,Right to erasure ('right to be forgotten'),The data subject shall have the right to obtai...,6566,71,"[-0.05192303, -0.0018970313, -0.013146247, -0....",0.788213
142,article17,Right to erasure ('right to be forgotten'),Where the controller has made the personal dat...,6566,77,"[-0.039192215, 0.01568776, -0.027002933, -0.03...",0.778074
143,article17,Right to erasure ('right to be forgotten'),Right to erasure ('right to be forgotten') sha...,6566,32,"[-0.064188726, -0.017257614, -0.035147376, -0....",0.725918
144,article17,Right to erasure ('right to be forgotten'),Right to erasure ('right to be forgotten') sha...,6566,67,"[-0.04173252, 0.0028624088, -0.00333421, -0.01...",0.739766
145,article17,Right to erasure ('right to be forgotten'),Right to erasure ('right to be forgotten') sha...,6566,134,"[-0.028719774, 0.004937607, -0.025912542, -0.0...",0.704216


In [70]:
len(x["article_title"].unique())

97

In [72]:
%pip install --upgrade --quiet \
    "google-cloud-aiplatform[langchain,reasoningengine]" \
    cloudpickle==3.0.0 \
    pydantic==2.7.4 \
    langchain-google-community \
    google-cloud-discoveryengine \
    google-api-python-client


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [128]:
# Google Cloud settings shared by the Vertex AI and Cloud Storage cells.
project_id = "hackhathon-438922"
google_storage_bucket = "vertex-ai-hack"
# gs:// URI of the bucket, and of the data/ prefix inside it.
google_storage_bucket_link = f"gs://{google_storage_bucket}"
data_bucket = f"{google_storage_bucket_link}/data/"

In [130]:
import vertexai
from vertexai.preview import reasoning_engines, rag
from vertexai.preview.generative_models import GenerativeModel, Tool
from langchain_google_vertexai import HarmBlockThreshold, HarmCategory
from google.oauth2 import service_account

# Location of the service-account key file. GOOGLE_APPLICATION_CREDENTIALS
# takes precedence so the key path is not tied to this checkout; the path that
# used to be hard-coded remains the fallback.
service_account_key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    './keys/hackhathon-438922-0a5870e658d5.json',
)
credentials = service_account.Credentials.from_service_account_file(service_account_key_path)

# Initialise the Vertex AI SDK for this project, region and staging bucket.
vertexai.init(
    project=project_id,
    location="us-central1",
    staging_bucket="gs://{}".format(google_storage_bucket),
    credentials=credentials
)

In [129]:
# Display name given to the RAG corpus, and the Cloud Storage locations whose
# files are imported into it.
display_name = "eu_ai_act"
paths = [data_bucket]

In [131]:
# Embedding model the corpus is configured with.
embedding_model_config = rag.EmbeddingModelConfig(
    publisher_model="publishers/google/models/text-embedding-004"
)
# NOTE(review): presumably every run of this cell creates another corpus with
# the same display name — confirm, and reuse an existing corpus if so.
rag_corpus = rag.create_corpus(
    display_name=display_name,
    embedding_model_config=embedding_model_config,
)

In [132]:
rag_corpus

RagCorpus(name='projects/617378578625/locations/us-central1/ragCorpora/2305843009213693952', display_name='eu_ai_act', description='', embedding_model_config=EmbeddingModelConfig(publisher_model='projects/hackhathon-438922/locations/us-central1/publishers/google/models/text-embedding-004', endpoint=None, model=None, model_version_id=None), vector_db=RagManagedDb())

In [133]:
# Import the files found under `paths` into the corpus created above it in
# this notebook, using the chunking and rate settings given here.
rag.import_files(
    rag_corpus.name,
    paths,
    chunk_size=512,  # Optional
    chunk_overlap=100,  # Optional
    max_embedding_requests_per_min=900,  # Optional
)

imported_rag_files_count: 1

In [None]:
# Query the corpus directly (retrieval only) and print the raw response.
response = rag.retrieval_query(
    rag_resources=[
        rag.RagResource(
            rag_corpus=rag_corpus.name,
            # Optional: supply IDs from `rag.list_files()`.
            # rag_file_ids=["rag-file-1", "rag-file-2", ...],
        )
    ],
    text="Tell me about the EU AI act article 17",
    similarity_top_k=10,  # Optional
    vector_distance_threshold=0.7,  # Optional
)
print(response)

In [127]:
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket and print a confirmation.

    The storage client is created with the notebook-level ``project_id`` and
    ``credentials``.

    Args:
        bucket_name: ID of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Name of the object to create in the bucket.
    """
    client = storage.Client(project=project_id, credentials=credentials)
    target = client.bucket(bucket_name).blob(destination_blob_name)

    # if_generation_match=0 is the precondition for a destination object that
    # does not exist yet: the upload is aborted when the object's generation
    # number does not match, which avoids races and accidental overwrites.
    target.upload_from_filename(source_file_name, if_generation_match=0)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")


In [120]:
upload_blob(google_storage_bucket, './data/AI_ACT_2024.pdf', 'AI_ACT_2024.pdf')

File ./data/AI_ACT_2024.pdf uploaded to AI_ACT_2024.pdf.


In [200]:
# Gemini model used by the LangChain agent.
model = "gemini-1.5-flash-001"

# Blocking threshold applied to each harm category.
safety_settings = {
    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
}

# Keyword arguments forwarded to the model.
model_kwargs = dict(
    # Sampling temperature: degree of randomness in token selection.
    temperature=0,
    # Maximum amount of text output from one prompt.
    max_output_tokens=8192,
    # Tokens are selected from most to least probable until the sum of their
    # probabilities equals this value.
    top_p=0.95,
    # Next token is chosen among the top-k most probable tokens. Not supported
    # by all model versions; see
    # https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/image-understanding#valid_parameter_values
    top_k=None,
    # Safety settings defined above in this cell.
    safety_settings=safety_settings,
    credentials=credentials,
)

In [105]:
# Build a LangChain agent on the configured model; no tools are attached here.
agent = reasoning_engines.LangchainAgent(
    model=model,                # Required.
    model_kwargs=model_kwargs
)

# Ask the agent directly; `response` is a dict that is displayed in a following cell.
response = agent.query(input="Tell me about the EU AI act article")

In [106]:
response

{'input': 'Tell me about the EU AI act article 17',
 'output': "## EU AI Act Article 17: High-Risk AI Systems and Conformity Assessment\n\nArticle 17 of the EU AI Act focuses on **high-risk AI systems** and the **conformity assessment** process they must undergo. This article is crucial for ensuring that these systems are safe, reliable, and comply with the Act's requirements.\n\nHere's a breakdown of key points:\n\n**1. Scope:**\n\n* Article 17 applies to **all high-risk AI systems** as defined in Annex III of the Act. This includes systems used in critical areas like healthcare, transportation, and law enforcement.\n* It covers the **entire lifecycle** of the system, from design and development to deployment and post-market monitoring.\n\n**2. Conformity Assessment:**\n\n* **Providers** of high-risk AI systems must undergo a **conformity assessment** to demonstrate compliance with the Act's requirements.\n* This assessment can be conducted by **independent conformity assessment bodie

In [139]:
import PyPDF2

In [202]:
def parse_pdf(file_path, max_tokens=8150):
    """
    Extracts the text of a PDF document as a list of chunks.

    Page texts are concatenated in page order. Before each page is added, the
    chunk accumulated so far is closed if its token count (as measured by
    ``num_tokens_from_string``) exceeds ``max_tokens``; a chunk can therefore
    overshoot the threshold by up to one page.

    Args:
        file_path (str): Path to the PDF file.
        max_tokens (int): Token threshold after which the current chunk is
            closed. Defaults to 8150, the value that used to be hard-coded.

    Returns:
        list[str]: The text chunks in document order, including the final,
        partially filled chunk.
    """
    chunks = []
    current = ''
    # Single context-managed open: the file is closed even if extraction fails.
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_obj in pdf_reader.pages:
            if num_tokens_from_string(current) > max_tokens:
                chunks.append(current)
                current = ''
            # Guard against a page yielding no text (None) — treated as empty.
            current += page_obj.extract_text() or ''
    # Flush what is left after the last page; without this the tail of the
    # document is silently dropped.
    if current:
        chunks.append(current)
    return chunks

In [203]:
# Example usage:
file_path = './data/AI_ACT_2024.pdf'
extracted_text_list = parse_pdf(file_path)

In [None]:
# Build a LangChain agent from the current `model` and `model_kwargs`; it is
# used by the text-repair loop.
agent = reasoning_engines.LangchainAgent(
    model=model,                # Required.
    model_kwargs=model_kwargs
)



In [206]:
# Instruction sent with every extracted chunk; the chunk fills the `{}` slot.
repair_prompt_template = "Repair the content of the text below. Make sure to not add any additional text other than what is already there as this would adulterate the content of the page. Only fix spacing and line issues. Do not hallucinate words into the content.\n\n {}"

# One agent call per chunk; the repaired texts are collected in order.
refined_text = []
for extracted_text in extracted_text_list:
    response = agent.query(input=repair_prompt_template.format(extracted_text))
    refined_text.append(response['output'])

Retrying langchain_google_vertexai.chat_models._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised ResourceExhausted: 429 Quota exceeded for aiplatform.googleapis.com/generate_content_requests_per_minute_per_project_per_base_model with base model: gemini-1.5-flash. Please submit a quota increase request. https://cloud.google.com/vertex-ai/docs/generative-ai/quotas-genai..
Retrying langchain_google_vertexai.chat_models._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised ResourceExhausted: 429 Quota exceeded for aiplatform.googleapis.com/generate_content_requests_per_minute_per_project_per_base_model with base model: gemini-1.5-flash. Please submit a quota increase request. https://cloud.google.com/vertex-ai/docs/generative-ai/quotas-genai..
Retrying langchain_google_vertexai.chat_models._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised ResourceExhausted: 429 Quota exceeded fo

In [207]:
# Concatenate the repaired chunks, each followed by a blank line.
output_text = ''.join("{}\n\n".format(text) for text in refined_text)

# Write UTF-8 explicitly: the platform default encoding may not be able to
# represent every character, and this notebook reads its text files as utf8.
with open('file.txt', 'w', encoding='utf8') as f:
    f.write(output_text)

In [None]:
# NOTE(review): this rebinds `x`, which earlier cells use for the merged GDPR
# DataFrame, and `extracted_text` is whatever value the name last held (the
# repair loop leaves it at the final chunk) — confirm both are intended.
x = num_tokens_from_string(extracted_text)

In [None]:
print(x)

140933


In [None]:
# NOTE(review): writes only the most recent `response` to file.txt, the same
# path the combined repaired text is written to — confirm this overwrite is
# intended.
with open('file.txt', 'w') as f:
    f.write(response['output'])

In [330]:
def get_tag_label(str):
    """Split the leading ``[label]`` off a tagged line.

    Args:
        str: One line of text (the parameter name shadows the built-in).

    Returns:
        A ``(label, index)`` tuple: the text between the first character and
        the first ``]``, and the position of that ``]``. When the line has no
        ``]`` the index is -1 and the label is the line minus its first and
        last characters.
    """
    close_at = str.find(']')
    label = str[1:close_at]
    return (label, close_at)

In [331]:
def get_tag_content(start, str):
    """Return the text that follows a tag label on a line.

    Args:
        start: Index of the ``]`` closing the label (as returned by
            ``get_tag_label``).
        str: The full line (the parameter name shadows the built-in).

    Returns:
        The text after ``start`` up to the next ``[``, stripped of surrounding
        whitespace. When there is no further ``[`` the rest of the line is
        returned.
    """
    end_idx = str.find('[', start)
    if end_idx == -1:
        # No closing tag on this line: take everything after the label.
        # Slicing with -1 here would silently drop the last character.
        return str[start+1:].strip()
    return str[start+1:end_idx].strip()

In [482]:
# Tag labels recognised in the tagged text files; the parsers compare the
# label returned by get_tag_label against these values.
CHAPTER = 'chapter'
CHAPTER_TITLE = 'chapter_title'
ARTICLE = 'article'
ARTICLE_TITLE = 'article_title'
ARTICLE_TEXT = 'article_text'
SECTION = 'section'
SECTION_TITLE = 'section_title'

In [483]:
def get_data_fields_regulations(chapters, sections, articles):
    """Build a parser for the tagged regulation text.

    The returned function reads an open text file line by line. Each line is
    split into a tag label and its content with ``get_tag_label`` and
    ``get_tag_content``; the label decides whether a chapter, section or
    article record is started or extended.

    Args:
        chapters: List that receives chapter dicts
            (``chapter_number``, ``chapter_title``).
        sections: List that receives section dicts
            (``chapter_number``, ``section_number``, ``section_title``).
        articles: List that receives article dicts (``article_number``,
            ``section_number``, ``chapter_number``, ``article_title``,
            ``article_text``).

    Returns:
        A function taking an open file object; it appends to the three lists
        and returns nothing.
    """
    def get_details_from_regulations(f):
        chapter = None
        section = None
        article = None
        for line in f.readlines():
            (tag, start_idx) = get_tag_label(line)
            content = get_tag_content(start_idx, line)
            if tag == CHAPTER:
                # A new chapter closes every record that is still open.
                if section is not None:
                    sections.append(section)
                if article is not None:
                    articles.append(article)
                if chapter is not None:
                    chapters.append(chapter)
                section = None
                article = None
                chapter = {
                    # Skips the first 8 characters of the heading
                    # (presumably a "CHAPTER " prefix — confirm against the file).
                    "chapter_number": content[8:],
                }
            elif tag == CHAPTER_TITLE:
                chapter["chapter_title"] = content
            elif tag == SECTION:
                if article is not None:
                    articles.append(article)
                if section is not None:
                    sections.append(section)
                # Always clear the flushed article, otherwise an article that
                # precedes the first section of a chapter is appended twice.
                article = None
                section = {
                    "chapter_number": chapter["chapter_number"] if chapter is not None else None,
                    "section_number": content[8:],
                }
            elif tag == SECTION_TITLE:
                section["section_title"] = content
            elif tag == ARTICLE:
                if article is not None:
                    articles.append(article)
                article = {
                    "article_number": content[8:],
                    "section_number": section["section_number"] if section is not None else None,
                    "chapter_number": chapter["chapter_number"] if chapter is not None else None,
                }
            elif tag == ARTICLE_TITLE:
                article['article_title'] = content
            elif tag == ARTICLE_TEXT:
                article['article_text'] = content
        # End of file: flush the records still open, otherwise the last
        # article, section and chapter never reach the output lists.
        if article is not None:
            articles.append(article)
        if section is not None:
            sections.append(section)
        if chapter is not None:
            chapters.append(chapter)
    return get_details_from_regulations

In [484]:
def get_data_fields_annexes(articles):
    """Build a parser for the tagged annex text.

    The returned function reads an open text file line by line, splitting each
    line into a tag label and its content with ``get_tag_label`` and
    ``get_tag_content``.

    Args:
        articles: List that receives one dict per annex entry
            (``article_number``, ``article_title``, ``article_text``).

    Returns:
        A function taking an open file object; it appends to ``articles`` and
        returns nothing.
    """
    def get_data_from_annex(f):
        article = None
        for line in f.readlines():
            (tag, start_idx) = get_tag_label(line)
            content = get_tag_content(start_idx, line)
            if tag == ARTICLE:
                if article is not None:
                    articles.append(article)
                article = {
                    # Skips the first 6 characters of the heading
                    # (presumably an "ANNEX " prefix — confirm against the file).
                    "article_number": content[6:],
                }
            elif tag == ARTICLE_TITLE:
                article['article_title'] = content
            elif tag == ARTICLE_TEXT:
                article['article_text'] = content
        # End of file: flush the entry still open, otherwise the last annex
        # never reaches the output list.
        if article is not None:
            articles.append(article)
    return get_data_from_annex

In [485]:
def get_data_fields_references(references):
    """Build a parser that turns every line of a tagged file into a reference.

    Args:
        references: List that receives one dict per line, with the tag label
            under ``reference_number`` and the line's content under
            ``reference_text``.

    Returns:
        A function taking an open file object; it appends to ``references``
        and returns nothing.
    """
    def get_data_from_references(f):
        # NOTE(review): every line yields a record, including blank ones.
        for raw_line in f.readlines():
            label, label_end = get_tag_label(raw_line)
            body = get_tag_content(label_end, raw_line)
            references.append(dict(reference_number=label, reference_text=body))
    return get_data_from_references

In [486]:
def get_data_fields_guides(guides):
    """Build a parser that turns every line of a tagged file into a guide entry.

    Args:
        guides: List that receives one dict per line, with the tag label under
            ``id`` and the line's content under ``text``.

    Returns:
        A function taking an open file object; it appends to ``guides`` and
        returns nothing.
    """
    def get_data_from_guides(f):
        # NOTE(review): every line yields a record, including blank ones.
        for raw_line in f.readlines():
            label, label_end = get_tag_label(raw_line)
            entry = {"id": label}
            entry["text"] = get_tag_content(label_end, raw_line)
            guides.append(entry)
    return get_data_from_guides

In [487]:
def create_array_from_file( file_path, get_details ):
    """Open ``file_path`` as UTF-8 text and hand the open file to ``get_details``.

    Nothing is returned; whatever ``get_details`` collects is stored wherever
    that callback puts it.

    Args:
        file_path: Path of the text file to read.
        get_details: Callable invoked once with the open file object.
    """
    with open(file_path, mode='r', encoding='utf8') as source:
        get_details(source)
            

In [488]:
chapters = []
sections = []
articles = []
get_details_from_regulations = get_data_fields_regulations(chapters, sections, articles)
create_array_from_file('./regulation.txt', get_details_from_regulations )

In [489]:
annex_articles = []
get_details_from_annex = get_data_fields_annexes(articles=annex_articles)
create_array_from_file('./annex.txt', get_details_from_annex)

In [None]:
references = []
get_details_from_reference = get_data_fields_references(references)
create_array_from_file('./references.txt', get_details_from_reference)

In [491]:
guides = []
get_data_guides = get_data_fields_guides(guides)
create_array_from_file('./guide.txt', get_data_guides)

In [492]:
chapter_df = pd.DataFrame(chapters)
section_df = pd.DataFrame(sections)
article_df = pd.DataFrame(articles)

In [493]:
annex_articles_df = pd.DataFrame(annex_articles)

In [494]:
references_df = pd.DataFrame(references)

In [495]:
guides_df = pd.DataFrame(guides)

In [496]:
chapter_df.to_csv('ai_act_chapters.csv', index=False)
section_df.to_csv('ai_act_sections.csv', index=False)
article_df.to_csv('ai_act_articles.csv', index=False)

In [497]:
annex_articles_df.to_csv('ai_act_annex.csv', index=False)

In [498]:
references_df.to_csv('ai_act_references.csv', index=False)

In [499]:
guides_df.to_csv('ai_act_guide.csv', index=False)

In [500]:
# Attach each chapter's sections. The join key is spelled out so the result
# does not silently change if the frames ever gain another shared column.
merge_chapters_sections = pd.merge(chapter_df, section_df, how="left", on="chapter_number")

In [501]:
# Attach the articles to their chapter/section rows, with the join keys
# spelled out instead of relying on whichever columns happen to be shared.
# NOTE(review): with a left join, an article whose (chapter, section) pair has
# no row on the left side is dropped — confirm none exist.
merge_chapter_sections_articles = pd.merge(
    merge_chapters_sections,
    article_df,
    how="left",
    on=["chapter_number", "section_number"],
)

In [502]:
merge_chapter_sections_articles.head()

Unnamed: 0,chapter_number,chapter_title,section_number,section_title,article_number,article_title,article_text
0,I,GENERAL PROVISIONS,,,1,Subject matter,1. The purpose of this Regulation is to improv...
1,I,GENERAL PROVISIONS,,,2,Scope,1. This Regulation applies to:(a)providers pla...
2,I,GENERAL PROVISIONS,,,3,Definitions,"For the purposes of this Regulation, the follo..."
3,I,GENERAL PROVISIONS,,,4,AI literacy,Providers and deployers of AI systems shall ta...
4,II,PROHIBITED AI PRACTICES,,,5,Prohibited AI practices,1. The following AI practices shall be prohibi...


In [503]:
merge_chapter_sections_articles.to_csv('ai_act_regulations.csv', index=False)