In [164]:
import pandas as pd
import google.generativeai as genai
import os
import pandas as pd
import tiktoken
import numpy as np
from dotenv import load_dotenv
import vertexai
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from vertexai.preview import rag,reasoning_engines
from vertexai.preview.generative_models import GenerativeModel, Tool
from langchain_google_vertexai import HarmBlockThreshold, HarmCategory
from google.oauth2 import service_account

load_dotenv()
genai.configure(api_key=os.environ["API_KEY"])

In [56]:
google_storage_bucket="vertex-ai-hack"
google_storage_bucket_link="gs://{}".format(google_storage_bucket)
data_bucket="{}/data/".format(google_storage_bucket_link)
project_id="hackhathon-438922"

In [57]:
credentials = service_account.Credentials.from_service_account_file('./keys/hackhathon-438922-0a5870e658d5.json')

vertexai.init(
    project=project_id,
    location="us-central1",
    staging_bucket="gs://{}".format(google_storage_bucket),
    credentials=credentials
)

In [97]:
def get_embeddings( title, content, task_type="retrieval_document" ):
    kwargs = {
        "model": "models/text-embedding-004",
        "task_type":"retrieval_document",
        "title":title
    }
    result = genai.embed_content(
        **kwargs,
        content=content,
    )
    
    return result['embedding']

In [60]:
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    if not string:
        return 0
    # Returns the number of tokens in a text string
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [7]:
regulations_df = pd.read_csv('./dataset/ai_act_regulations.csv')

In [None]:
regulations_df['content'] = regulations_df.apply(lambda p:)

In [45]:
list = []
for i,row in enumerate(regulations_df.iterrows()):
    temp = [regulations_df['article_number'][i], regulations_df['article_title'][i]]
    text = "Chapter: {}\n Section: {}\n Article: {}\n {}".format(regulations_df['chapter_title'][i],
                                                                                         regulations_df['section_title'][i] if regulations_df['section_title'][i] != "nan" else "",
                                                                                         regulations_df['article_title'][i],
                                                                                         regulations_df['article_text'][i])
    temp.append(text)
    list.append(temp)

In [46]:
regulations_df_articles = pd.DataFrame(list, columns=['article_number', 'article_title', 'content'])

In [119]:
regulations_df_articles.sort_values(['num_of_tokens'], ascending=False).head()

Unnamed: 0,article_number,article_title,content,num_of_tokens
2,3,Definitions,Chapter: GENERAL PROVISIONS\n Section: nan\n A...,3314
4,5,Prohibited AI practices,Chapter: PROHIBITED AI PRACTICES\n Section: na...,2075
56,57,AI regulatory sandboxes,Chapter: MEASURES IN SUPPORT OF INNOVATION\n S...,1612
25,26,Obligations of deployers of high-risk AI systems,Chapter: HIGH-RISK AI SYSTEMS\n Section: Oblig...,1486
59,60,Testing of high-risk AI systems in real world ...,Chapter: MEASURES IN SUPPORT OF INNOVATION\n S...,1476


In [48]:
regulations_df_articles['num_of_tokens'] = regulations_df_articles.content.apply(lambda p: num_tokens_from_string(p))

In [291]:
list = []
for i in range(len(regulations_df_articles.index)):
    number, title, text = regulations_df_articles['article_number'][i], regulations_df_articles['article_title'][i], regulations_df_articles['content'][i]
    token_len = regulations_df_articles['num_of_tokens'][i]
    temp = [number, title, text, token_len]
    if (token_len > 2048 ):
        characters = token_len*4
        first_segment = 2048*4
        second_segment = characters - first_segment
        first_segment_text = text[:first_segment]
        labels = first_segment_text.split('\n')[0:3]
        labels[2] = labels[2] + " (continuation)"
        second_segment_text = text[first_segment+1:]
        second_segment_text = "{}\n{}".format('\n'.join(labels), second_segment_text)
        first_segment_tokens = num_tokens_from_string(first_segment_text)
        second_segment_tokens = num_tokens_from_string(second_segment_text)
        first_segment_embeddings = get_embeddings(title, first_segment_text)
        second_segment_embeddings = get_embeddings(title, second_segment_text)
        list.append([number, title, first_segment_text,first_segment_tokens,first_segment_embeddings])
        list.append([number, title, second_segment_text,second_segment_tokens,second_segment_embeddings])
    else:
        embedding = get_embeddings(title, text)
        temp.append(embedding)
        list.append(temp)
        
    

In [299]:
articles_embedded_df = pd.DataFrame(list, columns=['article_number', 'article_title', 'chunk_data', 'tokens', 'embeddings'])

In [300]:
articles_embedded_df['file_id'] = f"{google_storage_bucket_link}/dataset/ai_act_regulations_with_embeddings.csv"

In [301]:
articles_embedded_df.head()

Unnamed: 0,article_number,article_title,chunk_data,tokens,embeddings,file_id
0,1,Subject matter,Chapter: GENERAL PROVISIONS\n Section: nan\n A...,220,"[-0.045846753, 0.033218704, 0.0119536845, -0.0...",gs://vertex-ai-hack/dataset/ai_act_regulations...
1,2,Scope,Chapter: GENERAL PROVISIONS\n Section: nan\n A...,929,"[-0.00471153, 0.03327254, -0.03335749, -0.0061...",gs://vertex-ai-hack/dataset/ai_act_regulations...
2,3,Definitions,Chapter: GENERAL PROVISIONS\n Section: nan\n A...,1646,"[-0.006802297, 0.019587621, -0.014258634, -0.0...",gs://vertex-ai-hack/dataset/ai_act_regulations...
3,3,Definitions,Chapter: GENERAL PROVISIONS\n Section: nan\n A...,1687,"[0.013903835, 0.02562094, -0.017638592, -0.016...",gs://vertex-ai-hack/dataset/ai_act_regulations...
4,4,AI literacy,Chapter: GENERAL PROVISIONS\n Section: nan\n A...,101,"[-0.0050897873, 0.04631556, -0.034699276, -0.0...",gs://vertex-ai-hack/dataset/ai_act_regulations...


In [302]:
articles_embedded_df.to_csv('./dataset/ai_act_regulations_with_embeddings.csv', index=False)

In [152]:
question_embedding = get_embeddings(None, "What does 'emotion recognition system' mean in the act?", "QUESTION_ANSWERING")

In [145]:
def get_dot_product(text_embedding, question_embedding=""):
    return np.dot(text_embedding, question_embedding)

In [303]:
articles_embedded_df['distance'] = articles_embedded_df.embeddings.apply(lambda p: get_dot_product(p, question_embedding))

In [304]:
articles_embedded_df.sort_values(['distance'], ascending=False).head()

Unnamed: 0,article_number,article_title,chunk_data,tokens,embeddings,file_id,distance
2,3,Definitions,Chapter: GENERAL PROVISIONS\n Section: nan\n A...,1646,"[-0.006802297, 0.019587621, -0.014258634, -0.0...",gs://vertex-ai-hack/dataset/ai_act_regulations...,0.674218
3,3,Definitions,Chapter: GENERAL PROVISIONS\n Section: nan\n A...,1687,"[0.013903835, 0.02562094, -0.017638592, -0.016...",gs://vertex-ai-hack/dataset/ai_act_regulations...,0.661552
5,5,Prohibited AI practices,Chapter: PROHIBITED AI PRACTICES\n Section: na...,1548,"[-0.014715451, -0.00017980096, -0.03006157, -0...",gs://vertex-ai-hack/dataset/ai_act_regulations...,0.656855
51,50,Transparency obligations for providers and dep...,Chapter: TRANSPARENCY OBLIGATIONS FOR PROVIDER...,912,"[-0.011190878, 0.04778545, -0.012691616, -0.04...",gs://vertex-ai-hack/dataset/ai_act_regulations...,0.641274
6,5,Prohibited AI practices,Chapter: PROHIBITED AI PRACTICES\n Section: na...,554,"[-0.017545851, 0.026869344, -0.0019566123, -0....",gs://vertex-ai-hack/dataset/ai_act_regulations...,0.63813


In [305]:
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"
    # The path to your file to upload
    # source_file_name = "local/path/to/file"
    # The ID of your GCS object
    # destination_blob_name = "storage-object-name"

    storage_client = storage.Client(project=project_id, credentials=credentials)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Optional: set a generation-match precondition to avoid potential race conditions
    # and data corruptions. The request to upload is aborted if the object's
    # generation number does not match your precondition. For a destination
    # object that does not yet exist, set the if_generation_match precondition to 0.
    # If the destination object already exists in your bucket, set instead a
    # generation-match precondition using its generation number.
    generation_match_precondition = 0

    blob.upload_from_filename(source_file_name, if_generation_match=generation_match_precondition)

    print(
        f"File {source_file_name} uploaded to {destination_blob_name}."
    )


In [306]:
upload_blob(google_storage_bucket, './dataset/ai_act_regulations_with_embeddings.csv','dataset/ai_act_regulations_with_embeddings_2.csv')

File ./dataset/ai_act_regulations_with_embeddings.csv uploaded to dataset/ai_act_regulations_with_embeddings_2.csv.


In [308]:
link_to_file = f"{google_storage_bucket_link}/dataset/ai_act_regulations_with_embeddings_2.csv"

In [234]:
from google.cloud import bigquery
from google.api_core.exceptions import Conflict

dataset_id = 'hackhathon-438922.hackhathon_ai_act_rag'

client = bigquery.Client(credentials=credentials, project=project_id)

dataset = bigquery.Dataset(dataset_id)

dataset.location = "US"

# Send the dataset to the API for creation
try:
    dataset = client.create_dataset(dataset, timeout=30)
    print(f"Created dataset {client.project}.{dataset.dataset_id}")
except Conflict:
    print(f"dataset {dataset.dataset_id } already exists")

dataset hackhathon_ai_act_rag already exists


In [314]:
final_schema = [
    bigquery.SchemaField("article_number", "INT64"),
    bigquery.SchemaField("article_title", "STRING"),
    bigquery.SchemaField("file_id", "STRING"),
    bigquery.SchemaField("tokens", "INT64"),
    bigquery.SchemaField("chunk_data", "STRING"),
    bigquery.SchemaField("embeddings", "ARRAY<FLOAT64>", mode="REPEATED"),
]

In [311]:
rag_table = 'hackhathon-438922.hackhathon_ai_act_rag.ai_act_embedded'

In [315]:
final_table = bigquery.Table(rag_table, schema=final_schema)

In [317]:
final_table = client.get_table(final_table) 

In [318]:
print(f"Created final table {project_id}.{final_table.dataset_id}.{final_table.table_id}")

Created final table hackhathon-438922.hackhathon_ai_act_rag.ai_act_embedded


In [319]:
rows_to_insert = x.to_dict(orient='records')

In [320]:
errors = client.insert_rows_json(f"{final_table.dataset_id}.{final_table.table_id}", rows_to_insert)  # API request

if errors:
    print(f"Encountered errors while inserting rows: {errors}")
else:
    print(f"Successfully loaded data into {final_table.dataset_id}.{final_table.table_id}")

Successfully loaded data into hackhathon_ai_act_rag.ai_act_embedded


In [323]:
query = f"""
WITH search_results AS (
  SELECT base.article_number AS article_number, base.article_title as article_title, base.chunk_data as chunk_data, distance
  FROM VECTOR_SEARCH(
    TABLE `{rag_table}`, 'embeddings',
    (SELECT {question_embedding} AS embeddings, 'query_vector' AS file_id),
    top_k => 2, distance_type => 'DOT_PRODUCT', options => '{{"use_brute_force": true}}'
    )
)
SELECT sr.article_number, sr.article_title, sr.distance, sr.chunk_data
FROM search_results sr
ORDER BY sr.distance ASC
"""

query_job = client.query(query)
results = query_job.result()

In [324]:
for row in results:
    print(f"Article Number: {row['article_number']}, Article Title: {row['article_title']}, distance: {row['distance']}, text: {row['chunk_data'][0:100]}")

Article Number: 3, Article Title: Definitions, distance: -0.6742176865363544, text: Chapter: GENERAL PROVISIONS
 Section: nan
 Article: Definitions
 For the purposes of this Regulation
Article Number: 3, Article Title: Definitions, distance: -0.6615519481079747, text: Chapter: GENERAL PROVISIONS
 Section: nan
 Article: Definitions (continuation)
emotions or intention
