In [1]:
import logging
import os

# Switch the working directory so that relative paths used later in the
# notebook (e.g. PDF_URL) resolve. The location is machine-specific, so it can
# be overridden with the GENAIPY_ROOT environment variable; when that variable
# is unset the original hard-coded path is used, keeping existing setups working.
os.chdir(os.environ.get("GENAIPY_ROOT", 'd:/genai/genaipy-lib/'))

# Logging settings: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# ---------------------------------------------------------------------------
# PDF settings
# ---------------------------------------------------------------------------
# Source document (AI Act); the path is relative to the working directory.
PDF_URL = "demos/data/aia_eu_proposal.pdf"
# Page bounds passed to the PDF text extractor as start_page / end_page.
START, END = 44, 45

# ---------------------------------------------------------------------------
# LLM settings
# ---------------------------------------------------------------------------
# Model identifier passed to the chat helper.
LLM = "gpt-4-1106-preview"
# System message sent with every chat request and reused when building the
# message dataset.
# TODO: refine system message for specific document
# NOTE(review): "excell" looks like a typo for "excel"; left unchanged here
# because the string is sent to the model verbatim.
SYS_MESSAGE = "You are an expert in AI law and excell at communicating complex regulations and legal requirements in plain English."
# Number of Q&A pairs requested per page (passed as `num` to the prompt builder).
NUM_QUESTIONS = 3

In [2]:
from genaipy.extractors.pdf import extract_pages_text

# Pull the text of the configured page range out of the PDF.
pages = extract_pages_text(
    pdf_path=PDF_URL,
    start_page=START,
    end_page=END,
)
# Report how many entries the extractor returned.
print(f"Successfully loaded text from {len(pages)} PDF pages.")

Successfully loaded text from 2 PDF pages.


In [3]:
from tqdm.notebook import tqdm

from genaipy.utilities import convert_json_to_df
from genaipy.openai_apis.chat import get_chat_response
from genaipy.prompts.build_prompt import build_prompt
from genaipy.prompts.generate_qa import GENERATE_QA_TPL

# Build one DataFrame of Q&A pairs per extracted page and collect them in
# `qa_dataset`. Each item yielded by iterating `pages` is used both to look up
# the page text (`pages[page]["content"]`) and as the sample number in the log
# messages (presumably `pages` is a dict keyed by an integer page index —
# confirm against extract_pages_text).
qa_dataset = []
for page in tqdm(pages, desc="Generating Q&A pairs..."):
    try:
        # Fill the Q&A template with the page text and the requested number of questions.
        qa_prompt = build_prompt(template=GENERATE_QA_TPL, num=NUM_QUESTIONS, text=pages[page]["content"])
        # Ask for a JSON object so the reply can be converted to a DataFrame.
        qa_response = get_chat_response(
            prompt=qa_prompt,
            sys_message=SYS_MESSAGE,
            model=LLM,
            response_format={"type": "json_object"},
        )
        qa_df = convert_json_to_df(qa_response)
        qa_dataset.append(qa_df)
        logging.info("Created Q&A pairs for sample #%d", page)
    except Exception:
        # Best-effort: a failing page is skipped so the remaining pages are
        # still processed. logging.exception logs at ERROR level and attaches
        # the traceback, so the cause of the failure is not lost.
        logging.exception("An error occured while generating Q&A sample #%d", page)

2024-01-17 12:52:44,810 - INFO - NumExpr defaulting to 8 threads.


Generating Q&A pairs...:   0%|          | 0/2 [00:00<?, ?it/s]

2024-01-17 12:53:22,862 - INFO - Successfully completed Chat API request. Total token usage: 1136
2024-01-17 12:53:22,872 - INFO - Created Q&A pairs for sample #1
2024-01-17 12:53:59,057 - INFO - Successfully completed Chat API request. Total token usage: 1228
2024-01-17 12:53:59,059 - INFO - Created Q&A pairs for sample #2


In [4]:
import pandas as pd 

# Stack the per-page Q&A frames into a single table; ignore_index=True
# renumbers the rows 0..n-1 instead of keeping each frame's own index.
final_df = pd.concat(objs=qa_dataset, ignore_index=True)
# Preview the first five rows (displayed as the cell's output).
final_df.head(5)

Unnamed: 0,question,answer
0,What actions are prohibited regarding the use ...,The text outlines several actions that are pro...
1,What powers does the Commission have with rega...,The Commission is given the power to adapt and...
2,Under what condition can real-time remote biom...,The use of real-time remote biometric identifi...
3,What are the specific instances in which law e...,Law enforcement can use real-time remote biome...
4,Before using real-time remote biometric identi...,"Before using such systems, law enforcement mus..."
5,What is the procedure for law enforcement to o...,"Generally, law enforcement must obtain prior a..."


In [8]:
from genaipy.utilities import convert_df_to_messages


# Convert the Q&A table into message records: the helper receives the system
# message plus the names of the columns holding the user and assistant text.
messages = convert_df_to_messages(
    df=final_df,
    system_msg=SYS_MESSAGE,
    user_col="question",
    assistant_col="answer",
)
# Spot-check the first converted record.
print(messages[0])

[{'messages': [{'role': 'system', 'content': 'You are an expert in AI law and excell at communicating complex regulations and legal requirements in simple, non-technical English.'}, {'role': 'user', 'content': 'What actions are prohibited regarding the use of AI systems according to the text?'}, {'role': 'assistant', 'content': 'The text outlines several actions that are prohibited when using AI systems. These include: (a) using AI systems that manipulate behavior by using subliminal techniques unbeknownst to the individual that could harm someone physically or psychologically; (b) using AI systems that exploit the vulnerabilities of individuals, particularly those in specific groups like those with disabilities or the elderly, to similarly distort behavior resulting in physical or psychological harm; (c) using AI for assessing or scoring individuals’ trustworthiness by public authorities, based on their social behavior or predicted personal traits, leading to negative consequences in 

In [9]:
from genaipy.utilities import write_data_to_jsonl

# Persist the message records to a JSON Lines file in the working directory.
write_data_to_jsonl(
    data=messages,
    file_path="test.jsonl",
)

2024-01-17 13:05:33,642 - INFO - Data written to JSON Lines file 'test.jsonl'
