In [1]:
import pandas as pd
import minsearch
from openai import OpenAI
import os
from tqdm.auto import tqdm
import json
import random


In [8]:
client = OpenAI()

In [2]:
df = pd.read_parquet('_pmg_sample_clean.parquet.brotli')
df['responder'] = df['responder'].str.replace('to ask the ', '', regex=False)
# Convert 'date' column to the desired string format 'YYYY-MM-DD'
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
# Confirm that the 'date' column is now of type 'object'
df['date'] = df['date'].astype('object')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       300 non-null    object
 1   mp         300 non-null    object
 2   question   300 non-null    object
 3   answer     300 non-null    object
 4   id         300 non-null    object
 5   responder  300 non-null    object
dtypes: object(6)
memory usage: 14.2+ KB


In [3]:
documents = df.to_dict(orient='records')

In [15]:
documents[120]

{'date': '2024-10-04',
 'mp': 'Hlonyana, Ms NKF',
 'question': 'What (a) is the current status of healthcare infrastructure in under-resourced areas, especially in the rural regions and (b) measures will he and his department take to improve the condition of the facilities?',
 'answer': 'A) Under-resourced areas, in the context of the response below does not imply a lack of human resources or medical supplies but relate to health infrastructure resources. The current condition of health facilities as per the User Asset Management Plans (UAMPs) at the time can be illustrated in the Graph below:  The UAMP as prepared and issued by the various Provincial Department of Health, relates to all the immovable assets which the Department intends to use in support of its own service delivery objectives and complies with the Government Immovable Asset Management Act, 2007 (GIAMA). GIAMA provides a uniform framework for management of immovable assets by departments; ensures coordination of the use

In [6]:
prompt_template = """
You're a politcal analyst. Answer the QUESTION based on the CONTEXT from the PMG database.
Use only the facts from the CONTEXT when answering the QUESTION.

The record:

date: {date}
mp: {mp}
question: {question}
answer: {answer}
responder: {responder}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [7]:
prompt = prompt_template.format(**documents[0])


'You\'re a politcal analyst. Answer the QUESTION based on the CONTEXT from the PMG database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nThe record:\n\ndate: 2024-10-11\nmp: Bodlani, Ms T\nquestion: Whether his department has put any plans in place to (a) rationalise the 11 entities reporting to him that have overlapping mandates and (b) pursue public-private partnerships to strengthen the performance of the entities, to lower the pressure on public finances; if not, in each case, what is the position in this regard; if so, what are the relevant details in each case?\nanswer: Find replyhere\nresponder: Minister of Communications and Digital Technologies\n\nProvide the output in parsable JSON without using code blocks:\n\n{"questions": ["question1", "question2", ..., "question5"]}'

In [9]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [10]:
questions = llm(prompt)

In [11]:
json.loads(questions)


{'questions': ['Whether his department has put any plans in place to rationalise the 11 entities reporting to him that have overlapping mandates?',
  'Whether his department has put any plans in place to pursue public-private partnerships to strengthen the performance of the entities to lower the pressure on public finances?']}

In [12]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [13]:
results = {}
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

  0%|          | 0/300 [00:00<?, ?it/s]

JSONDecodeError: Expecting ',' delimiter: line 1 column 1868 (char 1867)

In [17]:
len(results)

263

In [18]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [19]:
final_results[0], final_results[30]


(('NW801',
  'Whether his department has put any plans in place to rationalise the 11 entities reporting to him that have overlapping mandates?'),
 ('NW554',
  "What timeframes have been put in place regarding the Commission's road to autonomy?"))

In [20]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])
