In [23]:
import pandas as pd
import minsearch
from openai import OpenAI
import os
from tqdm.auto import tqdm
import json
import random


In [32]:
# Shared OpenAI API client used by llm() below. Constructed without arguments,
# so credentials are presumably taken from the environment (OPENAI_API_KEY) —
# confirm it is set before running this cell.
client = OpenAI()

In [2]:
# Load the sample and normalise two columns in a single chain:
#   * responder: drop the literal prefix 'to ask the '
#   * date: render the datetime as a 'YYYY-MM-DD' string and cast it to object dtype
df = pd.read_parquet('_pmg_sample_clean.parquet.brotli').assign(
    responder=lambda frame: frame['responder'].str.replace('to ask the ', '', regex=False),
    date=lambda frame: frame['date'].dt.strftime('%Y-%m-%d').astype('object'),
)

In [3]:
# Sanity check: column dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       300 non-null    object
 1   mp         300 non-null    object
 2   question   300 non-null    object
 3   answer     300 non-null    object
 4   id         300 non-null    object
 5   responder  300 non-null    object
dtypes: object(6)
memory usage: 14.2+ KB


In [4]:
# One dict per DataFrame row (column name -> value); this is what the index is fitted on.
documents = df.to_dict(orient='records')

In [5]:
# Search index over the records loaded above.
# Every column is registered as a text field; 'id' is also registered as a
# keyword field (presumably so it can be used for exact-match filtering via
# filter_dict — verify against the minsearch docs).
index = minsearch.Index(
    text_fields=['date', 'id', 'mp', 'responder', 'question', 'answer'],
    keyword_fields=['id']
)

In [6]:
# Fit the index on the document dicts; must run before any index.search call.
index.fit(documents)

<minsearch.Index at 0x121f64170>

In [14]:
# Ground-truth questions used to score retrieval.
df_question = pd.read_parquet('_gt_retrieval.parquet.brotli')

In [15]:
# List of ground-truth records, one dict per row, as consumed by evaluate().
ground_truth = df_question.to_dict(orient='records')


In [16]:
# Peek at one ground-truth record (it carries an 'id' and a 'question').
ground_truth[3]

{'id': 'NW801',
 'question': 'Can you provide details on the current status of public-private partnership initiatives for these entities?'}

In [17]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            list of booleans, one per returned result, True where the result
            is the expected document.

    Returns:
        Number of queries with at least one True, divided by the number of
        queries. Returns 0.0 when there are no queries (previously this
        raised ZeroDivisionError).
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    n_hits = sum(1 for line in relevance_total if True in line)
    return n_hits / n_queries

def mrr(relevance_total):
    """Mean reciprocal rank of the expected document across queries.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            list of booleans ordered by rank, True where the result is the
            expected document.

    Returns:
        Mean over queries of 1 / (1-based rank of the first True), counting 0
        for queries with no True. Returns 0.0 when there are no queries.
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:  # noqa: E712 - keep the original equality semantics
                total_score += 1 / rank
                # Only the first relevant result counts; without this break a
                # result list with several matches could score above 1.
                break

    return total_score / n_queries

In [18]:
def minsearch_search(query, boost=None, num_results=10):
    """Run a text query against the module-level minsearch ``index``.

    A later cell in this notebook defines ``minsearch_search`` again with a
    ``boost`` parameter; this version accepts the same argument so both
    definitions can be called the same way.

    Args:
        query: Query text.
        boost: Optional dict of per-field boost weights. None means no
            boosts (an empty dict is passed to the index), which matches the
            original behaviour.
        num_results: Maximum number of results to request (default 10, the
            previously hard-coded value).

    Returns:
        Whatever ``index.search`` returns; evaluate() reads ``d['id']`` from
        each element.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={} if boost is None else boost,
        num_results=num_results,
    )

In [19]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of dicts, each with an 'id' key naming the
            expected document. The whole dict is handed to the search function.
        search_function: Callable taking one ground-truth dict and returning
            an iterable of result dicts that each have an 'id' key.

    Returns:
        Dict with 'hit_rate' and 'mrr' computed over all records.
    """
    per_query_relevance = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        hits = search_function(record)
        # One boolean per returned result, in rank order.
        per_query_relevance.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_relevance)
    metrics['mrr'] = mrr(per_query_relevance)
    return metrics

In [20]:
# Baseline: retrieval quality with no boosts, over all ground-truth questions.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))


  0%|          | 0/1500 [00:00<?, ?it/s]

{'hit_rate': 0.8013333333333333, 'mrr': 0.5247925925925931}

### Best parameters

In [22]:
# First 100 ground-truth rows are used for tuning; the remainder is the test split.
# NOTE(review): the split is positional (no shuffle), and df_test is not used by
# any visible cell — the later evaluation runs on the full ground_truth. Confirm
# whether that is intended.
df_validation = df_question[:100]
df_test = df_question[100:]

In [24]:
def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximises ``objective_function``.

    Args:
        param_ranges: Dict mapping parameter name -> (min_val, max_val).
            If both bounds are ints the value is drawn with ``randint``
            (inclusive); otherwise with ``uniform``.
        objective_function: Callable taking a dict of sampled parameters and
            returning a comparable score; higher is better.
        n_iterations: Number of random candidates to evaluate.
        seed: Optional seed for a private random generator, making the search
            reproducible. With the default None the module-level ``random``
            state is used, as before.

    Returns:
        Tuple ``(best_params, best_score)``. If ``n_iterations`` is 0 this is
        ``(None, float('-inf'))``.
    """
    # Only switch to a private generator when a seed is requested, so existing
    # calls keep drawing from the global random state.
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximising: any real score beats -inf

    for _ in range(n_iterations):
        # Draw one candidate parameter set.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Strict '>' keeps the earliest candidate when scores tie.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [25]:
# Validation records as a list of dicts; read by objective() during tuning.
gt_val = df_validation.to_dict(orient='records')


In [26]:
def minsearch_search(query, boost=None):
    """Query the module-level ``index`` for up to 10 results.

    Args:
        query: Query text.
        boost: Optional dict of per-field boost weights; None is replaced by
            an empty dict.

    Returns:
        The result of ``index.search`` for this query.
    """
    effective_boost = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=effective_boost,
        num_results=10,
    )

In [27]:
# Search space for the boost weights: field name -> (min, max).
# All bounds are floats, so simple_optimize samples them with random.uniform.
# 'date' is pinned to 0.0 (both bounds are 0.0); 'id' is not listed, so no
# explicit boost is passed for it.
param_ranges = {
    'date': (0.0, 0.0),
    'mp': (0.0, 3.0),
    'responder': (0.0, 3.0),
    'question': (0.0, 3.0),
    'answer': (0.0, 3.0),
}

def objective(boost_params):
    """Score one set of boost weights on the validation records.

    Args:
        boost_params: Dict of per-field boost weights passed to
            minsearch_search.

    Returns:
        The 'mrr' value from evaluate() over ``gt_val``.
    """
    metrics = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], boost_params),
    )
    return metrics['mrr']

In [28]:
# Try 5 random boost configurations; the cell displays (best_params, best_score).
# No random seed is set in the visible cells, so the result varies between runs.
simple_optimize(param_ranges, objective, n_iterations=5)


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'date': 0.0,
  'mp': 2.1199514848307968,
  'responder': 0.5390566740545003,
  'question': 2.4247436338922963,
  'answer': 1.761826300273443},
 0.8601666666666666)

In [29]:
def minsearch_improved(query):
    """Search the index using the tuned boost weights.

    Delegates to ``minsearch_search`` instead of repeating the
    ``index.search`` call, so there is one place that talks to the index.

    Args:
        query: Query text.

    Returns:
        Up to 10 results from ``minsearch_search`` with the tuned boosts.
    """
    # Best parameters reported by simple_optimize above, truncated to two decimals.
    tuned_boost = {
        'date': 0,
        'mp': 2.11,
        'responder': 0.53,
        'question': 2.42,
        'answer': 1.76,
    }
    return minsearch_search(query, tuned_boost)

# Score the tuned boosts over all ground-truth questions.
# NOTE(review): ground_truth includes the 100 validation rows the boosts were
# tuned on; evaluating on the held-out df_test rows would avoid that overlap.
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1500 [00:00<?, ?it/s]

{'hit_rate': 0.9653333333333334, 'mrr': 0.8307584656084658}

In [33]:
# Example user question.
# NOTE(review): the visible cells call rag() with `question`, not this global;
# `query` does not appear to be read anywhere — confirm before removing.
query = "what steps has she taken to ensure that a culture of human rights and racial tolerance is embraced in schools?"

# Prompt sent to the chat model. {question} is the user question and {context}
# is the concatenated search results produced by build_prompt.
# Fixes the "politcal" typo in the role instruction.
prompt_template = """
You're a political analyst. Answer the QUESTION based on the CONTEXT from the PMG database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

# Text block for a single search result inside the prompt's CONTEXT section.
# build_prompt fills it with entry_template.format(**doc), so each result dict
# must provide date, mp, question, answer and responder.
entry_template = """
date: {date}
mp: {mp}
question: {question}
answer: {answer}
responder: {responder}
""".strip()

def search(query):
    """Retrieve the top 5 documents for the RAG prompt.

    Args:
        query: Query text.

    Returns:
        The result of ``index.search`` with no filters and no boosts.
    """
    # NOTE(review): no boosts are applied here, although tuned boosts are
    # computed earlier in the notebook — confirm whether that is intended.
    return index.search(
        query=query,
        filter_dict={},  # might filter per ministry
        boost_dict={},
        num_results=5,
    )



def build_prompt(query, search_results):
    """Assemble the LLM prompt from the question and the retrieved documents.

    Args:
        query: The user question, substituted for {question}.
        search_results: Iterable of result dicts; each is rendered with
            ``entry_template.format(**doc)``.

    Returns:
        ``prompt_template`` filled in with the question and the rendered
        documents, with surrounding whitespace stripped.

    Raises:
        KeyError: If a result dict lacks a field used by ``entry_template``.
    """
    # Each entry is followed by a blank line. The stray f-string expression
    # that used to sit in the loop was discarded at runtime and is removed.
    context = "".join(
        entry_template.format(**doc) + "\n\n" for doc in search_results
    )
    return prompt_template.format(question=query, context=context).strip()

def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the only user message.
        model: Chat model name. Defaults to 'gpt-4o-mini', the value that was
            previously hard-coded, so existing ``llm(prompt)`` calls are
            unchanged.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

def rag(query):
    """Answer a question with retrieval-augmented generation.

    Retrieves documents with search(), renders them into a prompt with
    build_prompt(), and returns the text produced by llm().

    Args:
        query: The user question.

    Returns:
        The model's answer text.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [34]:
# End-to-end demo: run the RAG flow for one question and show the answer.
# `question` and `answer` are reused by the evaluation cells further down.
question = 'what steps has she taken to ensure that a culture of human rights and racial tolerance is embraced in schools?'
answer = rag(question)
print(answer)

The Minister of Justice and Constitutional Development has taken steps to promote a culture of human rights and racial tolerance in schools, particularly in response to increasing incidents of racism. Specific actions taken include the implementation of educational programs and initiatives aimed at fostering inclusivity and understanding among students. The focus is on creating an environment where human rights are respected and racial tolerance is encouraged among young learners. Further details regarding these specific programs were not provided in the available context, but the Minister appears committed to addressing these issues within the education system.


### RAG evaluation

In [30]:
# LLM-as-judge prompt: asks the model to rate how relevant a generated answer
# is to its question and to reply as JSON.
# Placeholders: {question}, {answer_llm}. The doubled braces {{ }} are
# str.format escapes that render as literal braces in the JSON example.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [31]:
# Number of ground-truth questions available for evaluation.
len(ground_truth)


1500

In [None]:
# Render the judge prompt for the question/answer pair from the RAG demo cell.
# `answer_llm` was never defined in this notebook (NameError on a fresh run);
# the generated answer lives in `answer`, returned by rag(question).
prompt = prompt2_template.format(question=question, answer_llm=answer)
print(prompt)

In [None]:
# Terminal commands kept for reference (not executed in this notebook):
# pipenv run pgcli -h localhost -U your_username -d pmg_assist -W

# pipenv shell
# cd pmg-assist
# export POSTGRES_HOST=localhost
# python pmg_db_prep.py


# pipenv shell
# cd grafana
# env | grep POSTGRES_HOST
# python init.py