In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-08-22 15:52:05--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: 'minsearch.py'

     0K ...                                                   100%  391K=0.01s

2024-08-22 15:52:05 (391 KB/s) - 'minsearch.py' saved [3832/3832]



In [1]:
import pandas as pd

## Ingestion

In [2]:
df = pd.read_csv('../data/data.csv')

In [3]:
documents = df.to_dict(orient='records')

In [4]:
import minsearch

In [5]:
# Create the minsearch index: the three columns below are registered as text
# fields and `id` as a keyword field.
# NOTE(review): presumably text fields are what the query is matched against
# and keyword fields are used for exact-match filtering — confirm in the
# minsearch documentation.
index = minsearch.Index(
    text_fields=['глава', 'содержание', 'статья'],
    keyword_fields=['id']
)

In [6]:
index.fit(documents)

<minsearch.Index at 0x7f228e9ae090>

## RAG flow

In [7]:
from openai import OpenAI

client = OpenAI()

In [8]:
def search(query):
    """Look up ``query`` in the notebook-level ``index``.

    The search runs with an empty filter and an empty boost mapping and asks
    for 10 results; whatever ``index.search`` returns is passed through.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [9]:
# Prompt sent to the answering model; {question} and {context} are filled in
# by build_prompt().
prompt_template = """
You're a law instructor. Answer the QUESTION based on the CONTEXT from our law database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# How one retrieved document is rendered inside the CONTEXT section; the
# placeholders are the document's field names.
entry_template = """
глава: {глава}
содержание: {содержание}
статья: {статья}
""".strip()

def build_prompt(query, search_results):
    """Render the final prompt for ``query`` from the retrieved documents.

    Args:
        query: the user's question, inserted verbatim after ``QUESTION:``.
        search_results: iterable of dicts; each must provide the keys used by
            ``entry_template`` (extra keys are ignored by ``str.format``).

    Returns:
        The prompt string with surrounding whitespace stripped. With no
        search results the prompt simply ends at ``CONTEXT:``.

    Raises:
        KeyError: if a document lacks one of the ``entry_template`` fields.
    """
    # Entries are separated by a blank line; joining avoids repeated string
    # concatenation and the trailing separator that was stripped afterwards.
    context = "\n\n".join(entry_template.format(**doc) for doc in search_results)
    return prompt_template.format(question=query, context=context).strip()

In [10]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the notebook-level ``client``; ``model`` selects the chat model.
    Only the content of the first returned choice is returned.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [18]:
def rag(query, model='gpt-4o-mini'):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the model.

    Retrieval goes through ``search``, the prompt through ``build_prompt``,
    and ``model`` is forwarded to ``llm``. Returns the model's answer.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model=model)

In [19]:
question = 'Какая статья говорит о штрафе за вандализм?'
answer = rag(question)
print(answer)

Штраф за вандализм указан в статье 448. В ней говорится о том, что вандализм, совершенный несовершеннолетними, влечет штраф на родителей или лиц, их заменяющих, в размере пятнадцати месячных расчетных показателей.


## Retrieval evaluation

In [20]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [21]:
df_question.head()

Unnamed: 0,id,question
0,0,Каково содержание законодательства Республики ...
1,0,На каких документах основывается Кодекс об адм...
2,0,Как осуществляется внесение изменений в настоя...
3,0,Каков приоритет международных договоров по сра...
4,0,Что представляет собой составная часть законод...


In [22]:
ground_truth = df_question.to_dict(orient='records')

In [23]:
ground_truth[0]

{'id': 0,
 'question': 'Каково содержание законодательства Республики Казахстан об административных правонарушениях?'}

In [24]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: one sequence of booleans per query; ``True`` marks a
            result that matched the expected document.

    Returns:
        A float in [0, 1]. An empty ``relevance_total`` yields ``0.0`` instead
        of raising ``ZeroDivisionError``.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated, so there is no hit to report.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one sequence of booleans per query, in rank order
            (index 0 is the top result); ``True`` marks a relevant result.

    Returns:
        The average over queries of ``1 / rank`` of the first relevant
        result, counting ``0`` for queries with no relevant result. An empty
        ``relevance_total`` yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this break a
                # query with several matching results would add more than one
                # reciprocal rank and the metric could exceed 1.
                break

    return total_score / total

In [25]:
def minsearch_search(query, boost=None):
    """Search the notebook-level ``index`` for ``query`` and return 10 results.

    Args:
        query: the text to search for.
        boost: optional mapping passed as ``boost_dict``. ``None`` (the
            default) means an empty mapping, which is what this function
            always used before the parameter existed, so existing
            one-argument calls behave the same.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    return index.search(
        query=query,
        filter_dict={},
        # A fresh dict per call avoids sharing a mutable default.
        boost_dict={} if boost is None else boost,
        num_results=10,
    )

In [26]:
def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score it.

    For each record the returned results are turned into a list of booleans
    (``result['id'] == record['id']``). The per-record lists are then
    summarised with ``hit_rate`` and ``mrr``.

    Returns a dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_for(record):
        # Read the expected id first, then compare it with each result's id.
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [27]:
from tqdm.auto import tqdm

In [28]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/3365 [00:00<?, ?it/s]

{'hit_rate': 0.8520059435364041, 'mrr': 0.7199194556475385}

## Finding the best parameters

In [30]:
# Positional split of the ground-truth questions: the first 1000 rows become
# the validation set (used below to tune the boosts), the rest the test set.
# NOTE(review): there is no shuffling here — confirm that the CSV row order
# makes the first 1000 rows a representative validation sample.
df_validation = df_question[:1000]
df_test = df_question[1000:]

In [31]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes ``objective_function``.

    Args:
        param_ranges: mapping of parameter name to ``(min_val, max_val)``.
            When both bounds are ``int`` the value is drawn with ``randint``
            (both bounds inclusive); otherwise with ``uniform``.
        objective_function: callable that receives the sampled parameter dict
            and returns a score; a higher score is better.
        n_iterations: number of random parameter sets to try.
        seed: optional seed for a private ``random.Random`` so a run can be
            reproduced. ``None`` (the default) keeps drawing from the global
            ``random`` module, exactly as before this parameter existed.

    Returns:
        ``(best_params, best_score)``. If no candidate scored above
        ``-inf`` (for example ``n_iterations`` is 0) this is
        ``(None, float('-inf'))``.
    """
    # The module itself and a Random instance both expose randint/uniform.
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing: any real score beats -inf

    for _ in range(n_iterations):
        # Draw one candidate value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the candidate only when it strictly improves the best score.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [32]:
gt_val = df_validation.to_dict(orient='records')

In [33]:
def minsearch_search(query, boost=None):
    """Query the notebook-level ``index`` with optional per-field boosts.

    ``boost`` is handed to ``index.search`` as ``boost_dict``; when it is
    ``None`` an empty dict is used instead. No filter is applied and 10
    results are requested.
    """
    effective_boost = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=effective_boost,
        num_results=10,
    )

In [34]:
# Search space for the boosts: a (min, max) pair per text field.
# The bounds are floats, so simple_optimize draws them with random.uniform
# rather than random.randint.
param_ranges = {
    'глава': (0.0, 3.0),
    'содержание': (0.0, 3.0),
    'статья': (0.0, 3.0)
}

def objective(boost_params):
    """Score a boost configuration by its MRR on the validation questions.

    Runs ``evaluate`` over ``gt_val`` with a search that passes
    ``boost_params`` to ``minsearch_search`` and returns the ``'mrr'`` value.
    """
    metrics = evaluate(
        gt_val,
        lambda record: minsearch_search(record['question'], boost_params),
    )
    return metrics['mrr']

In [35]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

({'глава': 0.7182290589975411,
  'содержание': 2.673300236669002,
  'статья': 1.8473594836520246},
 0.7306821428571428)

In [36]:
def minsearch_improved(query):
    """Search the notebook-level ``index`` with fixed per-field boosts.

    The boost values are the rounded parameters reported by the random
    search above. No filter is applied and 10 results are requested.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={
            'глава': 0.72,
            'содержание': 2.67,
            'статья': 1.85,
        },
        num_results=10,
    )

# Score the boosted search on every ground-truth question.
# NOTE(review): `ground_truth` is the full question set, including the first
# 1000 rows that were used to tune these boosts, so this is not a clean
# held-out score; `df_test` holds the rows that were not used for tuning.
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/3365 [00:00<?, ?it/s]

{'hit_rate': 0.8627043090638931, 'mrr': 0.7326169249274744}

## RAG evaluation

In [37]:
# Prompt for the LLM-as-a-judge step. {question} and {answer_llm} are filled
# in with str.format(); the doubled braces {{ }} are escapes that render as
# literal { } around the JSON skeleton the judge is asked to return.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [38]:
len(ground_truth)

3365

In [39]:
record = ground_truth[0]


In [42]:
answer_llm = rag(question) 

'Какая статья говорит о штрафе за вандализм?'

In [43]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: Какая статья говорит о штрафе за вандализм?
Generated Answer: Штраф за вандализм несовершеннолетних предусмотрен в статье 448. В соответствии с ней, вандализм, совершенный несовершеннолетними в возрасте до шестнадцати лет, влечет штраф на родителей или лиц, их заменяющих, в размере пятнадцати месячных расчетных показателей.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [44]:
import json

In [48]:
df_sample = df_question.sample(n=200, random_state=1)

In [49]:
sample = df_sample.to_dict(orient='records')

In [50]:
# Collects one (record, answer, evaluation) tuple per sampled question.
evaluations = []

for record in tqdm(sample):
    question = record['question']
    # Generate the answer with rag()'s default model.
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    # Ask the judge (llm()'s default model) to grade the answer.
    evaluation = llm(prompt)
    # NOTE(review): json.loads raises if the reply is not valid JSON, which
    # stops the loop part-way through the sample — consider guarding it.
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [52]:
# Flatten the (record, answer, evaluation) tuples into one row per question:
# pull the id/question out of the record and the verdict out of the judge's
# JSON, then discard the two nested columns.
raw_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval = raw_eval.assign(
    id=raw_eval['record'].map(lambda rec: rec['id']),
    question=raw_eval['record'].map(lambda rec: rec['question']),
    relevance=raw_eval['evaluation'].map(lambda verdict: verdict['Relevance']),
    explanation=raw_eval['evaluation'].map(lambda verdict: verdict['Explanation']),
).drop(columns=['record', 'evaluation'])

In [53]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.915
PARTLY_RELEVANT    0.070
NON_RELEVANT       0.015
Name: proportion, dtype: float64

In [62]:
df_eval.relevance.value_counts()

relevance
RELEVANT           183
PARTLY_RELEVANT     14
NON_RELEVANT         3
Name: count, dtype: int64

In [54]:
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [55]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
43,В соответствии с Законом РК от 28.12.17 г. № 1...,412,Какие изменения были внесены в часть 2 статьи ...,NON_RELEVANT,The generated answer does not address changes ...
51,Размеры штрафов за дискриминацию для разных ка...,94,Каковы размеры штрафов для разных категорий су...,NON_RELEVANT,The generated answer does not address the ques...
198,В контексте предоставленной информации не соде...,290,Какая редакция статьи 259 была утверждена 24 н...,NON_RELEVANT,The generated answer states that it lacks info...


In [56]:
# Same evaluation as above, but the answers are generated with gpt-4o.
# Collects one (record, answer, evaluation) tuple per sampled question.
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    # Only the answering model changes; the judge call below still uses
    # llm()'s default model.
    answer_llm = rag(question, model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    # NOTE(review): json.loads raises if the reply is not valid JSON, which
    # stops the loop part-way through the sample — consider guarding it.
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [57]:
# Flatten the gpt-4o (record, answer, evaluation) tuples into one row per
# question, then discard the two nested columns.
raw_eval_gpt4o = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

df_eval = raw_eval_gpt4o.assign(
    id=raw_eval_gpt4o['record'].map(lambda rec: rec['id']),
    question=raw_eval_gpt4o['record'].map(lambda rec: rec['question']),
    relevance=raw_eval_gpt4o['evaluation'].map(lambda verdict: verdict['Relevance']),
    explanation=raw_eval_gpt4o['evaluation'].map(lambda verdict: verdict['Explanation']),
).drop(columns=['record', 'evaluation'])

In [58]:
df_eval.relevance.value_counts()

relevance
RELEVANT           185
PARTLY_RELEVANT      8
NON_RELEVANT         7
Name: count, dtype: int64

In [59]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.925
PARTLY_RELEVANT    0.040
NON_RELEVANT       0.035
Name: proportion, dtype: float64

In [60]:
df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)