# Ingestion 

## Load Documents with ids

In [9]:
import json
import pandas as pd

from tqdm.auto import tqdm
from openai import OpenAI
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

In [10]:
# Load the FAQ documents (each record carries its own id).
# Decode explicitly as UTF-8: the data contains non-ASCII characters (e.g.
# typographic apostrophes), and the platform default encoding is not
# guaranteed to be UTF-8.
with open('notebooks/document-with-ids.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [11]:
# Inspect the first document to confirm which fields each record carries.
documents[0]

{'category': 'Manage existing benefit',
 'question': 'How do I update my benefit information?',
 'answer': 'You can update your benefit information online through your account.',
 'section': 'general claim benefits',
 'id': '30eada08-5708-5c5c-9df8-0f7d5d4dc131'}

## Load ground_truth data

In [12]:
# Ground-truth evaluation set: read the CSV, then keep it as a list of
# per-row dicts so individual records can be passed around directly.
df_ground_truth = pd.read_csv('notebooks/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict('records')

In [13]:
# Inspect a single ground-truth record to see its fields.
ground_truth[10]

{'question': 'Is it possible to appeal?',
 'section': 'general claim benefits',
 'document': '8d000ade-6c2b-571c-aa61-5d38eb463cf8'}

In [14]:
# Sentence-transformers model used to embed both the indexed documents and
# the incoming questions (see the `model.encode` calls further down).
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



## Indexing Elasticsearch with Data

In [15]:
# Client for the Elasticsearch instance listening on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

index_name = "benefit-claims"

# Index definition: one shard, no replicas, the FAQ fields, and a 384-dim
# cosine-similarity dense vector for the combined question+answer embedding.
index_settings = {
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {
        "properties": {
            "answer": dict(type="text"),
            "category": dict(type="text"),
            "question": dict(type="text"),
            "section": dict(type="keyword"),
            "id": dict(type="keyword"),
            "question_answer_vector": dict(
                type="dense_vector",
                dims=384,
                index=True,
                similarity="cosine",
            ),
        }
    },
}

# Drop any existing index first (a missing index is not an error), then
# create it again from the definition above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'benefit-claims'})

In [16]:
# Embed every document (question and answer together) and index it.
for doc in tqdm(documents):
    question = doc['question']
    answer = doc['answer']
    # The embedding is stored on the document itself so it is indexed
    # alongside the other fields.
    doc['question_answer_vector'] = model.encode(question + ' ' + answer)

    # Use the document's own id as the Elasticsearch _id: re-running this
    # cell then overwrites existing entries instead of adding duplicates
    # that would crowd the top-k search results.
    es_client.index(index=index_name, id=doc['id'], document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 425/425 [00:18<00:00, 22.61it/s]


## Retrieval

In [17]:
def elastic_search_knn(field, vector, section, k=5, num_candidates=10000):
    """Run a section-filtered kNN search and return the matching documents.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against `field`.
        section: Only documents whose `section` equals this value are considered.
        k: Number of nearest neighbours to return (defaults to 5).
        num_candidates: Value sent as the kNN `num_candidates` option
            (defaults to 10000).

    Returns:
        A list with the `_source` of every hit, limited to the answer,
        section, question, category and id fields.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "section": section
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["answer", "section", "question", "category", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

def question_answer_vector_knn(q):
    """Embed the question in `q` and search the question+answer vectors.

    `q` must provide the keys 'question' and 'section'; the section is
    forwarded as the search filter.
    """
    query_embedding = model.encode(q['question'])
    return elastic_search_knn('question_answer_vector', query_embedding, q['section'])

In [18]:
# Example retrieval: one question restricted to a single section.
question_answer_vector_knn({
    'question': "Can I get sick pay if I'm self-isolating?",
    'section': 'general claim benefits',
})

[{'question': "Can I get sick pay if I'm self-isolating?",
  'answer': "Yes Statutory Sick Pay is available if you're self-isolating.",
  'section': 'general claim benefits',
  'id': '1de35e0b-f233-554c-84ef-fc30494e0ea0',
  'category': 'Temporarily unable to work'},
 {'question': 'How do I apply for sick pay?',
  'answer': 'You need to provide a fit note from your doctor to apply for sick pay.',
  'section': 'general claim benefits',
  'id': 'ecc42084-d88e-5bf0-8070-a18552c283bb',
  'category': 'Temporarily unable to work'},
 {'question': 'What is statutory sick pay?',
  'answer': "Statutory Sick Pay is a legal requirement for employers to pay you if you're ill.",
  'section': 'general claim benefits',
  'id': 'ee6a77ec-d4ef-50f7-9762-04cc228b3a48',
  'category': 'Temporarily unable to work'},
 {'question': 'How do I claim SSP?',
  'answer': 'Statutory Sick Pay is claimed through your employer if you’re too ill to work.',
  'section': 'general claim benefits',
  'id': 'fa497683-9615-5

## RAG flow with Vector Search (encoded question and answer)

In [19]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the question and the retrieved documents.

    Args:
        query: The user's question text.
        search_results: Iterable of documents, each providing the keys
            'category', 'question', 'answer' and 'section'.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    prompt_template = """
You are an expert in United Kingdom Benefit Claims and Medical Negligence Claims. Answer the QUESTION based on the CONTEXT from 
the FAQ databases of Benefits database and NHS claims management. 
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    # One block per retrieved document, each followed by a blank line.
    context = "".join(
        f"category: {hit['category']}\nquestion: {hit['question']}\nanswer: {hit['answer']}\nsection: {hit['section']}\n\n"
        for hit in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [20]:
# OpenAI client used by `llm`. No credentials are passed here, so they are
# presumably picked up from the environment — confirm an API key is configured.
client = OpenAI()

def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the model's reply.

    Args:
        prompt: Full prompt text to send.
        model: Chat model name. The default was previously misspelled as
            'gpt-40-mini' (digit zero instead of the letter "o"), which is
            not the model name used everywhere else in this notebook.

    Returns:
        The text content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [21]:
def rag(query: dict, model='gpt-4o-mini') -> str:
    """Answer `query['question']` using retrieval-augmented generation.

    Retrieves documents for the query via vector search, builds the prompt
    from them, and returns the reply of the given chat model.
    """
    retrieved_docs = question_answer_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved_docs), model=model)

In [22]:
# Smoke-test the full RAG flow on a single ground-truth question.
rag(ground_truth[50])

"Yes, Statutory Sick Pay is available if you're self-isolating."

In [23]:
# Show the ground-truth record that was passed to `rag`, for comparison
# with the generated answer.
ground_truth[50]

{'question': 'Is Statutory Sick Pay applicable to those who are self-isolating?',
 'section': 'general claim benefits',
 'document': '1de35e0b-f233-554c-84ef-fc30494e0ea0'}

## Generating Answers for LLM comparison
- gpt-4o
- gpt-4o-mini

## gpt-4o-mini

In [24]:
def rag(query: dict, model='gpt-4o-mini') -> str:
    """Run retrieval, prompt construction and generation for one query.

    `model` defaults to 'gpt-4o-mini', the model evaluated in this section.
    """
    hits = question_answer_vector_knn(query)
    filled_prompt = build_prompt(query['question'], hits)
    return llm(filled_prompt, model=model)

In [25]:
# Lookup table from document id to the full document record.
doc_idx = dict((document['id'], document) for document in documents)

In [31]:
# Generated answers keyed by ground-truth index. It lives in its own cell so
# the generation loop can be re-run and skip records it already answered.
answers = {}

In [32]:
# Generate a gpt-4o-mini answer for every ground-truth question and pair it
# with the answer of the document the question was derived from.
for i, rec in enumerate(tqdm(ground_truth)):
    # Already answered on a previous run of this cell: skip, so the loop can
    # resume after an interruption.
    if i in answers:
        continue

    # Name the model explicitly: `rag` is redefined elsewhere in this
    # notebook with a different default, so relying on the default would
    # make the result depend on which definition was executed last.
    answer_llm = rag(rec, model='gpt-4o-mini')
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['answer']

    answers[i] = {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'section': rec['section'],
    }

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 2055/2055 [30:32<00:00,  1.12it/s]


In [33]:
# Spot-check one generated answer next to the original answer.
answers[2]

{'answer_llm': 'To update your benefit information, you can do so online through your account. If you need to update your bank details for benefit payments, you can also do this through your online benefits account or by contacting your benefit office. Additionally, if you need to report a change in your circumstances, you can do that through your online benefits account or by contacting your benefit office as well.',
 'answer_orig': 'You can update your benefit information online through your account.',
 'document': '30eada08-5708-5c5c-9df8-0f7d5d4dc131',
 'question': 'What is the process to update my benefit information?',
 'section': 'general claim benefits'}

In [34]:
# Merge each generated answer with its ground-truth record, in record order.
# Only records that actually have an answer are included: pre-filling the
# list with None would leave placeholder rows behind whenever generation
# stopped early, and those do not belong in the DataFrame built from it.
results_gpt4o_mini = [
    {**answers[i], **ground_truth[i]}
    for i in sorted(answers)
]

In [35]:
# Tabulate the merged gpt-4o-mini results.
df_gpt4o_mini = pd.DataFrame(results_gpt4o_mini)

In [40]:
# Persist the gpt-4o-mini results without the DataFrame index column.
# NOTE(review): the `generate_data/` directory presumably has to exist already — confirm.
df_gpt4o_mini.to_csv('generate_data/resultsgpt4o-mini.csv', index=False)

## gpt-4o

In [41]:
def rag(query: dict, model='gpt-4o') -> str:
    """Run retrieval, prompt construction and generation for one query.

    `model` defaults to 'gpt-4o', the model evaluated in this section.
    """
    context_docs = question_answer_vector_knn(query)
    question_text = query['question']
    return llm(build_prompt(question_text, context_docs), model=model)

In [42]:
# Keep the gpt-4o answers in a dict that survives re-running this cell.
# Resetting the dict here would defeat the `if i in answers` guard below: a
# run that dies part-way (e.g. on an API quota error) would start again from
# scratch and discard every answer already generated.
# Delete `answers_gpt4o` to force a full regeneration.
try:
    answers_gpt4o
except NameError:
    answers_gpt4o = {}

# The following cells read `answers`, so point it at the gpt-4o results.
answers = answers_gpt4o

for i, rec in enumerate(tqdm(ground_truth)):
    # Already answered on a previous run of this cell: skip.
    if i in answers:
        continue

    # Name the model explicitly so the result does not depend on which
    # definition of `rag` (and therefore which default) was executed last.
    answer_llm = rag(rec, model='gpt-4o')
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['answer']

    answers[i] = {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'section': rec['section'],
    }

 91%|███████████████████████████████████████████████████████████████████████████████████████████▊         | 1867/2055 [32:34<03:16,  1.05s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Merge each generated answer with its ground-truth record, in record order.
# Only records that actually have an answer are included: pre-filling the
# list with None would leave placeholder rows behind whenever generation
# stopped early, and those do not belong in the DataFrame built from it.
results_gpt4o = [
    {**answers[i], **ground_truth[i]}
    for i in sorted(answers)
]

In [None]:
# Tabulate the merged gpt-4o results.
df_gpt4o = pd.DataFrame(results_gpt4o)

In [None]:
# Persist the gpt-4o results without the DataFrame index column.
# NOTE(review): the `generate_data/` directory presumably has to exist already — confirm.
df_gpt4o.to_csv('generate_data/resultsgpt4o.csv', index=False)

# LLM as a Judge

In [None]:
# Judge prompt 1: compares the generated answer with the original answer.
# The string must start on the same line as the assignment; a bare
# `name =` followed by a newline is a SyntaxError.
# The `<Insert ...>` markers are literal placeholders. The template also
# contains literal JSON braces, so it cannot be passed to `str.format` as is.
prompt1_template = """
You are a judge evaluating the quality of a generated answer. Below is the original answer, 
a question generated based on this answer, and the final answer generated for that question. 
Your task is to evaluate the relevance and similarity of the generated answer compared to the original answer, and label it as one of the following:

- RELEVANT: The generated answer is mostly aligned with the original answer.
- PARTLY RELEVANT: The generated answer is somewhat related but misses key details.
- NON_RELEVANT: The generated answer is not aligned with the original answer.

Original Answer: 
<Insert Original Answer>

Generated Question: 
<Insert Generated Question>

Generated Answer: 
<Insert Generated Answer>

Provide your evaluation in JSON format:

{
  "original_answer": "<Insert Original Answer>",
  "generated_question": "<Insert Generated Question>",
  "generated_answer": "<Insert Generated Answer>",
  "evaluation": "<Insert RELEVANT, PARTLY RELEVANT, or NON_RELEVANT>"
}
""".strip()

# Judge prompt 2: rates the generated answer against the question only,
# without reference to the original answer.
prompt2_template = """
You are a judge evaluating the quality of a generated answer based solely on the relevance to the given question. 
Below is the generated question and the final generated answer. 
Your task is to assess how well the generated answer responds to the question and label it as one of the following:

- RELEVANT: The generated answer fully addresses the question.
- PARTLY RELEVANT: The generated answer addresses the question but misses some key points.
- NON_RELEVANT: The generated answer does not address the question.

Generated Question: 
<Insert Generated Question>

Generated Answer: 
<Insert Generated Answer>

Provide your evaluation in JSON format:

{
  "generated_question": "<Insert Generated Question>",
  "generated_answer": "<Insert Generated Answer>",
  "evaluation": "<Insert RELEVANT, PARTLY RELEVANT, or NON_RELEVANT>"
}
""".strip()