In [1]:
import pandas as pd
import minsearch
import json

from tqdm.auto import tqdm
from openai import OpenAI
from elasticsearch import Elasticsearch

  from .autonotebook import tqdm as notebook_tqdm


## Load Documents

In [2]:
# Read the claims CSV into a DataFrame; the path is relative to the kernel's working directory.
df = pd.read_csv('notebooks/claims.csv')

In [3]:
# Convert the DataFrame to a list with one dict per row, keyed by column name.
documents = df.to_dict(orient='records')

In [4]:
documents[190]

{'Category': 'Low income',
 'Question': 'What is the Help to Save scheme?',
 'Answer': 'Help to Save is a government savings account for people on low incomes offering bonuses on your savings.',
 'Section': 'general claim benefits'}

In [5]:
# Read the ground-truth CSV and convert it to a list with one dict per row.
# Both the DataFrame and the list of records are kept as module-level names.
df_ground_truth = pd.read_csv('notebooks/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

## Minsearch

In [6]:
# Build the in-memory minsearch index over `documents`.
# Question, Answer and Category are registered as text fields; Section is registered
# as a keyword field, which is the field `search` passes in its filter_dict.
index = minsearch.Index(
    text_fields=["Question", "Answer", "Category"],
    keyword_fields=["Section"]
)
# The fit call is the last expression of the cell, so its return value is displayed.
index.fit(documents)

<minsearch.Index at 0x7e2125da1160>

In [7]:
q = 'What is the Help to Save scheme?'

## Implement an LLM call that accepts a query

In [8]:
# Baseline without retrieval: the raw question `q` is sent to the model as the only
# message, with no FAQ context attached.
# NOTE(review): OpenAI() is constructed with no arguments — presumably the API key is
# picked up from the environment; confirm before running.
client = OpenAI()
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

# Text of the first returned choice; displayed as the cell output.
response.choices[0].message.content

'The Help to Save scheme is a UK government initiative designed to encourage individuals with low incomes to save money. This program offers a 50% bonus on the amount saved, providing an incentive for those who might otherwise struggle to set aside funds. Here are the key details of the scheme:\n\n1. **Eligibility**: \n   - You must be receiving Working Tax Credit, or Child Tax Credit and be entitled to Working Tax Credit, or Universal Credit with an individual or household income of at least £658.64 over the last monthly assessment period.\n   - You must be a resident in the UK.\n\n2. **Account Features**:\n   - **Duration**: The Help to Save account lasts for four years from the date of opening.\n   - **Savings Limit**: You can save between £1 and £50 each calendar month.\n   - **Bonuses**: \n     - First Bonus: After the first two years, the government pays a 50% bonus on the highest amount saved.\n     - Second Bonus: After four years, the government pays a further 50% bonus on the

## Search the documents

In [9]:
def search(query, section='general claim benefits', num_results=10):
    """Search the minsearch index for FAQ records matching `query`.

    Args:
        query: free-text question to search for.
        section: value the ``Section`` keyword field must equal. Defaults to
            'general claim benefits', the value that was previously hard-coded.
        num_results: maximum number of records to return. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        Whatever ``index.search`` returns for these arguments (in this notebook,
        a list of record dicts).
    """
    # Matches in the Question field are weighted 3.0 and matches in Answer 0.5;
    # Category has no explicit boost entry.
    boost = {'Question': 3.0, 'Answer': 0.5}

    return index.search(
        query=query,
        filter_dict={'Section': section},
        boost_dict=boost,
        num_results=num_results,
    )

In [10]:
search(q)

[{'Category': 'Low income',
  'Question': 'What is the Help to Save scheme?',
  'Answer': 'Help to Save is a government savings account for people on low incomes offering bonuses on your savings.',
  'Section': 'general claim benefits'},
 {'Category': 'Looking for work',
  'Question': 'What is the Restart Scheme?',
  'Answer': 'The Restart Scheme offers tailored support to help long-term unemployed people find work.',
  'Section': 'general claim benefits'},
 {'Category': 'Looking for work',
  'Question': 'What is the Kickstart Scheme?',
  'Answer': 'The Kickstart Scheme provides funding to employers to create jobs for young people.',
  'Section': 'general claim benefits'},
 {'Category': 'Disabled or health condition',
  'Question': 'What is the Motability Scheme?',
  'Answer': 'The Motability Scheme helps disabled people lease a car scooter or powered wheelchair.',
  'Section': 'general claim benefits'},
 {'Category': 'Disabled or health condition',
  'Question': 'What is the Blue Badg

## RAG flow
- Building a prompt

In [11]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved records.

    Args:
        query: the question text substituted into the QUESTION slot.
        search_results: iterable of dicts, each providing the 'Category',
            'Question', 'Answer' and 'Section' keys.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.

    Raises:
        KeyError: if a record lacks one of the four keys listed above.
    """
    # Same text as a triple-quoted block with its outer newlines stripped; the
    # trailing spaces before some line breaks are part of the prompt.
    prompt_template = (
        "You are an expert in United Kingdom Benefit Claims and Medical Negligence Claims. "
        "Answer the QUESTION based on the CONTEXT from \n"
        "the FAQ databases of Benefits database and NHS claims management. \n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {Question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One four-line entry per record, each followed by a blank line.
    context = "".join(
        f"category: {doc['Category']}\nquestion: {doc['Question']}\nanswer: {doc['Answer']}\nsection: {doc['Section']}\n\n"
        for doc in search_results
    )

    # The final strip removes the blank line left after the last entry
    # (or the space after "CONTEXT:" when there are no entries).
    return prompt_template.format(Question=query, context=context).strip()

In [12]:
def llm(prompt):
    """Send `prompt` as a single user message to 'gpt-4o-mini' and return the reply text.

    Uses the module-level `client`; the text of the first choice is returned.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [13]:
query = "What is the Help to Save scheme?"

def rag(query):
    """Answer `query` with retrieval-augmented generation over the minsearch index.

    Pipeline: `search` retrieves records, `build_prompt` turns them into a
    prompt, and `llm` produces the answer text that is returned.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [14]:
rag(query)

'The Help to Save scheme is a government savings account designed for people on low incomes, which offers bonuses on your savings.'

## Text Search: Elasticsearch

#### Index mapping and document indexing in Elasticsearch

In [15]:
# Client for the Elasticsearch node at localhost:9200 (plain HTTP URL, no credentials passed).
es_client = Elasticsearch('http://localhost:9200') 

In [16]:
# Settings and field mapping for the text-search index.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Answer, Category and Question are mapped as "text"; these are the
            # fields `elastic_search` queries with multi_match.
            "Answer": {"type": "text"},
            "Category": {"type": "text"},
            "Question": {"type": "text"},
            # Section is mapped as "keyword"; `elastic_search` filters on it with a term query.
            "Section": {"type": "keyword"} 
        }
    }
}

index_name = "benefit-claims"

# Drop any existing index of this name first so the cell can be re-run;
# ignore_unavailable=True is passed so a missing index is tolerated.
# WARNING: this discards whatever was previously indexed under `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'benefit-claims'})

In [17]:
# Index the records with one request per document; tqdm renders the progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 425/425 [00:06<00:00, 67.22it/s]


In [18]:
def elastic_search(query, section="general claim benefits", size=5):
    """Full-text search over the Elasticsearch index, restricted to one section.

    Args:
        query: free-text question to match.
        section: value the ``Section`` field must equal (term filter). Defaults
            to "general claim benefits", the value that was previously hard-coded.
        size: maximum number of hits to return. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        list of ``_source`` dicts, one per hit, in the order Elasticsearch
        returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" weights matches in Question three times higher.
                        "fields": ["Question^3", "Answer", "Category"],
                        "type": "best_fields",
                    }
                },
                "filter": {
                    "term": {
                        "Section": section
                    }
                },
            }
        },
    }

    # Reads the module-level `es_client` and `index_name`.
    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [19]:
def rag(query):
    """Answer `query` with retrieval-augmented generation backed by Elasticsearch text search.

    Pipeline: `elastic_search` retrieves records, `build_prompt` turns them
    into a prompt, and `llm` produces the answer text that is returned.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [20]:
query = "What is the Help to Save scheme?"
rag(query)

'The Help to Save scheme is a government savings account designed for people on low incomes, offering bonuses on your savings.'

## Vector Search 

In [21]:
from sentence_transformers import SentenceTransformer

# Sentence-transformers model whose `encode` method is used in this notebook to embed
# both the document text and the query text.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [22]:
len(model.encode("Getting size of model dim"))

384

In [23]:
# Re-read the claims CSV so `df` and `documents` are rebuilt from the file
# before embedding vectors are attached to the records.
df = pd.read_csv('notebooks/claims.csv')
documents = df.to_dict(orient='records')

## Mapping and Index

In [24]:
# Settings and field mapping for the vector-search index.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Answer": {"type": "text"},
            "Category": {"type": "text"},
            "Question": {"type": "text"},
            # Section is a keyword field; the kNN search filters on it with a term query.
            "Section": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding of the question and answer text. 384 matches the length of the
            # vector `model.encode` returned in the size-check cell.
            "question_answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

# Same index name as the text-search index, so the delete below replaces that index.
index_name = "benefit-claims"

# ignore_unavailable=True is passed so a missing index is tolerated on re-run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'benefit-claims'})

In [25]:
# Embed each record and index it in Elasticsearch, one request per document.
# The embedded text is the question and the answer joined by a single space.
# NOTE: the vector is written into the dict held in `documents`, so every record in
# that list carries a 'question_answer_vector' key after this cell has run.
for doc in tqdm(documents):
    question = doc['Question']
    answer = doc['Answer']
    doc['question_answer_vector'] = model.encode(question + ' ' + answer)

    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 425/425 [00:15<00:00, 28.11it/s]


In [26]:
def elastic_search_knn(field, vector, section):
    """Run a section-filtered kNN query against Elasticsearch and return the hit sources.

    Args:
        field: name of the vector field to search.
        vector: query vector, sent as ``query_vector``.
        section: value the ``Section`` field must equal (term filter).

    Returns:
        list of ``_source`` dicts, one per hit; ``_source`` is restricted to the
        Answer, Section, Question, Category and id fields.
    """
    section_filter = {"term": {"Section": section}}

    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": section_filter,
        },
        "_source": ["Answer", "Section", "Question", "Category", "id"],
    }

    # Reads the module-level `es_client` and `index_name`.
    hits = es_client.search(index=index_name, body=request_body)['hits']['hits']

    return [hit['_source'] for hit in hits]

In [34]:
def question_answer_vector_knn(q):
    """Embed the question in `q` and run a kNN search on 'question_answer_vector'.

    Args:
        q: dict providing 'Question' (text to embed) and 'Section' (filter value).

    Returns:
        The list returned by `elastic_search_knn` for that vector and section.
    """
    # Both keys are read before encoding, so a missing key fails fast.
    text, section_name = q['Question'], q['Section']
    return elastic_search_knn('question_answer_vector', model.encode(text), section_name)

## Perform a Vector Search

In [36]:
question_answer_vector_knn(dict(
    Question='What is the Local Welfare Assistance scheme?',
    Section='general claim benefits'
))

[{'Answer': 'The Local Welfare Assistance scheme provides emergency financial help often for things like food clothing and utilities.',
  'Category': 'Low income',
  'Question': 'What is the Local Welfare Assistance scheme?',
  'Section': 'general claim benefits'},
 {'Answer': 'Help to Save is a government savings account for people on low incomes offering bonuses on your savings.',
  'Category': 'Low income',
  'Question': 'What is the Help to Save scheme?',
  'Section': 'general claim benefits'},
 {'Answer': 'The Hardship Fund provides financial assistance to those in urgent need often administered by local councils.',
  'Category': 'Low income',
  'Question': 'What is the Hardship Fund?',
  'Section': 'general claim benefits'},
 {'Answer': 'The Restart Scheme offers tailored support to help long-term unemployed people find work.',
  'Category': 'Looking for work',
  'Question': 'What is the Restart Scheme?',
  'Section': 'general claim benefits'},
 {'Answer': 'The Healthy Start Sche

## RAG flow with Vector Search

In [37]:
def build_prompt(query, search_results):
    """Fill the expert-answer prompt template with the question and retrieved records.

    This cell re-defines `build_prompt`; any definition made earlier in the
    kernel session is replaced by this one.

    Args:
        query: the question text substituted into the QUESTION slot.
        search_results: iterable of dicts, each providing the 'Category',
            'Question', 'Answer' and 'Section' keys.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    # Template pieces are written one per line; the trailing spaces before
    # some "\n" characters are part of the prompt text.
    template_lines = [
        "You are an expert in United Kingdom Benefit Claims and Medical Negligence Claims. Answer the QUESTION based on the CONTEXT from ",
        "the FAQ databases of Benefits database and NHS claims management. ",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {Question}",
        "",
        "CONTEXT: ",
        "{context}",
    ]
    prompt_template = "\n".join(template_lines)

    # Each record becomes a four-line entry followed by a blank line.
    entries = [
        f"category: {doc['Category']}\nquestion: {doc['Question']}\nanswer: {doc['Answer']}\nsection: {doc['Section']}\n\n"
        for doc in search_results
    ]

    filled = prompt_template.format(Question=query, context="".join(entries))
    return filled.strip()

In [41]:
def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message to the chat model and return the reply text.

    Args:
        prompt: full prompt text, sent as the content of one "user" message.
        model: chat model name passed to the API; defaults to 'gpt-4o-mini'.

    Returns:
        The message content of the first returned choice.
    """
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [44]:
def rag(query: dict, model='gpt-4o-mini') -> str:
    """Answer a question record with retrieval-augmented generation over the vector index.

    Args:
        query: dict passed to `question_answer_vector_knn`; its 'Question'
            value is also used as the question text in the prompt.
        model: chat model name forwarded to `llm`. Inside this function the
            parameter shadows the module-level `model`.

    Returns:
        The answer text produced by `llm`.
    """
    retrieved = question_answer_vector_knn(query)
    return llm(build_prompt(query['Question'], retrieved), model=model)

In [45]:
rag(documents[10])

"Yes, you can get sick pay if you're self-isolating. Statutory Sick Pay is available in this situation."

In [48]:
documents[10]['Answer']

"Yes Statutory Sick Pay is available if you're self-isolating."