In [1]:
import pandas as pd
from tqdm.auto import tqdm
import re
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, BulkIndexError
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import json

In [2]:
# Load the flight-manual records (one dict per document, each carrying its own 'id').
# The encoding is pinned to UTF-8 so decoding does not depend on the platform locale.
with open('../data/flight_manuals_documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [3]:
documents[0]

{'manual_section': 'AOM Section 3',
 'scenario': 'Electrical Failure',
 'instructions': 'Advise cabin crew of situation, maintain visual contact with other aircraft.',
 'id': '61162cbd'}

In [4]:
# Sentence-embedding model used for every document and query vector in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
# Encode a sample question to inspect the embedding the model returns.
v = model.encode('How to handle hydraulic failure during flight?')


In [6]:
len(v)

384

In [7]:
# Client for the Elasticsearch instance expected at http://localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

In [8]:
index_name = "flight_manuals"

# Index definition: one shard, no replicas, and a 384-dimensional dense vector
# field indexed with cosine similarity next to the raw document fields.
# Note that 'instructions' is mapped as a keyword, unlike the other text fields.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "manual_section": {"type": "text"},
            "scenario": {"type": "text"},
            "instructions": {"type": "keyword"},
            "id": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}

# Delete the index first (tolerating its absence), then create it from the
# definition above, so re-running this cell recreates it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'flight_manuals'})

In [9]:
from tqdm.auto import tqdm


In [10]:
# Attach to every document the combined "scenario. instructions" text and its
# embedding; both are stored on the document dict itself.
for doc in tqdm(documents):
    text = ". ".join([doc['scenario'], doc['instructions']])
    doc['text'] = text
    doc['text_vector'] = model.encode(text)

  0%|          | 0/300 [00:00<?, ?it/s]

In [11]:
# Send the enriched documents to Elasticsearch one request at a time.
# No explicit document id is passed to the index call.
for doc in tqdm(documents):
    es_client.index(document=doc, index=index_name)

  0%|          | 0/300 [00:00<?, ?it/s]

In [13]:
# Sample question used to sanity-check vector retrieval.
query = 'How to handle hydraulic failure during flight?'


In [14]:
# Embedding of the sample question, produced by the same model as the document vectors.
v_q = model.encode(query)


In [15]:
def elastic_search_knn(field, vector, k=5, num_candidates=10000):
    """Run a kNN search against the flight-manuals index and return the hit sources.

    Args:
        field: Name of the dense-vector field to search (e.g. 'text_vector').
        vector: Query embedding; it must have the same dimensionality as ``field``.
        k: Number of nearest neighbours to return. Defaults to 5.
        num_candidates: Number of candidates considered per shard before the
            top ``k`` are selected. Defaults to 10000; Elasticsearch expects
            it to be at least ``k``.

    Returns:
        A list with the ``_source`` dict of each hit, in ranked order, limited
        to the text, scenario, manual_section, instructions and id fields.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        # Tie the page size to k so a caller asking for more than the default
        # page size still receives all k neighbours.
        "size": k,
        "_source": ["text", "scenario", "manual_section", "instructions", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [16]:
elastic_search_knn('text_vector', v_q)

[{'instructions': 'Limit control inputs to essential movements, monitor hydraulic pressure levels.',
  'scenario': 'Hydraulic Failure',
  'id': 'e0cf7641',
  'text': 'Hydraulic Failure. Limit control inputs to essential movements, monitor hydraulic pressure levels.',
  'manual_section': 'QRH Section 3'},
 {'instructions': 'Limit control inputs to essential movements, monitor hydraulic pressure levels.',
  'scenario': 'Hydraulic Failure',
  'id': '322dd456',
  'text': 'Hydraulic Failure. Limit control inputs to essential movements, monitor hydraulic pressure levels.',
  'manual_section': 'AOM Section 7'},
 {'instructions': 'Limit control inputs to essential movements, monitor hydraulic pressure levels.',
  'scenario': 'Hydraulic Failure',
  'id': 'fe79dc8e',
  'text': 'Hydraulic Failure. Limit control inputs to essential movements, monitor hydraulic pressure levels.',
  'manual_section': 'FCOM Section 3'},
 {'instructions': 'Limit control inputs to essential movements, monitor hydraulic

In [21]:
import pandas as pd


In [22]:
# Ground-truth set used for evaluation: rows pair a document id with a question.
df_ground_truth = pd.read_csv('../data/ground-truth-data-flight-manuals.csv')


In [23]:
# Convert to a list of per-row dicts so the evaluation loops can read q['id'] / q['question'].
ground_truth = df_ground_truth.to_dict(orient='records')


In [24]:
ground_truth[0]

{'id': '61162cbd',
 'question': 'What specific instructions should I give to the cabin crew during an electrical failure?'}

In [25]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: One list of booleans per query; an entry is True when
            the result at that rank is the expected document.

    Returns:
        The share of queries with at least one True entry, or 0.0 when
        ``relevance_total`` is empty.
    """
    # An empty evaluation set has no hits; return 0.0 instead of dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [26]:
def mrr(relevance_total):
    """Return the mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list of booleans per query; an entry is True when
            the result at that rank is the expected document.

    Returns:
        The average over all queries of 1 / rank of the first True entry
        (a query with no True entry contributes 0), or 0.0 when
        ``relevance_total`` is empty.
    """
    # An empty evaluation set has no score; return 0.0 instead of dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, a list with
                # several True entries would add several reciprocal ranks and
                # the per-query score could exceed 1.
                break

    return total_score / len(relevance_total)

In [45]:

def evaluate(ground_truth):
    """Score kNN retrieval on the ground-truth questions.

    Each record's question is embedded with ``model``, searched through
    ``elastic_search_knn`` on the 'text_vector' field, and every returned hit
    is marked relevant when its id equals the record's id.

    Args:
        ground_truth: Iterable of dicts with 'id' and 'question' keys.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    def _relevance_for(record):
        # Per-rank flags: True where the hit is the document the question was written for.
        expected_id = record['id']
        question_vector = model.encode(record['question'])
        hits = elastic_search_knn('text_vector', question_vector)
        return [hit['id'] == expected_id for hit in hits]

    relevance_total = [_relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


In [46]:
evaluate(ground_truth)

  0%|          | 0/1500 [00:00<?, ?it/s]

{'hit_rate': 0.5506666666666666, 'mrr': 0.2596333333333315}

### Let's evaluate more search configurations and look for the best boosting parameters

In [94]:
def elastic_search_knn_boost(field, vector, boosts, query_text=None):
    """Rank documents by cosine similarity, optionally blended with boosted text relevance.

    Every document is scored with ``cosineSimilarity(query_vector, field) + 1.0``
    through a ``script_score`` query. When ``query_text`` is given, a
    ``multi_match`` over the boosted fields is added as an optional clause and
    its ``_score`` is added to the vector score, so ``boosts`` can change the
    ranking. Without ``query_text`` there is no text to match, the base query
    is ``match_all``, and ``boosts`` has no effect on the ranking.

    Args:
        field: Name of the dense-vector field to compare against (e.g. 'text_vector').
        vector: Query embedding with the same dimensionality as ``field``.
        boosts: Mapping of text field name to boost weight, e.g. {'text': 1.5}.
        query_text: Optional raw question text matched against the boosted fields.

    Returns:
        A list with the ``_source`` dict of the top 5 hits, in ranked order.
    """
    # Use the caller's vector field rather than a hard-coded name.
    script_source = f"cosineSimilarity(params.query_vector, '{field}') + 1.0"

    if query_text and boosts:
        fields_with_boosts = [f"{name}^{weight}" for name, weight in boosts.items()]
        base_query = {
            "bool": {
                # The filter keeps every document eligible; documents that do
                # not match the text contribute a text score of 0.
                "filter": [{"match_all": {}}],
                "should": [
                    {
                        "multi_match": {
                            "query": query_text,
                            "fields": fields_with_boosts
                        }
                    }
                ]
            }
        }
        # _score is the score of the inner query, i.e. the boosted text match.
        script_source += " + _score"
    else:
        base_query = {"match_all": {}}

    search_query = {
        "size": 5,
        "query": {
            "script_score": {
                "query": base_query,
                "script": {
                    "source": script_source,
                    "params": {
                        "query_vector": vector
                    }
                }
            }
        },
        "_source": ["text", "scenario", "manual_section", "instructions", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [86]:
# Fields that receive a random boost, plus the running "best so far" state
# that the random-search loop updates.
fields = ["text", "scenario", "manual_section", "instructions"]
best_boosts = None
best_metrics = dict.fromkeys(('hit_rate', 'mrr'), 0)

In [72]:
import random    

In [92]:
def evaluate_boosts(ground_truth, boosts):
    """Score the boosted search on the ground-truth questions for one set of boosts.

    Each record's question is embedded with ``model`` and passed, with
    ``boosts``, to ``elastic_search_knn_boost`` on the 'text_vector' field;
    a hit is relevant when its id equals the record's id.

    NOTE(review): only the embedding is forwarded to the search helper, not the
    question text, so boosts on text fields have nothing to score against.

    Args:
        ground_truth: Iterable of dicts with 'id' and 'question' keys.
        boosts: Mapping of field name to boost weight.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        question_vector = model.encode(record['question'])
        hits = elastic_search_knn_boost('text_vector', question_vector, boosts)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [95]:
# Random search over per-field boost weights, keeping the best set seen so far.
N_TRIALS = 10
# A dedicated, seeded generator makes the sampled boosts reproducible on re-run
# without touching the global random state.
boost_rng = random.Random(42)

for i in range(N_TRIALS):
    boosts = {field: round(boost_rng.uniform(1, 3), 2) for field in fields}
    print(f"iteration: {i}, Boosts: {boosts}")

    # evaluate the current boosting params
    metrics = evaluate_boosts(ground_truth, boosts)
    print(f"Hit_Rate: {metrics['hit_rate']}, MRR: {metrics['mrr']}")

    # Prefer the higher hit rate and break ties on MRR; tuple comparison is
    # lexicographic, so it expresses exactly that ordering.
    if (metrics['hit_rate'], metrics['mrr']) > (best_metrics['hit_rate'], best_metrics['mrr']):
        best_metrics = metrics
        best_boosts = boosts

    # Reported every iteration so the best-so-far is visible even if the
    # long-running search is interrupted.
    print("\nBest Boosting Parameters:")
    print(best_boosts)
    print(f"Best Hit Rate: {best_metrics['hit_rate']}, Best MRR: {best_metrics['mrr']}")


iteration: 0, Boosts: {'text': 1.67, 'scenario': 2.14, 'manual_section': 1.63, 'instructions': 2.63}


100%|██████████| 1500/1500 [02:50<00:00,  8.80it/s]


Hit_Rate: 0.5506666666666666, MRR: 0.2596333333333315

Best Boosting Parameters:
{'text': 1.67, 'scenario': 2.14, 'manual_section': 1.63, 'instructions': 2.63}
Best Hit Rate: 0.5506666666666666, Best MRR: 0.2596333333333315
iteration: 1, Boosts: {'text': 2.15, 'scenario': 2.66, 'manual_section': 2.37, 'instructions': 2.75}


100%|██████████| 1500/1500 [02:48<00:00,  8.93it/s]


Hit_Rate: 0.5506666666666666, MRR: 0.2596333333333315

Best Boosting Parameters:
{'text': 1.67, 'scenario': 2.14, 'manual_section': 1.63, 'instructions': 2.63}
Best Hit Rate: 0.5506666666666666, Best MRR: 0.2596333333333315
iteration: 2, Boosts: {'text': 1.49, 'scenario': 2.2, 'manual_section': 1.28, 'instructions': 1.25}


100%|██████████| 1500/1500 [02:57<00:00,  8.45it/s]


Hit_Rate: 0.5506666666666666, MRR: 0.2596333333333315

Best Boosting Parameters:
{'text': 1.67, 'scenario': 2.14, 'manual_section': 1.63, 'instructions': 2.63}
Best Hit Rate: 0.5506666666666666, Best MRR: 0.2596333333333315
iteration: 3, Boosts: {'text': 1.04, 'scenario': 1.56, 'manual_section': 2.13, 'instructions': 2.27}


100%|██████████| 1500/1500 [02:47<00:00,  8.94it/s]


Hit_Rate: 0.5506666666666666, MRR: 0.2596333333333315

Best Boosting Parameters:
{'text': 1.67, 'scenario': 2.14, 'manual_section': 1.63, 'instructions': 2.63}
Best Hit Rate: 0.5506666666666666, Best MRR: 0.2596333333333315
iteration: 4, Boosts: {'text': 1.22, 'scenario': 2.81, 'manual_section': 1.95, 'instructions': 2.61}


 44%|████▍     | 663/1500 [01:11<01:30,  9.28it/s]


KeyboardInterrupt: 