# What

This notebook will be where the work for #165 will take place.

As per the issue, there are three steps: getting the embedding dataset, creating the vector database, and then building the RAG on top.

## Modules

In [None]:
# engine
from engine.OpenAICaller import openAICaller
from viewer import Searching

# Third Party
import pandas as pd
import numpy as np

from tqdm import tqdm

from dotenv import load_dotenv
load_dotenv('../../.env')

import openai

from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
import lancedb
import pyarrow as pa

import voyageai

import yaml

# built in
import os
from typing import Callable
from importlib import reload

reload(Searching)

# Getting embeddings ready


In [None]:
def read_embedding(file_name, embedding_folder = 'embeddings'):
    """
    Load a pickled embeddings DataFrame.

    Arguments
    file_name - Name of the pickle file inside `embedding_folder`.
    embedding_folder - Directory that holds the embedding pickles. Defaults to
        'embeddings' (relative to the working directory).

    Returns whatever object `pd.read_pickle` deserialises from the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that were
    # produced by this project.
    return pd.read_pickle(os.path.join(embedding_folder, file_name))

## Safety issues

In [None]:
# Load the safety issue embeddings and expose the embedding column under the
# name `vector`.
safety_issue_embeddings = (
    read_embedding('voyageai_si_embeddings.pkl')
    .rename(columns={'si_embedding': 'vector'})
)
display(safety_issue_embeddings)

print(f"Number of unique reports: {len(set(safety_issue_embeddings['report_id']))}")

### Checking safety issue distributions

I have done a rerun of the safety issue extraction and included all reports since 2000. This means I have a lot more safety issues. I need to recheck whether the safety issue extraction is good enough. I think it is quite variable, as there seem to be 660ish safety issues where the last run-through only had 560. What's more, I didn't think I was doing a run-through that would redo the safety issues.

In [None]:
# The first four characters of each report id are parsed as the year.
safety_issue_embeddings['year'] = safety_issue_embeddings['report_id'].map(
    lambda report_id: int(report_id[:4])
)

# Distribution of safety issues per year.
safety_issue_embeddings['year'].hist()


## Important text

In [None]:
# Embeddings of the "important text" extracted from each report.
important_text_embeddings = read_embedding(file_name='voyageai_important_text_embeddings.pkl')

important_text_embeddings

## Section text embeddings

In [None]:
# Embeddings of the individual report sections.
section_text_embeddings = read_embedding(file_name='voyageai_section_embeddings.pkl')

print(f"Number of unique reports: {len(set(section_text_embeddings['report_id']))}")

section_text_embeddings

# Creating vector database

In [None]:

# Load the evaluation searches. The context manager closes the file handle as
# soon as the YAML has been parsed instead of leaving it open for the kernel's lifetime.
with open('data/evaluation_searches.yaml') as evaluation_file:
    test_searches = yaml.safe_load(evaluation_file)

In [None]:
# Location of the LanceDB database used by this notebook.
# Alternative (remote) location: 'az://vectordb/lancedb'
uri = 'databases/safety_issue_rag-lancedb'

db = lancedb.connect(uri)

In [None]:
# Voyage AI client shared by the embedding and reranking helpers.
vo = voyageai.Client()

def embed_query(text):
    """
    Embed a single search query with Voyage AI.

    Arguments
    text - The query string to embed.

    Returns the embedding of `text` (the first and only entry of the response's
    `embeddings`).
    """
    # `embed` is documented to take a list of texts, so wrap the single query
    # and unwrap the single embedding that comes back.
    response = vo.embed(
        [text],
        model="voyage-large-2-instruct",
        input_type="query",
        truncation=False,
    )
    return response.embeddings[0]

In [None]:
def table_search(table, query, limit = 100, type: str = 'vector') -> pd.DataFrame:
    """
    Search a LanceDB table and return the hits with a normalised relevance score.

    Arguments
    table - Table object to search (must support `.search(...)`).
    query - The query text.
    limit - Maximum number of rows to return.
    type - 'hybrid', 'fts' or 'vector'. Any other value is treated as 'vector',
        which is also what the previous (list-valued) default resolved to.

    Returns
    The hits as a DataFrame with a `section_relevance_score` column that is
    min-max scaled to [0, 1] (1 is the best hit). For vector search the raw score
    is `1 - cosine distance`. When every hit has the same raw score (including
    the single-hit case) all rows get 1.0 instead of NaN. An empty result is
    returned unchanged apart from the column rename.
    """
    if type == 'hybrid':
        results = (
            table.search((embed_query(query), query), query_type='hybrid')
            .metric("cosine")
            .limit(limit)
            .to_pandas()
            .rename(columns={'_relevance_score': 'section_relevance_score'})
        )
    elif type == 'fts':
        results = (
            table.search(query, query_type='fts')
            .limit(limit)
            .to_pandas()
            .rename(columns={'score': 'section_relevance_score'})
        )
    else:  # type == 'vector'
        results = (
            table.search(embed_query(query), query_type='vector')
            .metric("cosine")
            .limit(limit)
            .to_pandas()
            .rename(columns={'_distance': 'section_relevance_score'})
        )
        # Turn the distance into a similarity so that larger means more relevant.
        results['section_relevance_score'] = 1 - results['section_relevance_score']

    scores = results['section_relevance_score']
    if scores.empty:
        # Nothing to normalise.
        return results

    lowest = scores.min()
    score_range = scores.max() - lowest
    if score_range == 0:
        # All hits are tied; dividing by the zero range would produce NaN.
        results['section_relevance_score'] = 1.0
    else:
        results['section_relevance_score'] = (scores - lowest) / score_range

    return results

## basic safety issues

I am noticing that the search is not really returning the safety issues which are relevant. One big thing is that it isn't even finding the exact same safety issue! This is a bit of a problem.

In [None]:
# Build the safety issue table from the embeddings frame (mode='overwrite').
si_table = db.create_table(
    'safety_issue_embeddings',
    data=safety_issue_embeddings,
    mode='overwrite',
)

# Full-text-search index on the `si` column.
si_table.create_fts_index("si", replace=True)

In [None]:
def safety_issue_search(query, limit = 100, min_year = 2006):
    """
    Vector search (cosine metric) over the safety issue table.

    Arguments
    query - The query text; it is embedded with `embed_query`.
    limit - Maximum number of safety issues to return.
    min_year - Only safety issues whose `year` is at least this value are
        searched (applied as a prefilter). Defaults to 2006, the previously
        hard-coded cutoff. Pass None to search every year.

    Returns the matching rows as a DataFrame.
    """
    search = si_table.search(embed_query(query)).metric("cosine")

    if min_year is not None:
        # int() keeps the filter expression a plain number.
        search = search.where(f"year >= {int(min_year)}", prefilter=True)

    return search.limit(limit).to_pandas()

In [None]:
query = "Close proximity events at unattended aerodromes"

# Year of the row labelled 0 in the search results.
safety_issue_search(query)['year'].loc[0]

## Reranking of Safety issue search

In [None]:
def rerank_results(query, results, n_results = 10):
    """
    Rerank safety issue search results with the Voyage AI reranker.

    Arguments
    query - The query the results were retrieved for.
    results - DataFrame of search hits. Must contain the columns `report_id`,
        `si`, `vector`, `year` and `_distance`.
    n_results - Number of top reranked results to keep (`top_k`).

    Returns
    DataFrame in reranked order with the columns `report_id`, `si`,
    `previous_rank` (row position in `results`), `relevance_score`, `vector`,
    `year` and `_distance`.
    """
    # Positional index so that the reranker's `index` lines up with the row position.
    candidates = results.reset_index(drop=True)

    reranking = vo.rerank(
        query,
        candidates['si'].tolist(),
        model = "rerank-1",
        truncation = False,
        top_k = n_results,
    )

    reranked_results = pd.DataFrame(
        [(result.index, result.relevance_score) for result in reranking.results],
        columns=['previous_rank', 'relevance_score'],
    ).astype({'previous_rank': 'int64'})

    # Join back on the row position rather than on the safety issue text:
    # joining on text duplicates rows whenever two safety issues share the same wording.
    merged_df = reranked_results.merge(candidates, left_on='previous_rank', right_index=True)

    return merged_df[['report_id', 'si', 'previous_rank', 'relevance_score', 'vector', 'year', '_distance']]


In [None]:
query = "Close proximity incidents at unattended aerodromes"

results = safety_issue_search(query)

# Keep only the rows of report 2008_001 among the top 20 reranked results.
rerank_results(query, results, n_results=20).loc[
    lambda reranked: reranked['report_id'] == "2008_001"
]

## Adding search of reports text to help find relevant safety issues

In [None]:
# Build the report sections table; the embedding column is exposed under the name `vector`.
report_sections_table = db.create_table(
    'report_section_embeddings',
    data=section_text_embeddings.rename(columns={'section_text_embedding': 'vector'}),
    mode='overwrite',
)

print('Making fts index')
report_sections_table.create_fts_index('section_text', replace=True)

In [None]:
def report_sections_search(query, limit = 100, type = 'vector'):
    """Run `table_search` against `report_sections_table` with the given query, limit and search type."""
    return table_search(table = report_sections_table, query = query, limit = limit, type = type)

In [None]:
# Year of the row labelled 0 in the current `results` frame.
results['year'].loc[0]

In [None]:
query = "Vortex ring state (or 'settling with power')"

# Vector search over the report sections with a limit of 50000 rows.
results = report_sections_search(query=query, limit=50000, type='vector')

results

In [None]:
def get_reports_relevance(query):
    """
    Score each report by how well its sections match `query`.

    Runs a full-text search over the report sections, keeps the 50 best scoring
    sections of every report and averages their `section_relevance_score`.

    Returns a dict mapping report_id to mean relevance, ordered from most to
    least relevant.
    """
    section_hits = report_sections_search(query, limit = 50000, type = 'fts')

    best_first = section_hits.sort_values(by='section_relevance_score', ascending=False)
    top_sections = best_first.groupby('report_id').head(50)

    mean_relevance = top_sections.groupby('report_id')['section_relevance_score'].mean()

    return mean_relevance.sort_values(ascending=False).to_dict()


# Report relevance for the first evaluation search.
get_reports_relevance(query=test_searches[0]['query'])

In [None]:
def safety_issue_search_with_report_relevance(query):
    """
    Safety issue search whose distances are discounted by report relevance.

    Each safety issue's `_distance` is multiplied by `1 - relevance` of its
    report (as given by `get_reports_relevance`); safety issues from reports
    without a relevance score keep their distance. The result is sorted by the
    adjusted distance and its previous index is kept as an `index` column.
    """
    candidates = safety_issue_search(query, limit = 500)
    relevance_by_report = get_reports_relevance(query)

    def adjusted_distance(row):
        # Reports the relevance search did not score get a neutral factor of 1.
        report_id = row['report_id']
        if report_id in relevance_by_report:
            factor = 1 - relevance_by_report[report_id]
        else:
            factor = 1
        return row['_distance'] * factor

    candidates['_distance'] = candidates.apply(adjusted_distance, axis = 1)

    return candidates.sort_values(by = '_distance').reset_index(drop=False)

# Where do the safety issues of report 2015_201 land for the third evaluation search?
safety_issue_search_with_report_relevance(test_searches[2]['query']).loc[
    lambda ranked: ranked['report_id'] == "2015_201"
]

## Engine implementation

In [None]:
# Create the engine before using it: the table lookup previously ran in a cell
# above the one that defines `searchEngine`, so it failed on a fresh kernel.
reload(Searching)
searchEngine = Searching.SearchEngine('../../viewer/vector_db')

searchEngine.db.open_table('safety_issue_embeddings').to_pandas()

In [None]:
def searchEngineSearch(query):
    """Run `query` through `searchEngine` (all modes, years 2000-2024, RAG disabled) and return the search context."""
    # NOTE(review): Search, SearchSettings and Modes are not imported in the
    # visible import cell — confirm where they are expected to come from.
    return searchEngine.search(
        Search(
            query,
            SearchSettings(Modes.all_modes, (2000, 2024)),
        ),
        with_rag = False,
    ).getContext()

In [None]:
# Engine search context for the first evaluation search.
searchEngineSearch(query=test_searches[0]['query'])

## Vector Database search evaluation

There is a need to know how well a search is performing.
I can do this by having some examples of a search query, the reports it should find, etc., and what we would expect to see in the results.

In [None]:
def NDCG(results: pd.DataFrame, relevant_reports: list, at = 20):
    '''
    Calculates the normalized discounted cumulative gain (NDCG@at) with binary relevance.

    Arguments
    results - A dataframe of all of the safety issues, best first. The rank of a
        report is taken from its first occurrence in the `report_id` column.
    relevant_reports - A collection of all of the relevant report IDs. Relevance
        is treated as binary.
    at - The number of top ranked reports to consider.

    Returns
    A float in [0, 1]. 1.0 means the relevant reports occupy the top ranks;
    0.0 is returned when there are no results or no relevant reports.
    '''
    relevant = set(relevant_reports)

    # Unique report ids in order of first appearance, cut off at `at`.
    ranked_reports = results['report_id'].unique()[:at]

    # Binary gain: a relevant report at rank r contributes 1 / log2(r + 1).
    dcg = sum(
        1.0 / np.log2(rank + 1)
        for rank, report_id in enumerate(ranked_reports, start=1)
        if report_id in relevant
    )

    # The ideal ranking puts every relevant report (at most `at` of them) first.
    ideal_hits = min(len(relevant), at)
    idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, ideal_hits + 1))

    if idcg == 0:
        return 0.0

    return float(dcg / idcg)

def evaluate_search(search: dict, search_function: Callable[[str], pd.DataFrame], loss_function: Callable[[pd.DataFrame, list, int], float], valid_size = 20, verbose = False) -> float:
    """
    Score one evaluation search.

    Arguments
    search - Dict with a 'query' string and the 'expected_reports' ids.
    search_function - Maps a query to a results DataFrame with a `report_id` column.
    loss_function - Called as `loss_function(results, expected_report_ids, valid_size)`;
        its return value is the score.
    valid_size - Number of leading result rows inspected for the verbose
        diagnostics; also passed to `loss_function`.
    verbose - When True, print which expected reports are missing from the
        first `valid_size` rows and display the results.

    Returns the score from `loss_function`.

    Raises ValueError if the search results have no `report_id` column.
    """
    search_results = search_function(search['query'])

    if 'report_id' not in search_results.columns:
        raise ValueError("Search results must have a 'report_id' column")

    expected_report_ids = set(search['expected_reports'])
    search_report_ids = set(search_results['report_id'].head(valid_size))

    score = loss_function(search_results, expected_report_ids, valid_size)

    if verbose:
        present_reports = expected_report_ids.intersection(search_report_ids)
        # With nothing expected, nothing can be missing.
        percent_present_reports = (
            len(present_reports) / len(expected_report_ids) if expected_report_ids else 1.0
        )
        print(f"  Percentage of expected reports present in search results: {percent_present_reports} with score: {score}")

        if percent_present_reports != 1.0:
            missing_reports = expected_report_ids.difference(search_report_ids)
            print(f"  Missing reports: {missing_reports}")

            # Index label of the first row of each missing report, or None when
            # the report does not appear anywhere in the results.
            missing_positions = []
            for report_id in missing_reports:
                matching_rows = search_results.index[search_results['report_id'] == report_id]
                missing_positions.append(matching_rows[0] if len(matching_rows) else None)
            print(f"  These are at index {missing_positions}")

        display(search_results)

    return score
    

In [None]:
def evaluate_searches(searches, search_function, verbose=False):
    """
    Average NDCG score of `search_function` over all evaluation `searches`.

    Each search is scored with `evaluate_search` using `NDCG` as the loss
    function. When `verbose` is True the position and query of every search is
    printed before it is evaluated.
    """
    scores = []

    for position, search in enumerate(searches):
        if verbose:
            print(f"{position} Evaluating search: '{search['query']}'")

        score = evaluate_search(search, search_function, loss_function=NDCG, verbose = verbose)
        scores.append(score)

    return sum(scores) / len(scores)
    

In [None]:
def evaluate_search_functions(test_searches, search_functions):
    """
    Score several search functions on the same evaluation searches.

    Arguments
    test_searches - The evaluation searches passed on to `evaluate_searches`.
    search_functions - List of dicts with a 'name' and a 'function' entry.

    Returns a DataFrame with one row per search function and the columns
    `search_function` (its name) and `score`.
    """
    progress = tqdm(search_functions)
    rows = []

    for candidate in progress:
        progress.set_description(f"Evaluating {candidate['name']}")

        score = evaluate_searches(test_searches, candidate['function'], verbose = False)
        rows.append({'search_function': candidate['name'], 'score': score})

    return pd.DataFrame(rows)

In [None]:
reload(Searching)

# Candidate search strategies to compare.
search_functions = [
    dict(
        name='reranked_search',
        function=lambda query: rerank_results(query, safety_issue_search(query, limit = 1000), n_results = 500),
    ),
    dict(
        name='report_relevance_search',
        function=safety_issue_search_with_report_relevance,
    ),
    dict(
        name='search_engine',
        function=searchEngineSearch,
    ),
]

different_models = evaluate_search_functions(test_searches, search_functions)

# Get the best model: the name in the highest scoring row, then its function.
best_search_function_name = different_models.sort_values('score', ascending = False)['search_function'].iloc[0]

best_search_function = next(
    candidate['function']
    for candidate in search_functions
    if candidate['name'] == best_search_function_name
)

different_models

In [None]:
def get_rank_of_report(query, report_id, search_function):
    """
    Display where `report_id` shows up in the results of `search_function(query)`.

    Two frames are displayed: every result row belonging to the report, and the
    report's row after the results are reduced to the first row per report and
    re-indexed (so the index of that row is the report's rank, starting at 0).
    """
    all_rows = search_function(query)
    display(all_rows[all_rows['report_id'] == report_id])

    first_row_per_report = all_rows.drop_duplicates(subset = ['report_id']).reset_index(drop=True)
    display(first_row_per_report[first_row_per_report['report_id'] == report_id])

In [None]:
# Rank of report 2014_005 for the second evaluation search, using the best scoring search function.
get_rank_of_report(
    query=test_searches[1]['query'],
    report_id='2014_005',
    search_function=best_search_function,
)

Each evaluation highlights these problems:

- Missing older reports that might be significant as reports only go back as far as 2000
- Missing reports that don't mention the search query in their safety issues (`safety_issue_embeddings`)
- Relevant reports can get lost in the search (e.g. with 2015_201 being at place 25).

The first issue will be ignored for now due to not being able to reasonably get text extraction.
The second issue can be fixed by adding more context to the search by including the whole report for a full text search of sorts.
The last issue is a ranking issue, as 25 is still pretty close to the top.

# Adding RAG on top of the vector database

In [None]:
def rag_prompt(query, context):
    """Build the user prompt for the RAG answer from the question and the retrieved safety issue context."""
    return f"""
Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
My question is: {query}

Here are relevant safety issues as context:
{context}

It is important to provide references to specifc reports and safety issues in your answer.
"""

def rag_search(query, search_function: Callable[[str], pd.DataFrame]):
    """
    Answer `query` with retrieval augmented generation over the safety issues.

    Steps:
    1. Ask the model to turn the user's question into a vector database query.
    2. Run `search_function` on that query and keep the first 50 rows.
    3. Ask the model to answer the original question using those rows as context.

    Arguments
    query - The user's question.
    search_function - Maps a query string to a DataFrame with the columns
        `report_id`, `si` and `_distance`. A `safety_issue_id` column is used to
        label the safety issues when present; otherwise the row index is used
        (the reranked search output has no `safety_issue_id` column).

    Returns a dict with 'relevant_safety_issues' (the rows used as context) and
    'response' (the model's answer).
    """
    print("Understanding query...")

    formatted_query = openAICaller.query(
        system = """
You are a helpful agent inside a RAG system.

You will recieve a query from the user and return a query that should be sent to a vector database.

The database will search a dataset of safety issues from transport accident investigation reports.  It will use both embeddings and full text search.
""",
        user = query,
        model="gpt-4",
        temp = 0.0
    )
    print(f' Going to run query: "{formatted_query}"')

    print("Getting relevant safety issues...")

    search_results = search_function(formatted_query).head(50)
    with pd.option_context('display.max_rows', 20):
        display(search_results)

    # Not every search function returns a safety issue id; fall back to the row
    # index rather than failing with a KeyError.
    if 'safety_issue_id' in search_results.columns:
        safety_issue_ids = search_results['safety_issue_id']
    else:
        safety_issue_ids = search_results.index

    # NOTE(review): `_distance` is presented to the model as "relevance"; for a
    # cosine distance smaller means closer — confirm this label is intended.
    user_message = "\n".join(
        f"{safety_issue_id} from report {report} with relevance {rel} - {si}"
        for safety_issue_id, report, si, rel in zip(
            safety_issue_ids,
            search_results['report_id'],
            search_results['si'],
            search_results['_distance'],
        )
    )

    print("Summarizing relevant safety issues...")
    response = openAICaller.query(
        system = """
You are a helpful AI that is part of a RAG system. You are going to help answer questions about transport accident investigations.

The questions are from investigators and researchers from the Transport Accident Investigation Commission. The context you will be given are safety issues extracted from all of TAICs reports.

A couple of useful defintions for you are:

Safety factor - Any (non-trivial) events or conditions, which increases safety risk. If they occurred in the future, these would
increase the likelihood of an occurrence, and/or the
severity of any adverse consequences associated with the
occurrence.

Safety issue - A safety factor that:
• can reasonably be regarded as having the
potential to adversely affect the safety of future
operations, and
• is characteristic of an organisation, a system, or an
operational environment at a specific point in time.
Safety Issues are derived from safety factors classified
either as Risk Controls or Organisational Influences.

Safety theme - Indication of recurring circumstances or causes, either across transport modes or over time. A safety theme may
cover a single safety issue, or two or more related safety
issues.  
""",
        user=rag_prompt(query, user_message),
        model="gpt-4",
        temp = 0.2
    )

    return {
        'relevant_safety_issues': search_results,
        'response': response
    }

In [None]:
# Run the RAG pipeline on the second evaluation search with the best scoring search function.
query = test_searches[1]['query']
results = rag_search(query=query, search_function=best_search_function)

# The safety issues that were handed to the model as context.
view = results['relevant_safety_issues']

print(results['response'])