In [2]:
import pandas as pd
from evaluation_metrics import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up on the next cell run without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import sys
import os

# Absolute path of the directory two levels above the current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Put that directory at the front of the import path so the `src` package
# imported below can be found (presumably it lives there -- confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)


from src.client_modules.elastic_search.elastic_search_client import ElasticSearchClient
from src.client_modules.embeddings.azure_openai import AzureOpenAIEmbeddingModel

In [5]:
# Read the evaluation set and convert it straight to a list of per-row dicts
# (the recorded output shows each record carrying 'id' and 'question' keys).
ground_truth = pd.read_csv(
    "../../data/evaluation_ground_truth.csv"
).to_dict("records")

# Peek at the first record as a sanity check.
ground_truth[0]

{'id': 'a7f6bd015c004e119b4a7851e6cd6be9',
 'question': 'Can you describe data abstraction?'}

In [6]:
def wrapper(search_query, index_name="cs-theory", es_port=9200):
    """Build a search callable from a ``script_score`` query template.

    Parameters
    ----------
    search_query : dict
        Elasticsearch request body that contains the path
        ``["query"]["script_score"]["script"]["params"]["query_vector"]``.
        The value stored there is treated as a placeholder and is replaced
        by the question embedding on every search. The dict itself is never
        modified, so the same template can be reused or inspected afterwards.
    index_name : str, optional
        Name of the index to search. Defaults to ``"cs-theory"``.
    es_port : int, optional
        Port handed to ``ElasticSearchClient``. Defaults to ``9200``.

    Returns
    -------
    callable
        ``search_function(q)``: takes a record with a ``"question"`` key,
        embeds the question, runs the query and returns
        ``res["hits"]["hits"]`` from the client's response.
    """
    import copy  # local import: keeps this cell runnable on its own

    # Create the clients once per wrapper rather than once per question;
    # they do not depend on the individual query.
    es_client = ElasticSearchClient(port=es_port)
    emb_model = AzureOpenAIEmbeddingModel()

    def search_function(q):
        question = q["question"]
        vector_search_term = emb_model.get_embeddings([question])[0]

        # Fill in a private copy of the template so the caller's dict keeps
        # its placeholder and is not left holding the last embedding.
        query = copy.deepcopy(search_query)
        query["query"]["script_score"]["script"]["params"]["query_vector"] = vector_search_term

        res = es_client.search(index_name=index_name, search_query=query)
        return res["hits"]["hits"]

    return search_function

In [7]:
def _script_score_query(source):
    """Return a fresh ``match_all`` + ``script_score`` request body.

    ``source`` is the scoring script. ``query_vector`` is set to the
    placeholder string "vector_search_term"; the real embedding is
    substituted at query time. Every call returns new, unshared dicts.
    """
    return {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": source,
                    "params": {"query_vector": "vector_search_term"},
                },
            }
        }
    }


# NOTE(review): the trailing "+ 1.0" in each script presumably keeps scores
# non-negative -- confirm against the index's vector normalisation.

# Variant 1: dot product against the combined question+answer vector only.
search_query1 = _script_score_query(
    "dotProduct(params.query_vector, 'combined_qa_vector') + 1.0"
)

# Variant 2: weighted blend (0.5 / 0.25 / 0.25) of the three stored vectors.
# The whitespace inside the script string is kept exactly as before.
search_query2 = _script_score_query("""
          0.5 * dotProduct(params.query_vector, 'combined_qa_vector') +
          0.25 * dotProduct(params.query_vector, 'answers_vector') +
          0.25 * dotProduct(params.query_vector, 'questions_vector') + 1.0
        """)

# Variant 3: unweighted mean of the three dot products.
search_query3 = _script_score_query("""
          (dotProduct(params.query_vector, 'combined_qa_vector') +
           dotProduct(params.query_vector, 'answers_vector') +
           dotProduct(params.query_vector, 'questions_vector')) / 3 + 1.0
        """)

In [8]:
# One search callable per scoring variant, built in the same order as the
# query templates they wrap.
search_function1, search_function2, search_function3 = (
    wrapper(template)
    for template in (search_query1, search_query2, search_query3)
)

In [14]:
# Evaluate variant 1 (combined_qa_vector only) on the ground-truth records.
# `evaluate` comes from evaluation_metrics; the recorded output shows it
# returning a dict with 'hit_rate' and 'mrr'.
results1 = evaluate(ground_truth, search_function1)
print(f"Results1: {results1}")

Results1: {'hit_rate': 0.9969708443771299, 'mrr': 0.9174866122139872}


In [15]:
# Evaluate variant 2 (0.5 / 0.25 / 0.25 weighted blend of the three vectors)
# on the same ground-truth records; the result has the same two metrics.
results2 = evaluate(ground_truth, search_function2)
print(f"Results2: {results2}")

Results2: {'hit_rate': 0.9954562665656949, 'mrr': 0.9129484382418872}


In [16]:
# Evaluate variant 3 (unweighted mean of the three dot products).
# Slow cell: the recorded run took about 29 minutes for 2641 items.
results3 = evaluate(ground_truth, search_function3)
print(f"Results3: {results3}")

100%|██████████| 2641/2641 [29:04<00:00,  1.51it/s]

Results3: {'hit_rate': 0.993563044301401, 'mrr': 0.9066280809938514}





In [21]:
import pandas as pd


# Side-by-side summary: one row per scoring variant, one column per metric.
pd.DataFrame(
    data=[results1, results2, results3],
    index=["res1", "res2", "res3"],
)

Unnamed: 0,hit_rate,mrr
res1,0.996971,0.917487
res2,0.995456,0.912948
res3,0.993563,0.906628
