In [386]:
import math
import json
import random
import requests

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from yaml import safe_load
from tqdm import tqdm
from typing import TypedDict

## Configuration

In [387]:
# Retrieve configuration file for elastic
with open('../../config/debug.config.yml') as config:
    config_file = safe_load(config)['elastic']

## Document Retrieval

In [388]:
# Initiate connection with Elasticsearch
es_client = Elasticsearch(
    f"{config_file['protocol']}://{config_file['host']}:{config_file['port']}",
    ca_certs='../../ca.crt',
    basic_auth=(config_file['user'], config_file['password'])
)

In [389]:
# Retrieve all ids and save them in a file
def retrieve_all_ids():
    mongo_ids = open('./data/mongo-ids.txt', mode='w+')

    for hit in tqdm(scan(es_client, index='apis', query={'query': {'match_all': {}}, 'fields': ['metadata.mongo-id', 'metadata.api.name'], "_source": False}), total=943254):
        if hit['fields']['metadata'][0]['api'][0]['name'] != '':
            mongo_ids.write(f"{hit['fields']['metadata'][0]['mongo-id'][0]}\n")

    mongo_ids.close()
    
retrieve_all_ids()

100%|██████████| 943254/943254 [05:51<00:00, 2682.81it/s]


In [390]:
class Document(TypedDict):
    masked_words: str
    mangled_words: str


# Load saved ids and pseudo-randomly select `n`
def retrieve_documents(n: int = 1000, percentage_to_remove: int = 30, percentage_to_mangle: int = 30, seed: int = None) -> dict[str, Document]:
    retrieved: dict[str, Document] = {}
    ids = open('./data/mongo-ids.txt').readlines()

    if seed: random.seed(seed)
    random.shuffle(ids)

    for i in tqdm(range(n), desc="Retrieving documents"):
        res = requests.get(f'http://localhost:8080/api/v1/specification/{ids[i].strip()}')
        specification = json.dumps(res.json()['specification']).replace('\'', '\\\'').replace('\\\"', '\'')
        preprocessed = requests.post('http://localhost:8080/api/v1/preprocess', json={'query': specification}).json()['query']

        masked_words = mask_query(preprocessed, percentage_to_remove, seed=seed)
        mangled_words = mangle_query(preprocessed, percentage_to_mangle, seed=seed)

        retrieved[ids[i].strip()] = {
            "masked_words": masked_words,
            "mangled_words": mangled_words
        }

    return retrieved

## Masking

We randomly delete x% of the words or x% of the characters from the query and check if the system is still able to retrieve the correct document.

In [391]:
def mask_query(query: str, percentage_to_remove: int = 30, seed: int = None) -> str:
    if seed: random.seed(seed)

    split = query.split(" ")
    to_remove = random.sample(range(len(split)), math.floor(percentage_to_remove / 100 * len(split)))
    new_population = [ind for i, ind in enumerate(split) if i not in to_remove]

    return ' '.join(new_population)

## Mangling

We randomly change x% of the words or x% of the characters in the query and check if the system is still able to retrieve the correct document. If the mangling is done on the character level, then new random characters are inserted; if the mangling is done on the word level, synonyms are inserted.

In [392]:
def mangle_query(query: str, percentage_to_change: int = 30, seed: int = None) -> str:
    if seed: random.seed(seed)

    split = query.split(" ")
    
    return ""

In [394]:
from functools import reduce


# Population of 943'254, confidence level of 95%, and margin of error of 5% = 385 samples
def perform_evaluation(sample_size: int, percentage_to_remove: int = 30, percentage_to_mangle: int = 30, seed: int = None):
    documents = retrieve_documents(sample_size, percentage_to_remove, percentage_to_mangle, seed)
    positions = [0 for _ in range(sample_size)]
    
    for ind, key in tqdm(enumerate(documents), total=sample_size, desc="Computing results"):
        search_res = requests.post('http://localhost:8080/api/v1/search', json={
            'fragment': documents[key]['masked_words'],
            'filters': 'length>=0'
        }).json()
        
        for ind1, hit in enumerate(search_res):
            if hit['metadata']['mongo-id'] == key:
                positions[ind] = ind1
        
    print(reduce(lambda acc, x: acc+1 if x>0 else acc, positions, 0))

perform_evaluation(385, 80, 80)

Retrieving documents: 100%|██████████| 385/385 [14:43<00:00,  2.29s/it]  
Computing results: 100%|██████████| 385/385 [00:56<00:00,  6.77it/s]

0



