In [159]:
import json
import random
import requests

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from yaml import safe_load
from tqdm import tqdm

## Configuration

In [94]:
# Retrieve configuration file for elastic
with open('../../config/debug.config.yml') as config:
    config_file = safe_load(config)['elastic']

## Document Retrieval

In [95]:
# Initiate connection with Elasticsearch
es_client = Elasticsearch(
    f"{config_file['protocol']}://{config_file['host']}:{config_file['port']}",
    ca_certs='../../ca.crt',
    basic_auth=(config_file['user'], config_file['password'])
)

In [96]:
# Retrieve all ids and save them in a file
def retrieve_all_ids():
    mongo_ids = open('./data/mongo-ids.txt', mode='w+')

    for hit in tqdm(scan(es_client, index='apis', query={'query': {'match_all': {}}}), total=943254):
        mongo_ids.write(f"{hit['_source']['metadata']['mongo-id']}\n")

    mongo_ids.close()

100%|██████████| 943254/943254 [17:51<00:00, 880.11it/s] 


In [197]:
# Load saved ids and pseudo-randomly select `n`
def retrieve_documents(n: int = 1000, seed: int = None):
    retrieved: list[str] = []
    ids = open('./data/mongo-ids.txt').readlines()
    
    if seed: random.seed(seed)
    random.shuffle(ids)
    
    for i in tqdm(range(n)):
        res = requests.get(f'http://localhost:8080/api/v1/specification/{ids[i].strip()}')
        specification = json.dumps(res.json()['specification']).replace('\'', '\\\'').replace('\\\"', '\'')
        preprocessed = requests.post('http://localhost:8080/api/v1/preprocess', json={'query': specification}).json()['query']
        print(specification)
        print()
        print(preprocessed)
        
        retrieved.append(preprocessed)
        break
        
    return retrieved

In [198]:
documents = retrieve_documents(seed=100)

  0%|          | 0/1000 [00:00<?, ?it/s]


"{'openapi': '3.0.1','info': {'title': 'API Reference','version': 'v1'},'tags': [{'name': 'Account Management'}],'paths': {'/api/v1/ifi/{ifiID}/accounts': {'get': {'tags': ['Account Management'],'summary': 'Get all accounts of IFI','operationId': 'getAccounts','parameters': [{'name': 'ifiID','in': 'path','description': 'ID of the IFI (on-boarded organization) under which the accounts exist','required': true,'schema': {'type': 'integer','format': 'int64'},'example': {'$numberInt':'123123'}},{'name': 'pageNumber','in': 'query','required': true,'schema': {'type': 'integer','format': 'int64'}},{'name': 'pageSize','in': 'query','required': true,'schema': {'type': 'integer','format': 'int64'}},{'name': 'sortBy','in': 'query','schema': {'type': 'string'}},{'name': 'sortOrder','in': 'query','schema': {'type': 'string','enum': ['asc','desc']}},{'name': 'pageIndex','in': 'query','schema': {'type': 'string'}},{'name': 'lastFetchedId','in': 'query','schema': {'type': 'string'}}],'responses': {'200

In [166]:
documents[0]

<Response [404]>