In [11]:
import re
import sys
import json
import tarfile
import urllib3
import configparser

import tensorflow_hub as hub
import tensorflow.compat.v1 as tf

from tqdm import tqdm
from gensim.models import Doc2Vec
from elasticsearch import Elasticsearch

In [12]:
urllib3.disable_warnings()

config = configparser.ConfigParser()
config.read('../server.ini');

In [None]:
es = Elasticsearch(
    config['ELASTIC']['path'],
    verify_certs=False
)

In [14]:
es.info()

ObjectApiResponse({'name': 'a976e3f6f878', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'tSjSotunTNerL5c1VQMsPw', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'd9ec3fa628c7b0ba3d25692e277ba26814820b20', 'build_date': '2023-11-04T10:04:57.184859352Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [15]:
model = Doc2Vec.load('../out/models/word2vec.model')

In [7]:
specs = []
specs_names = []
specs_versions = []

tar = tarfile.open('../data/data.tar.gz', 'r:gz')
nl_tags = r'(?:\'|\")(?:description|name|title|summary)(?:\'|\"):\s(?:\'|\")([^\'\"]+)(?:\'|\")'

for spec in tqdm(tar.getmembers(), file=sys.stdout):
    file = tar.extractfile(spec)
    
    if file is not None:
        try:
            contents = json.load(file)
            
            specs_names.append(contents['info']['title'])
            specs_versions.append(contents['info']['version'] if not contents['info']['version'].startswith('v') else contents['info']['version'][1:])
            specs.append(' '.join(re.findall(nl_tags, str(contents))))
        except json.JSONDecodeError:
            continue

100%|██████████| 3996/3996 [00:13<00:00, 295.93it/s]


In [None]:
print("Downloading pre-trained embeddings from tensorflow hub...")
tf.disable_eager_execution()
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
text_ph = tf.placeholder(tf.string)
embeddings = embed(text_ph)
print("Done.")

In [None]:
print("Creating tensorflow session...")
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5
session = tf.Session(config=config)
session.run(tf.global_variables_initializer())
session.run(tf.tables_initializer())
print("Done.")

In [17]:
for ind in tqdm(range(len(specs))):
    es.index(
        index='oa-specifications',
        document= {
            'name': specs_names[ind],
            'version': specs_versions[ind],
            'document_vector': session.run(embeddings, feed_dict={text_ph: [specs[ind]]}).tolist()[0]
        }
    )

100%|██████████| 3992/3992 [00:41<00:00, 96.82it/s] 


In [0]:
query = 'backup client'

knn_query = {
    'field': 'document_vector',
    'query_vector': session.run(embeddings, feed_dict={text_ph: [query]}).tolist()[0],
    'k': 5,
    'num_candidates': 3992
}

results = es.knn_search(index='oa-specifications', knn=knn_query, source=['name', 'version'])['hits']['hits']

In [23]:
print(f'These are the top 5 results of the query "{query}":\n')

for ind, result in enumerate(results):
    length_name = max([len(el['_source']['name']) for el in results]) + 2
    length_version = max([len(el['_source']['version']) for el in results]) + 2

    print(f'\t{str(ind + 1) + ".": <2} {result["_source"]["name"]: <{length_name}} v.{result["_source"]["version"]: <{length_version}} [{str(result["_score"] * 100)[:2]}%]')

These are the top 5 results of the query "backup client":

	1. RecoveryServicesBackupClient   v.2016-06-01   [82%]
	2. BackupManagementClient         v.2016-05-01   [82%]
	3. BackupManagementClient         v.2018-09-01   [81%]
	4. BackupManagementClient         v.2018-09-01   [81%]
	5. BackupManagementClient         v.2016-05-01   [81%]


  results = es.knn_search(index='oa-specifications', knn=knn_query, source=['name', 'version'])['hits']['hits']
