In [67]:
import re
import os
import sys
import json
import tarfile

from tqdm import tqdm
from nltk import PorterStemmer
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [91]:
def train_doc2vec(corpus, retrain=False, path='../out/models/word2vec.model',
                  epochs=100, workers=10):
    """Train a Doc2Vec model on ``corpus``, or load the one saved at ``path``.

    Parameters
    ----------
    corpus : iterable
        Documents to train on. Each document is handed to ``TaggedDocument``
        as its word list and tagged with its position in ``corpus``, so the
        ids returned by similarity queries are indices into ``corpus``.
    retrain : bool, default False
        When True, train a new model even if one is already saved at ``path``
        and overwrite it.
    path : str, default '../out/models/word2vec.model'
        Location of the saved model. The default file name says "word2vec"
        although a Doc2Vec model is stored; it is kept so existing saved
        models are still found.
    epochs : int, default 100
        Number of training epochs passed to ``Doc2Vec``.
    workers : int, default 10
        Number of worker threads passed to ``Doc2Vec``.

    Returns
    -------
    Doc2Vec
        The freshly trained model, or the model loaded from ``path``.
    """
    if retrain or not os.path.isfile(path):
        # Create the output folder *before* training: a missing directory
        # would otherwise only surface when saving, after the whole training
        # run has already been paid for.
        model_dir = os.path.dirname(path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)

        tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
        doc2vec = Doc2Vec(tagged_documents, epochs=epochs, workers=workers)

        doc2vec.save(path)
    else:
        doc2vec = Doc2Vec.load(path)

    return doc2vec

In [52]:
def get_top_n_docs(model, names, versions, query, n=5):
    """Print and return the documents most similar to a free-text query.

    The query is tokenised with ``clean``, turned into a vector with
    ``model.infer_vector`` and compared against the document vectors in
    ``model.dv``.

    Parameters
    ----------
    model : object
        Trained model exposing ``infer_vector`` and ``dv.most_similar``.
    names, versions : sequence
        Display name and version per document, indexed by the document id
        that ``most_similar`` returns. Values are converted with ``str`` for
        display, so non-string versions (e.g. numbers) are accepted.
    query : str
        Natural-language search text.
    n : int, default 5
        Maximum number of results requested.

    Returns
    -------
    list
        The ``(doc_id, similarity)`` pairs exactly as returned by
        ``model.dv.most_similar``.
    """
    inferred_vec = model.infer_vector(clean(query))
    results = model.dv.most_similar([inferred_vec], topn=n)

    if not results:
        # max() over an empty sequence would raise; report and bail out instead.
        print(f'No results found for the query "{query}".')
        return results

    # One (rank label, name, version, percent) row per hit. The percentage is
    # truncated numerically; slicing the float's string form breaks for
    # values below 10, negative values, 100 and scientific notation.
    rows = [
        (f'{rank}.', str(names[doc_id]), str(versions[doc_id]), int(score * 100))
        for rank, (doc_id, score) in enumerate(results, start=1)
    ]

    length_rank = max(len(row[0]) for row in rows)
    length_name = max(len(row[1]) for row in rows) + 2
    length_version = max(len(row[2]) for row in rows) + 2
    print(f'These are the top {len(rows)} results of the query "{query}":\n')

    for label, name, version, percent in rows:
        print(f'\t{label: <{length_rank}} {name: <{length_name}} v.{version: <{length_version}} [{percent}%]')

    return results

In [4]:
# Patterns used by clean(), compiled once instead of on every call.
_URL_PATTERN = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)')
_ESCAPE_PATTERN = re.compile(r'\\[\w]')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]+')
_WORD_PATTERN = re.compile(r'([a-z]+|[A-Z][a-z]+|[A-Z]+)')

# Lazily filled cache for the stemmer and the stopword set.
_CLEAN_RESOURCES = {}


def _load_clean_resources():
    """Return the shared ``(stemmer, stopword_set)`` pair, building it on first use."""
    if not _CLEAN_RESOURCES:
        _CLEAN_RESOURCES['stemmer'] = PorterStemmer()
        _CLEAN_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
    return _CLEAN_RESOURCES['stemmer'], _CLEAN_RESOURCES['stopwords']


def clean(raw):
    """Turn raw text into a list of lower-cased, stemmed, non-stopword tokens.

    Steps, in order:
      1. remove http(s) URLs;
      2. replace a backslash followed by a word character (e.g. a literal
         ``\\n`` left over from an escaped string) with a space;
      3. replace runs of punctuation with a space;
      4. extract ASCII letter runs, splitting camelCase/PascalCase words
         (``NetworkManager`` -> ``Network``, ``Manager``); digits,
         underscores and non-ASCII letters are dropped here;
      5. lower-case, drop English stopwords, apply the Porter stemmer.

    Parameters
    ----------
    raw : str or None
        Text to tokenise. ``None`` or an empty string yields an empty list.

    Returns
    -------
    list of str
        The cleaned tokens in their original order.
    """
    if not raw:
        return []

    # The stemmer and stopword set are identical for every call; building
    # them once matters because this runs for every document in the corpus.
    stemmer, stopword_set = _load_clean_resources()

    text = _URL_PATTERN.sub('', raw)
    text = _ESCAPE_PATTERN.sub(' ', text)
    text = _PUNCTUATION_PATTERN.sub(' ', text)

    words = (word.lower() for word in _WORD_PATTERN.findall(text))
    return [stemmer.stem(word) for word in words if word not in stopword_set]

In [36]:
# Keys whose string values hold the natural-language text of a spec.
NL_TEXT_KEYS = frozenset({'description', 'name', 'title', 'summary'})


def _iter_nl_text(node, keys=NL_TEXT_KEYS):
    """Yield, in document order, every string stored under one of ``keys`` anywhere in ``node``.

    Walking the parsed JSON (instead of running a regex over ``str(contents)``)
    keeps texts that contain quotes or apostrophes intact and avoids feeding
    repr escape sequences into the tokeniser.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            if key in keys and isinstance(value, str):
                yield value
            else:
                yield from _iter_nl_text(value, keys)
    elif isinstance(node, list):
        for item in node:
            yield from _iter_nl_text(item, keys)


# Parallel lists: index i describes the same spec in all three.
specs = []
specs_names = []
specs_versions = []
# Names of archive members that could not be used (invalid JSON or no title/version).
specs_skipped = []

# The context manager closes the archive even if loading is interrupted.
with tarfile.open('../data/data.tar.gz', 'r:gz') as tar:
    for spec in tqdm(tar.getmembers(), file=sys.stdout):
        file = tar.extractfile(spec)

        # extractfile() returns None for members that are not regular files or links.
        if file is None:
            continue

        try:
            with file:
                contents = json.load(file)
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError subclasses).
            specs_skipped.append(spec.name)
            continue

        info = contents.get('info') if isinstance(contents, dict) else None
        if not isinstance(info, dict) or 'title' not in info or 'version' not in info:
            # Without a title and version the spec cannot be shown in results;
            # skip it rather than abort the whole load.
            specs_skipped.append(spec.name)
            continue

        # Append to all three lists together so their indices stay aligned.
        specs_names.append(info['title'])
        specs_versions.append(info['version'])
        specs.append(clean(' '.join(_iter_nl_text(contents))))

if specs_skipped:
    print(f'Skipped {len(specs_skipped)} archive member(s) without usable JSON or title/version.')

100%|██████████| 3996/3996 [02:27<00:00, 27.17it/s]


In [None]:
# retrain=True skips any model saved on disk and trains on the `specs` built
# above. train_doc2vec tags each document with its index in `specs`, which is
# what the lookups into specs_names / specs_versions rely on.
doc2vec_model = train_doc2vec(specs, retrain=True)

In [94]:
# Example queries to try:
#   - country data standard api
#   - client to manage network / network manager
#   - football match predictor
#   - policy administration

# The trailing semicolon suppresses the notebook echo of the returned result list.
get_top_n_docs(doc2vec_model, specs_names, specs_versions, "network manager", n=5);

These are the top 5 results of the query "network manager":

	1. NetworkManagementClient   v.2017-11-01   [69%]
	2. NetworkManagementClient   v.2017-09-01   [69%]
	3. Transform                 v.1.0.0        [68%]
	4. NetworkManagementClient   v.2018-01-01   [67%]
	5. NetworkManagementClient   v.2017-10-01   [67%]
