In [1]:
import requests
import json
import os
from networkx.readwrite import json_graph


In [2]:
def load_vocab_file(vocab_file):
    """Read a vocabulary file and return its words as a list.

    The file is expected to hold one word per line. Surrounding whitespace
    is stripped and blank lines (including the empty entry a trailing
    newline would otherwise produce) are skipped, so callers never receive
    an empty "word".

    Args:
        vocab_file: Path of the vocabulary file to read.

    Returns:
        A list of the non-empty words, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(vocab_file) as vocab:
        stripped_lines = (line.strip() for line in vocab)
        return [word for word in stripped_lines if word]


In [3]:
def get_4lang_for_word(word, timeout=30):
    """Fetch the 4lang definition of ``word`` from the hlt.bme.hu web service.

    Sends a POST request whose body is the JSON object ``{"word": word}``
    and returns the decoded JSON response.

    Args:
        word: The word to look up.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits forever, which would hang the whole
            notebook on an unresponsive server.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.exceptions.RequestException: On connection problems or
            when the timeout expires.
        ValueError: If the response body is not valid JSON.
    """
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    response = requests.post(
        "http://hlt.bme.hu/4lang/definition",
        data=json.dumps({'word': word}),
        headers=headers,
        timeout=timeout,
    )
    return response.json()


In [18]:
def get_parents(word, parent_dict):
    """Record up to three tiers of ancestors of ``word`` in ``parent_dict``.

    The definition graph of ``word`` is fetched with ``get_4lang_for_word``
    and scanned for edges whose ``color`` attribute is 0 and whose source
    node name (the part before the first ``_``, compared case-insensitively)
    equals ``word``. The target of each such edge, again cut at the first
    ``_``, is treated as a parent.

    ``parent_dict[word]`` ends up with three lists:
      * ``"first"``  - the parents found in the graph,
      * ``"second"`` - the ``"first"`` lists of those parents,
      * ``"third"``  - the ``"second"`` lists of those parents.

    Parents that have no entry in ``parent_dict`` yet are resolved
    recursively. Duplicates are removed through a ``set``, so the order of
    each list carries no meaning.

    Args:
        word: The word whose ancestors are collected.
        parent_dict: Mapping that is filled in place; it also serves as the
            cache that stops a word from being fetched twice.
    """
    definition = get_4lang_for_word(word)
    # Register the entry before recursing so that a cycle in the hierarchy
    # finds it and does not recurse forever.
    entry = {"first": [], "second": [], "third": []}
    parent_dict[word] = entry
    definition_graph = json_graph.adjacency.adjacency_graph(definition["word"])
    for source, target, attrs in definition_graph.edges(data=True):
        if source.split('_')[0].lower() != word.lower():
            continue
        if attrs['color'] != 0:
            continue
        parent = target.split('_')[0]
        entry["first"].append(parent)
        if parent not in parent_dict:
            get_parents(parent, parent_dict)
        ancestors_of_parent = parent_dict[parent]
        entry["second"].extend(ancestors_of_parent['first'])
        entry["third"].extend(ancestors_of_parent['second'])
    for tier in ("first", "second", "third"):
        entry[tier] = list(set(entry[tier]))


In [5]:
def export_parent_json(parent_json_file, parent_dict):
    """Write ``parent_dict`` to ``parent_json_file`` as a JSON document.

    Args:
        parent_json_file: Path of the file to create or overwrite.
        parent_dict: JSON-serializable mapping to store.
    """
    with open(parent_json_file, mode='w') as sink:
        serialized = json.dumps(parent_dict)
        sink.write(serialized)


In [48]:
vocab_file = "./data/vocab"
# Fail early when the vocabulary is missing instead of nesting the whole
# pipeline inside the existence check.
if not os.path.exists(vocab_file):
    raise FileNotFoundError('Check the path')

vocab = load_vocab_file(vocab_file)
parents = {}

# get_parents also fills in entries for the ancestors it discovers, so a
# later vocabulary word may already be present and needs no second lookup.
for word in vocab:
    print(word)
    if word in parents:
        continue
    get_parents(word, parents)

export_parent_json("./data/parents.json", parents)


''

In [35]:
# Weight of each ancestor tier. find_ancestors multiplies the weights of the
# two tiers in which a common ancestor was found.
tier_dict = dict(first=1, second=10, third=100)


In [44]:
def find_ancestors(word1, word2, parend_dict, freq, tier_weights=None):
    """Score the common ancestors of ``word1`` and ``word2``.

    An ancestor found in tier ``t1`` of ``word1`` and tier ``t2`` of
    ``word2`` is scored ``tier_weights[t1] * tier_weights[t2] * freq[a]``.
    If one word is itself listed among the other's ancestors it is scored 0.

    When the same ancestor is reachable through several tier pairs, the
    lowest score is kept. Previously the last tier pair visited overwrote
    earlier ones, so a distant pair could replace a close one, and a direct
    ancestor's score of 0 could be overwritten as well.

    Args:
        word1: First word; must be a key of ``parend_dict``.
        word2: Second word; must be a key of ``parend_dict``.
        parend_dict: Mapping ``word -> {tier name -> list of ancestors}``.
            (The parameter name keeps its original spelling so existing
            keyword callers continue to work.)
        freq: Mapping ``ancestor -> occurrence count``, e.g. the result of
            ``frequency``.
        tier_weights: Optional mapping ``tier name -> weight``. Defaults to
            the module-level ``tier_dict``.

    Returns:
        A dict mapping every common ancestor to its lowest score.

    Raises:
        KeyError: If a word is missing from ``parend_dict``, or a shared
            ancestor or its tier has no entry in ``freq`` / ``tier_weights``.
    """
    if tier_weights is None:
        tier_weights = tier_dict
    word1_parents = parend_dict[word1]
    word2_parents = parend_dict[word2]
    common_ancestors = {}

    def record(ancestor, score):
        # Keep only the best (lowest) score seen for each ancestor.
        if ancestor not in common_ancestors or score < common_ancestors[ancestor]:
            common_ancestors[ancestor] = score

    # One word being an ancestor of the other is the closest relation.
    if any(word2 in members for members in word1_parents.values()):
        record(word2, 0)
    if any(word1 in members for members in word2_parents.values()):
        record(word1, 0)

    for tier1, members1 in word1_parents.items():
        unique_members1 = set(members1)
        for tier2, members2 in word2_parents.items():
            for ancestor in unique_members1.intersection(members2):
                record(
                    ancestor,
                    tier_weights[tier1] * tier_weights[tier2] * freq[ancestor],
                )
    return common_ancestors


In [37]:
def frequency(parent_dict):
    """Count how often each ancestor occurs anywhere in ``parent_dict``.

    Every appearance of a name in any tier list of any entry counts once.

    Args:
        parent_dict: Mapping ``word -> {tier name -> list of ancestors}``.

    Returns:
        A dict mapping each ancestor name to its number of occurrences.
    """
    counts = {}
    for tiers in parent_dict.values():
        for tier_members in tiers.values():
            for name in tier_members:
                counts[name] = counts.get(name, 0) + 1
    return counts


In [45]:
# Reload the exported hierarchy. The context manager closes the file handle,
# which the previous open(...).read() form never did.
with open("./data/parents.json") as parents_file:
    parents = json.load(parents_file)
frequency_dict = frequency(parents)
ancestor_dict = find_ancestors("cat", "dog", parents, frequency_dict)
print(ancestor_dict)
# The lowest-scoring common ancestor is the cell's displayed result.
min(ancestor_dict, key=ancestor_dict.get)


{'AT': 377800, 'animal': 26400, 'HAS': 8221000, '=PAT': 3253000, 'move': 6520000, 'live': 3260000, 'singular': 39640000, 'third': 35530000}


'animal'