In [43]:
import os
import copy
import pickle
import numpy as np
import pandas as pd

import multiprocessing as mp
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
# from utils import get_word_neighborhood, filter_neighborhood

In [7]:
os.path.exists('/data/zeste_cache/neighborhoods/space.pickle')

False

In [9]:
args_cach_path = '/data/zeste_cache/neighborhoods/'
args_numberbatch_pickle = '/home/semantic/harrando/zeste/numberbatch-en-19.08-en.pickle'

In [10]:
numberbatch = pickle.load(open(args_numberbatch_pickle, 'rb'))

In [78]:
nb_sim = numberbatch.similarity

In [87]:
def get_label_neighborhood(label, depth=2, cache_path=args_cach_path):
    # In case the requested label does not appear in the cache
    similarities = ['simple', 'compound', 'depth', 'harmonized']
    pickle_path = os.path.join(args_cach_path, label+'.pickle')
    
    if depth == 0 or not os.path.exists(pickle_path) or label not in numberbatch:
        return {}
    
    # Get immediate label neighborhood
    neighborhood = pickle.load(open(pickle_path, 'rb'))
    for node in neighborhood:
        # we add the possiblity of defining multiple similarity methods for nodes that are not directly connected
        # to the Label node
        neighborhood[node]['rels'] = [tuple(neighborhood[node]['rels'])]
        neighborhood[node]['sim'] = {sim:neighborhood[node]['sim'] for sim in similarities}
    
    # Connect to n-hops labels
    hops = 1
    to_visit_next = list(neighborhood.keys())
    while hops < depth:
        next_hop = []
        while len(to_visit_next) > 0:
            current_node = to_visit_next.pop()
            cnn = get_label_neighborhood(current_node, depth=1)
            for word in cnn:
                if word not in neighborhood:
                    neighborhood[word] = {'from':[], 'rels': [], 'sim':{}}
                    sim_dict = {sim: 0.0 for sim in similarities}
                else:
                    sim_dict = neighborhood[word]['sim']
                    
                neighborhood[word]['from'].append(current_node)
                neighborhood[word]['rels'].append(tuple(cnn[word]['rels']))
                if word in numberbatch:
                    sim_dict['simple'] = max(sim_dict['simple'], nb_sim(label, word))
                    sim_dict['depth']  = max(sim_dict['depth'], sim_dict['simple'] * 1 / (hops + 1))
                    if current_node in numberbatch:
                        sim_dict['compound'] = max(sim_dict['compound'], nb_sim(label, word))
                        sim_dict['harmonized'] = max(sim_dict['harmonized'], 2 * nb_sim(label, word)/ (nb_sim(label, current_node) + nb_sim(current_node, word)))
                    else:
                        sim_dict['compound'] = max(sim_dict['compound'], sim_dict['simple'])
                        sim_dict['harmonized'] = max(sim_dict['harmonized'], sim_dict['simple'])
            
                # print('From label' + label + '- current node:' + current_node + ', word:' + word)
                neighborhood[word]['sim'] = sim_dict
                next_hop.append(word)
        
        hops += 1
        to_visit_next = next_hop
        
    return neighborhood

In [89]:
r = get_label_neighborhood('star', depth=2)

In [97]:
def filter_neighborhoood(neighborhood_original, 
                         allowed_rels=['synonym', 'antonym'], 
                         sim='simple',
                         keep=100, 
                         thresh=None):
    neighborhood = copy.deepcopy(neighborhood_original)
    if allowed_rels != 'all':
        nodes = list(neighborhood.keys())
        for node in nodes:
            if not any(rel in rels for rel in allowed_rels for rels in neighborhood[node]['rels']):
                del neighborhood[node]
                continue
    
    if thresh == None:
        thresh = 1.
    
    if keep != 'all':
        all_scores = sorted([neighborhood[node]['sim'][sim] for node in neighborhood], reverse=True)
        print(all_scores[:110])
        kth_score = all_scores[min(len(all_scores)-1,keep)]
        print(kth_score)
        nodes = list(neighborhood.keys())
        for node in nodes:
            node_sim = neighborhood[node]['sim'][sim]
            if node_sim <= min(kth_score, thresh):
                del neighborhood[node]
                continue
    
    return neighborhood

In [99]:
redux = filter_neighborhoood(r, allowed_rels='all')

[1.0, 0.9970628, 0.991065, 0.98923564, 0.98899806, 0.9883191, 0.98230624, 0.9809682, 0.97515106, 0.9722187, 0.9590082, 0.95865506, 0.95774704, 0.95721626, 0.9516396, 0.95159054, 0.94491774, 0.94110346, 0.93531334, 0.93204933, 0.9286635, 0.9243724, 0.9229554, 0.91900975, 0.9142464, 0.91132057, 0.90653706, 0.9019644, 0.8971922, 0.89572644, 0.8931824, 0.89306283, 0.892256, 0.8921951, 0.88574344, 0.8855916, 0.8805282, 0.8760388, 0.8749514, 0.873517, 0.8730864, 0.87230134, 0.86890024, 0.86877006, 0.8665735, 0.8663357, 0.8659971, 0.8620571, 0.8611104, 0.86092305, 0.86090595, 0.8606245, 0.8604253, 0.8602502, 0.8602078, 0.8594413, 0.85936457, 0.855585, 0.85508317, 0.85476285, 0.85391784, 0.8532454, 0.8485104, 0.846359, 0.8462197, 0.84520453, 0.84094036, 0.840897, 0.8403935, 0.83785295, 0.836389, 0.8333921, 0.83120793, 0.83119875, 0.8289728, 0.8280857, 0.82803875, 0.82613385, 0.8250884, 0.82412505, 0.82260066, 0.8224486, 0.8207069, 0.8197836, 0.8173046, 0.8166029, 0.81611204, 0.8113464, 0.80807

In [85]:
len(redux)

19

In [70]:
len(r)

54588

In [96]:
{l:r[l]['sim']['simple'] for l in r if r[l]['sim']['simple'] > 0.5}

{'star': 1.0,
 'supernova_impostor': 0.7191753,
 'acronycal': 0.5891463,
 'astrophotometer': 0.51240826,
 'constellation': 0.5203971,
 'stellify': 0.7416384,
 'nanostar': 0.8611104,
 'red_dwarf': 0.8805282,
 'markab': 0.6154794,
 'binary_star': 0.83119875,
 'betelgeuse': 0.5314037,
 'subdwarf': 0.90653706,
 'starred': 0.75926024,
 'goldilocks_planet': 0.54290575,
 'stinkbush': 0.50916654,
 'birth_chart': 0.5112919,
 'merope': 0.5009295,
 'stellar_nursery': 0.72193485,
 'starfilled': 0.8931824,
 'alderamin': 0.8173046,
 'hyades': 0.50599515,
 'alpha_centauri_c': 0.91900975,
 'menkar': 0.82613385,
 'eltanin': 0.7092062,
 'quasistar': 0.82412505,
 'astrometrized': 0.5366986,
 'ultra_diffuse': 0.71753544,
 'lodestar': 0.51573724,
 'rogue_planet': 0.6971651,
 'telespectroscope': 0.5087934,
 "orion's_belt": 0.6734473,
 'helium_star': 0.8663357,
 'winter_triangle': 0.6756316,
 'startracker': 0.5017224,
 'intrinsic_brightness': 0.6921197,
 'alphard': 0.57461435,
 'mirfak': 0.82803875,
 'astrog

In [92]:
{l:redux[l]['sim'] for l in redux}

{'star': {'simple': 1.0,
  'compound': 1.0,
  'depth': 1.0,
  'harmonized': 6112.741689760856},
 'ancient': {'simple': 0.089166224,
  'compound': 0.089166224,
  'depth': 0.089166224,
  'harmonized': 4.5690916068909635},
 'celebrity': {'simple': 0.46681574,
  'compound': 0.46681574,
  'depth': 0.46681574,
  'harmonized': 4.039078170135874},
 'étoile': {'simple': 0.481418,
  'compound': 0.481418,
  'depth': 0.481418,
  'harmonized': 197.9684763143422},
 'lead': {'simple': 0.102748185,
  'compound': 0.102748185,
  'depth': 0.102748185,
  'harmonized': 22.425182381788623},
 'remain': {'simple': 0.04751907,
  'compound': 0.04751907,
  'depth': 0.04751907,
  'harmonized': 0.3322847204081507},
 'stay': {'simple': 0.083641715,
  'compound': 0.083641715,
  'depth': 0.083641715,
  'harmonized': 5.645859541390253},
 'ace': {'simple': 0.2783021,
  'compound': 0.2783021,
  'depth': 0.2783021,
  'harmonized': 3.249140353479099},
 'asterisk': {'simple': 0.40950486,
  'compound': 0.40950486,
  'depth'

In [45]:
r['space']

{'rels': [('atlocation', 'relatedto'),
  (('relatedto',),),
  (('relatedto',),),
  (('relatedto',),),
  (('relatedto', 'antonym'),),
  (('incontextof',),),
  (('atlocation',),),
  (('relatedto',),),
  (('isa',),),
  (('relatedto',),),
  (('relatedto',),),
  (('relatedto',),),
  (('relatedto',),),
  (('atlocation',),),
  (('isa', 'relatedto'),),
  (('atlocation', 'relatedto'),),
  (('relatedto',),),
  (('locatedat',),),
  (('atlocation', 'relatedto'),),
  (('relatedto',),),
  (('relatedto',),),
  (('relatedto', 'atlocation'),),
  (('relatedto',),),
  (('atlocation',),),
  (('relatedto', 'synonym', 'etymologicallyrelatedto', 'isa'),),
  (('relatedto',),),
  (('sameas',
    'locatedat',
    'etymologicallyderivedfrom',
    'atlocation',
    'etymologicallyderiving',
    'etymologicallyrelatedto',
    'synonym'),),
  (('relatedto',),),
  (('relatedto',),),
  (('relatedto',),),
  (('atlocation',),),
  (('relatedto',),),
  (('relatedto', 'atlocation'),),
  (('atlocation',),),
  (('relatedto'

In [38]:
for w in r:
    assert(len(r[w]['from']) == len(r[w]['rels']))

In [None]:
class LabelNeighborhood:
    def __init__(self, label, depth=0):
        self.label = label
        self.depth = depth
    
    def _generate_neighborhood(self):
        ns = [get_word_neighborhood(word, **self.params) for word in self.doc]
        
        neighborhood = ns[0].copy()
        for w, nn in zip(self.doc[1:], ns[1:]):
            for ww in nn:
                if ww in neighborhood:
                    neighborhood[ww]['from'].append(w)
                    neighborhood[ww]['rels'].extend(['<>'] + nn[ww]['rels'])
                    neighborhood[ww]['sim'] = {s: max(neighborhood[ww]['sim'][s], nn[ww]['sim'][s]) for s in neighborhood[ww]['sim']}
                else:
                    neighborhood[ww] = {}
                    neighborhood[ww]['from'] = [w]
                    neighborhood[ww]['rels'] = nn[ww]['rels']
                    neighborhood[ww]['sim']  = nn[ww]['sim']
        
        self.untrimmed_cgr = copy.deepcopy(neighborhood)
        
        nodes = list(neighborhood.keys())
        for node in nodes:
            if max(neighborhood[node]['sim'].values()) <= self.thresh:
                del neighborhood[node]

        if type(self.keep) == int:
            all_scores = [max(neighborhood[node]['sim'].values()) for node in neighborhood]
            all_nodes = list(neighborhood.keys())
            if self.keep < len(all_scores):
                lowest_score = sorted(all_scores, reverse=True)[self.keep]
                for node in all_nodes:
                    if max(neighborhood[node]['sim'].values()) <= lowest_score:
                        del neighborhood[node]

        self.cgr = neighborhood
        
        self._max_similarity = {}
        for s in ['simple', 'compound', 'depth', 'harmonized']:
            self._max_similarity[s] = sum([neighborhood[w]['sim'][s]**2 for w in neighborhood])
