In [1]:
import pickle
#import sklearn
#from sklearn import metrics
#from sklearn import linear_model
#import random
#import numpy as np
#from wac import WAC
from tqdm.notebook import tqdm
import pandas as pd
#from sklearn import neural_network
#import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from collections import defaultdict as dd
import nltk 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/crow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load Datasets

In [2]:
# Load the word -> embedding mapping (used below via wac2vec.keys() and wac2vec[w]).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` block closes the file handle as soon as the object is loaded.
with open('ddata/clip.bertvocab.embeddings.513.pkl', 'rb') as embeddings_file:
    wac2vec = pickle.load(embeddings_file)
len(wac2vec)

30522

In [3]:
# Load the ratings table (used below as a DataFrame indexed by word with a
# RATING column -- see init_abstraction_tree).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` block closes the file handle as soon as the object is loaded.
with open('ddata/AC_ratings_google3m_koeper_SiW.pkl', 'rb') as ratings_file:
    concr_scores = pickle.load(ratings_file)
len(concr_scores)

2168990

## ConcreteAbstract Class

In [4]:
class ConcreteAbstract:
    """Placeholder class; no behaviour is implemented yet."""

    def __init__(self, word_vectors, concr_scores, word_net):
        """Accept word vectors, ratings and a word net; all three are
        currently ignored and nothing is stored on the instance."""

### Build Abstraction Tree

In [49]:
import warnings

def init_abstraction_tree(min_rating=8):
    """Build the initial, leaves-only abstraction tree.

    Candidate words are those found in both the ``wac2vec`` vocabulary and
    WordNet whose ``RATING`` in ``concr_scores`` is at least ``min_rating``.
    Each word is mapped to its first WordNet synset. Synsets that are a
    hypernym ancestor of another candidate are dropped, and when several
    words map to the same synset only the first one is kept.

    Reads the notebook-level ``wac2vec`` and ``concr_scores`` objects.

    Parameters
    ----------
    min_rating : number, default 8
        Minimum ``concr_scores.RATING`` a word needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by synset, with columns SYNSET, WORD, DIST2LEAF (0),
        NUM_LEAVES (1), HYPERNYM (empty list), HYPONYMS (empty list) and
        EMBEDDING.
    """
    wn_wac_words = set(wn.words()) & set(wac2vec.keys())

    concr_scores_subset = concr_scores[concr_scores.RATING >= min_rating]
    rated_words = concr_scores_subset.index
    # Iterate in sorted order: set iteration order can change between runs,
    # which would change the row order and which word survives the
    # duplicate-synset drop below.
    leaf_words = [w for w in tqdm(sorted(wn_wac_words)) if w in rated_words]

    # Get leaf synsets (first sense of every word).
    leaf_synsets = [wn.synsets(w)[0] for w in leaf_words]
    num_rows = len(leaf_synsets)

    # Initiate abstraction tree.
    abstraction_tree = pd.DataFrame({
        "SYNSET": leaf_synsets,
        "WORD": leaf_words,
        "DIST2LEAF": [0] * num_rows,
        "NUM_LEAVES": [1] * num_rows,
        # One fresh list per row. `[[]] * n` would put the *same* list object
        # in every cell, so appending to one row would change all of them.
        "HYPERNYM": [[] for _ in range(num_rows)],
        "HYPONYMS": [[] for _ in range(num_rows)],
        "EMBEDDING": [wac2vec[w] for w in leaf_words],
    })
    abstraction_tree.index = abstraction_tree.SYNSET

    # Collect every hypernym ancestor of every candidate synset.
    ancestors = set()
    with warnings.catch_warnings():
        # Silence warnings raised while walking the hypernym closures.
        warnings.simplefilter("ignore")
        for synset in leaf_synsets:
            ancestors.update(synset.closure(lambda x: x.hypernyms()))

    # Remove candidates that are ancestors of other candidates.
    ancestor_leaves = set(leaf_synsets) & ancestors
    abstraction_tree = abstraction_tree.drop(list(ancestor_leaves))

    # Remove rows that share a synset (the first word is kept).
    abstraction_tree = abstraction_tree.drop_duplicates(subset='SYNSET')

    return abstraction_tree

In [9]:
# Grow abstraction tree

def grow_abstraction_tree(abstraction_tree):
    """Takes an initial abstraction tree (containing only leaves) and grows
    the rest of the tree, in place.

    For every synset only its first hypernym is followed. Missing hypernyms
    are added as new rows (WORD and EMBEDDING set to None), HYPERNYM of the
    child is set to ``[hypernym]`` and the child is added to the hypernym's
    HYPONYMS list. Afterwards DIST2LEAF (longest distance down to a leaf) and
    NUM_LEAVES (sum of the children's NUM_LEAVES) are propagated upwards.

    The frame must be indexed by synset with unique labels and have the
    columns SYNSET, WORD, DIST2LEAF, NUM_LEAVES, HYPERNYM, HYPONYMS,
    EMBEDDING in that order. Returns None.

    Intended to be called once on a fresh leaves-only tree: running it again
    on an already grown tree adds every NUM_LEAVES to its parent a second
    time and appends duplicate HYPONYMS entries.
    """
    synset_list = list(abstraction_tree['SYNSET'])
    known = set(abstraction_tree.index)
    parent_of = {}

    # Pass 1: discover ancestors and record the links. `synset_list` grows
    # while it is being iterated so newly added hypernyms get visited too.
    for s in tqdm(synset_list):
        hypernyms = s.hypernyms()

        if len(hypernyms) == 0:
            continue

        h = hypernyms[0]

        if h not in known:
            known.add(h)
            synset_list.append(h)
            abstraction_tree.loc[h] = [
                h,    # SYNSET
                None, # WORD
                0,    # DIST2LEAF
                0,    # NUM_LEAVES
                [],   # HYPERNYM
                [],   # HYPONYMS
                None  # EMBEDDING
            ]

        parent_of[s] = h

        # `.at` stores the list object itself in the cell. With `.loc`, a
        # one-element list assigned to a single cell may be unpacked and
        # stored as the bare element, depending on the pandas version.
        abstraction_tree.at[s, 'HYPERNYM'] = [h]

        # Assign a new list instead of appending in place, so rows that
        # happen to share one list object are not all modified at once.
        hyponyms = list(abstraction_tree.at[h, 'HYPONYMS'])
        hyponyms.append(s)
        abstraction_tree.at[h, 'HYPONYMS'] = hyponyms

    # Pass 2: propagate DIST2LEAF / NUM_LEAVES strictly children-before-
    # parents. Doing this in discovery order lets a hypernym forward its
    # values before all of its own hyponyms have reported, which leaves the
    # ancestors with too-small values.
    dist = dict(zip(abstraction_tree.index, abstraction_tree['DIST2LEAF']))
    num_leaves = dict(zip(abstraction_tree.index, abstraction_tree['NUM_LEAVES']))

    pending = {}  # hypernym -> number of hyponyms not yet propagated
    for h in parent_of.values():
        pending[h] = pending.get(h, 0) + 1

    ready = [s for s in synset_list if pending.get(s, 0) == 0]
    for s in ready:  # grows as hypernyms become complete
        h = parent_of.get(s)
        if h is None:
            continue
        dist[h] = max(dist[h], dist[s] + 1)
        num_leaves[h] = num_leaves[h] + num_leaves[s]
        pending[h] -= 1
        if pending[h] == 0:
            ready.append(h)

    # Only hypernyms can have changed.
    for h in pending:
        abstraction_tree.at[h, 'DIST2LEAF'] = dist[h]
        abstraction_tree.at[h, 'NUM_LEAVES'] = num_leaves[h]

In [16]:
# Display abstraction tree
from nltk.tree import Tree

def build_display_tree(df, root_synset, trim_name=-1):
    """Build a displayable tree of lemma names rooted at root_synset.

    Each node is labelled with the name of the first lemma of its SYNSET,
    cut to at most ``trim_name`` characters. ``None`` or a negative value
    (the default) means no trimming.

    Returns an nltk Tree, or just the (trimmed) name string when the root
    has no HYPONYMS.
    Recurses through every HYPONYMS entry -- do not use on big trees!
    """
    row = df.loc[root_synset]
    root_name = row['SYNSET'].lemmas()[0].name()
    # A negative value used directly as a slice end would silently drop
    # characters from the end of the name (the default -1 cut the last one).
    if trim_name is not None and trim_name >= 0:
        root_name = root_name[:trim_name]

    if len(row['HYPONYMS']) == 0:
        return root_name

    children = [build_display_tree(df, h, trim_name) for h in row['HYPONYMS']]

    return Tree(root_name, children)

In [56]:
# Build the leaves-only tree, keeping words whose RATING is >= 9 (min_rating).
abstraction_tree = init_abstraction_tree(9)

  0%|          | 0/14510 [00:00<?, ?it/s]

In [57]:
# Adds the hypernym rows to abstraction_tree in place; the call returns None.
grow_abstraction_tree(abstraction_tree)

  0%|          | 0/17 [00:00<?, ?it/s]

In [53]:
#build_display_tree(abstraction_tree, wn.synset('entity.n.01'), trim_name=5)