In [160]:
import pickle
#import sklearn
#from sklearn import metrics
#from sklearn import linear_model
from sklearn.model_selection import train_test_split
import random
import numpy as np
#from wac import WAC
from tqdm.notebook import tqdm
import pandas as pd
#from sklearn import neural_network
#import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from collections import defaultdict as dd
import nltk 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/crow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Initiate Datasets

In [2]:
wac2vec = pickle.load(open('ddata/clip.bertvocab.embeddings.513.pkl', 'rb'))
len(wac2vec)

30522

In [3]:
concr_scores = pickle.load(open('ddata/AC_ratings_google3m_koeper_SiW.pkl', 'rb'))
len(concr_scores)

2168990

## ConcreteAbstract Class

In [4]:
class ConcreteAbstract:
    def __init__(self, word_vectors, concr_scores, word_net):
        pass

### Build Abstraction Tree

In [5]:
import warnings

def init_abstraction_tree(min_rating=8):

    wac_words = list(wac2vec.keys())
    wn_words = set(i for i in wn.words())
    wn_wac_words = wn_words & set(wac_words)
    
    concr_scores_subset = concr_scores[concr_scores.RATING >= min_rating]
    leaf_words = [w for w in tqdm(wn_wac_words) if w in concr_scores_subset.index]

    # Get Leaf Synsets...
    leaf_synsets = [wn.synsets(w)[0] for w in leaf_words]

    # Initiate Abstraction Tree
    embeddings = [wac2vec[w] for w in leaf_words]
    data = {
        "SYNSET" : leaf_synsets,
        "WORD" : leaf_words,
        "DIST2LEAF": [0]*len(leaf_synsets),
        "NUM_LEAVES": [1]*len(leaf_synsets),
        "HYPERNYM": [[]]*len(leaf_synsets),
        "HYPONYMS": [[]]*len(leaf_synsets),
        "EMBEDDING" : embeddings
        
    }
    abstraction_tree = pd.DataFrame(data)
    abstraction_tree.index = abstraction_tree.SYNSET

    # Get True Leaf Synsets
    ancestors = set()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for s in leaf_synsets:
            #print(set(s.closure(lambda s: s.hypernyms())))
            ancestors = ancestors.union(set(s.closure(lambda s: s.hypernyms())))

    # Remove leaves that are ansestors of other leaves
    true_leaf_synsets = set(leaf_synsets) - ancestors
    ansestor_leaves = set(leaf_synsets) - true_leaf_synsets
    abstraction_tree.drop(ansestor_leaves, inplace=True)
    
    # Remove leaves that have the same Synset
    abstraction_tree.drop_duplicates(subset='SYNSET', inplace=True)
    
    return abstraction_tree

In [6]:
# Grow abstraction tree

def grow_abstraction_tree(abstraction_tree):
    """Takes an initial abstraction tree (containing only leaves) and grows
    the rest of the tree."""
    
    synset_list = list(abstraction_tree['SYNSET'])

    for s in tqdm(synset_list):
        h = s.hypernyms()

        if len(h) == 0:
            continue

        h = h[0]

        if h not in abstraction_tree.SYNSET:
            synset_list.append(h)
            abstraction_tree.loc[h] = [
                h,    # SYNSET
                None, # WORD
                0,    # DIST2LEAF
                0,    # NUM_LEAVES
                [],   # HYPERNYM
                [],   # HYPONYMS
                None  # EMBEDDING
            ]

        # Set DIST2LEAF
        s_dist = abstraction_tree.loc[s, 'DIST2LEAF']
        h_dist = abstraction_tree.loc[h, 'DIST2LEAF']
        #print(s)
        if s_dist >= h_dist:
            abstraction_tree.loc[h, 'DIST2LEAF'] = s_dist + 1

        # Set NUM_LEAVES
        s_num_leaves = abstraction_tree.loc[s, 'NUM_LEAVES']
        h_num_leaves = abstraction_tree.loc[h, 'NUM_LEAVES']
        abstraction_tree.loc[h, 'NUM_LEAVES'] = h_num_leaves + s_num_leaves

        # Add hypernym to synset
        abstraction_tree.loc[s, 'HYPERNYM'] = [h]

        # Add synset to hypernym
        abstraction_tree.loc[h, 'HYPONYMS'].append(s)

In [7]:
# Display abstraction tree
from nltk.tree import Tree

def build_display_tree(df, root_synset, char_limit=-1):
    """Display an abstraction tree starting with the root_synset.
    Returns an nltk Tree structure.
    Do not use on big trees!"""
    row = df.loc[root_synset]
    root_name = row['SYNSET'].lemmas()[0].name()[:char_limit]
    if len(row['HYPONYMS']) == 0:
        return root_name
    
    children = [build_display_tree(df, h, char_limit) for h in row['HYPONYMS']]
    
    return Tree(root_name, children)

In [8]:
abstraction_tree = init_abstraction_tree(9)

  0%|          | 0/14510 [00:00<?, ?it/s]

In [9]:
grow_abstraction_tree(abstraction_tree)

  0%|          | 0/17 [00:00<?, ?it/s]

In [10]:
#build_display_tree(abstraction_tree, wn.synset('entity.n.01'), char_limit=5)

In [11]:
# Get Positive Synsets

In [135]:
# Number of positive examples we want for each classifier
pos_count = 3
neg_count = 9

def is_leaf(synset):
    if synset not in abstraction_tree.index:
        return False
    return abstraction_tree.loc[synset, 'DIST2LEAF'] == 0

def classifier_capable(synset):
    return abstraction_tree.loc[synset, 'NUM_LEAVES'] >= pos_count

def embedding_capable(synset):
    """Return true if the synset is capable of having an embedding."""
    if synset not in abstraction_tree.index:
        return False
    return is_leaf(synset) or classifier_capable(synset)

def get_hyponyms(synset):
    """Return a list of hyponyms, or itself if there are none."""
    if synset not in abstraction_tree.index:
        return None
    hypos = abstraction_tree.loc[synset, 'HYPONYMS']
    if len(hypos) == 0:
        return [synset]
    else:
        return hypos

def count_embedding_capable(synset_list):
    """Given a list of synsets, returns a count of how many are capable of having an embedding."""
    return sum((embedding_capable(s) == True)*1 for s in synset_list)

def expand_hyponym_list(synset_list):
    hypos = []
    for s in synset_list:
        hypos += get_hyponyms(s)
    return hypos

def find_positive_examples(synset, depth=100):
    pos = get_hyponyms(synset)
    for _ in range(depth):
        pos = expand_hyponym_list(pos)
        if count_embedding_capable(pos) >= pos_count:
            return pos
    
    raise Exception("Reached depth of {} without finding enough positive example: {}".format(depth, synset))

In [136]:
find_positive_examples(wn.synset('entity.n.01'))

[Synset('cocoa.n.01'),
 Synset('pizza.n.01'),
 Synset('sausage.n.01'),
 Synset('solanaceous_vegetable.n.01'),
 Synset('rug.n.01'),
 Synset('shoe.n.01'),
 Synset('dressing.n.04'),
 Synset('watchband.n.01'),
 Synset('clothing.n.01'),
 Synset('chordate.n.01'),
 Synset('invertebrate.n.01'),
 Synset('vascular_plant.n.01'),
 Synset('flap.n.04')]

In [137]:
# Find negative examples...

In [138]:
def find_negative_examples(synset, pos_examples):
    # All synsets
    neg = np.array(abstraction_tree['SYNSET'])
    # Embedding capable synsets
    neg = neg[list(map(embedding_capable, neg))]
    # neg examples not in positive examples
    neg = set(neg) - set(pos_examples)
    neg_examples = random.sample(list(neg), k=min(neg_count, len(neg)))
    return neg_examples

In [124]:
def add_positive_negative_examples(synset):
    pos = find_positive_examples(synset)
    abstraction_tree.at[synset, 'POSITIVE'] = pos
    neg = find_negative_examples(synset, pos)
    abstraction_tree.at[synset, 'NEGATIVE'] = neg

In [139]:
def get_classifier_capable():
    """Get a list of synsets capable of having a classifier."""
    return [s for s in abstraction_tree['SYNSET'] if classifier_capable(s)]

In [141]:
def add_pos_neg_all():
    """Add positive and negative examples for each calssifier capable synset."""
    abstraction_tree['POSITIVE'] = [[]]*len(abstraction_tree)
    abstraction_tree['NEGATIVE'] = [[]]*len(abstraction_tree)
    for s in get_classifier_capable():
        add_positive_negative_examples(s)

In [142]:
add_pos_neg_all()

In [145]:
# Build Train/Test datasets

In [219]:
def build_train_test(synset):
    pos_examples = find_positive_examples(synset)
    neg_examples = find_negative_examples(synset, pos_examples)
    X = pos_examples + neg_examples
    y = list(np.ones(len(pos_examples))) + list(np.zeros(len(neg_examples)))
    return train_test_split(X, y, test_size=0.33, stratify=y)

In [225]:
def fill_out_train_test():
    abstraction_tree['X_TRAIN'] = [[]]*len(abstraction_tree)
    abstraction_tree['X_TEST']  = [[]]*len(abstraction_tree)
    abstraction_tree['Y_TRAIN'] = [[]]*len(abstraction_tree)
    abstraction_tree['Y_TEST']  = [[]]*len(abstraction_tree)
    
    synsets = get_classifier_capable()
    for s in synsets:
        X_train, X_test, y_train, y_test  = build_train_test(s)
        abstraction_tree.at[s, 'X_TRAIN'] = X_train
        abstraction_tree.at[s, 'X_TEST']  = X_test
        abstraction_tree.at[s, 'Y_TRAIN'] = y_train
        abstraction_tree.at[s, 'Y_TEST']  = y_test        

In [226]:
fill_out_train_test()

In [218]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)
sum(y_train)

8.0

In [195]:
sum(y_train)

8.0

In [178]:
list(zip(y_train, X_train))

[(1.0, Synset('pizza.n.01')),
 (1.0, Synset('clothing.n.01')),
 (1.0, Synset('rug.n.01')),
 (0.0, Synset('bandage.n.01')),
 (0.0, Synset('crocodile.n.01')),
 (1.0, Synset('shoe.n.01')),
 (0.0, Synset('organism.n.01')),
 (0.0, Synset('object.n.01')),
 (0.0, Synset('gorilla.n.01')),
 (0.0, Synset('vertebrate.n.01')),
 (1.0, Synset('dressing.n.04')),
 (1.0, Synset('chordate.n.01')),
 (1.0, Synset('flap.n.04')),
 (1.0, Synset('sausage.n.01'))]