In [1]:
import argparse
import pandas as pd
import pickle
import json
import os
from collections import defaultdict
from sklearn.model_selection import train_test_split

def read_concepts(file):
    """Return the lines of a text file with surrounding whitespace stripped.

    Args:
        file: path of the text file to read.

    Returns:
        A list with one string per line, in file order. Blank lines are kept
        as empty strings.
    """
    with open(file, 'r') as handle:
        return list(map(str.strip, handle))


def word_feature_dic(evaluation_dataset_dirpath):
    """Build a concept -> labels mapping for one evaluation dataset.

    The dataset type is chosen by substring match on the directory path,
    tested in this order: 'CSLB', 'WordNet', 'BabelDomains', 'McRae'.

    Args:
        evaluation_dataset_dirpath: path of the dataset directory. It must
            contain one of the substrings above and the matching data file
            ('feature_matrix.dat', 'WNdb-3.0/word_label.pkl',
            'domain_vectors.txt' or 'McRae_new.csv').

    Returns:
        For CSLB, BabelDomains and McRae, a defaultdict(list) mapping each
        concept to the list of its feature / domain names. For WordNet, the
        object stored in the pickle file, returned unchanged.

    Raises:
        ValueError: if the path matches none of the supported datasets.
    """
    if 'CSLB' in evaluation_dataset_dirpath:
        D_concept_features = defaultdict(list)
        with open(os.path.join(evaluation_dataset_dirpath,'feature_matrix.dat'),'r') as inf:
            # Header row: the first cell is skipped, the rest name the columns.
            all_features = inf.readline().strip().split('\t')[1:]
            # Column index -> feature name. An index past the header falls
            # back to the defaultdict's 0, as in the original lookup table.
            D_index_feature = defaultdict(int, enumerate(all_features))
            for line in inf:
                cells = line.strip().split('\t')
                concept = cells[0].replace('_',' ')
                for n,v in enumerate(cells[1:]):
                    # Any cell whose text is not exactly '0.0' counts as "has feature".
                    if v != str(0.0):
                        D_concept_features[concept].append(D_index_feature[n])

    elif 'WordNet' in evaluation_dataset_dirpath:
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at a word_label.pkl from a trusted source.
        with open(os.path.join(evaluation_dataset_dirpath,'WNdb-3.0/word_label.pkl'),'rb') as f:
            D_concept_features = pickle.load(f)

    elif 'BabelDomains' in evaluation_dataset_dirpath:
        D_concept_features = defaultdict(list)
        with open(os.path.join(evaluation_dataset_dirpath,'domain_vectors.txt'),'r') as inf:
            for line in inf:
                cols = line.strip().split('\t')
                domain = cols[0]
                # Each remaining cell is split on spaces; only the first token
                # (the concept) is kept.
                for cell in cols[1:]:
                    D_concept_features[cell.split(' ')[0]].append(domain)

    elif 'McRae' in evaluation_dataset_dirpath:
        D_concept_features = defaultdict(list)
        table = pd.read_csv(os.path.join(evaluation_dataset_dirpath,'McRae_new.csv'))
        # First column holds the concept; the other column names are the features.
        features = list(table.columns)[1:]
        for row in table.values.tolist():
            concept = row[0]
            for inx,value in enumerate(row[1:]):
                if value!=0:
                    D_concept_features[concept].append(features[inx])

    else:
        # Previously this fell through to the return and raised UnboundLocalError.
        raise ValueError(
            'Unsupported evaluation dataset path (expected CSLB, WordNet, '
            f'BabelDomains or McRae in the path): {evaluation_dataset_dirpath!r}'
        )

    return D_concept_features

In [2]:
def selected_word_featrue(D,bar,*embeddings):
    """Invert a word -> features mapping and keep the well-populated features.

    Args:
        D: mapping from word to an iterable of its features.
        bar: minimum number of words a feature needs to be kept (passed
            through int()).
        *embeddings: containers supporting `in`; a word is used only if it is
            present in every one of them.

    Returns:
        A tuple (features, words): `features` is a defaultdict(list) mapping
        each kept feature to its words, and `words` is the set of all words
        appearing under any kept feature.
    """
    words_by_feature = defaultdict(list)
    for concept in D.keys():
        # Skip concepts missing from at least one embedding set.
        if any(concept not in vectors for vectors in embeddings):
            continue
        for label in D[concept]:
            words_by_feature[label].append(concept)

    print(f'Found {len(words_by_feature)} features shared by all vectors')

    kept_features = defaultdict(list)
    kept_words = set()
    for label, members in words_by_feature.items():
        if len(members) < int(bar):
            continue
        kept_features[label] = members
        kept_words.update(members)
    return kept_features, kept_words

In [3]:
def _load_json_vectors(path):
    """Load one JSON file and return the parsed object, closing the file afterwards."""
    with open(path) as f:
        return json.load(f)


# The four JSON files tested for word membership further down.
# NOTE(review): these are absolute paths on one machine; consider a configurable base directory.
w2v1 = _load_json_vectors('/home/yixiao/Desktop/repo/word_embedding/baseline/Skip_gram.json')
w2v2 = _load_json_vectors('/home/yixiao/Desktop/repo/word_embedding/baseline/CBOW.json')
wt_m = _load_json_vectors('/home/yixiao/Desktop/repo/nn_classifier/data/embeddings/topic_bert_mask_large.json')
g_m = _load_json_vectors('/home/yixiao/Desktop/repo/word_embedding/general_vectors/masked/mask_bert_large.json')

In [16]:
# Load the concept -> labels mapping for the evaluation dataset in use.
# Alternative dataset directories (pass one of these instead to switch):
#   /home/yixiao/Desktop/repo/evaluation_data/McRae_new
#   /home/yixiao/Desktop/repo/evaluation_data/CSLB_Property_Norms_V1
#   /home/yixiao/Desktop/repo/evaluation_data/WordNet-3.0
D_data = word_feature_dic(
    evaluation_dataset_dirpath='/home/yixiao/Desktop/repo/evaluation_data/BabelDomains_full'
)

In [17]:
# Keep only the features that have at least 100 words present in all four
# embedding sets, together with the set of words under those features.
D_fea_words, sel_w = selected_word_featrue(D_data, 100, w2v1, w2v2, wt_m, g_m)

Found 34 features shared by all vectors


In [18]:
# Write the selected words one per line. Sorting makes the file identical
# across runs; plain set iteration order for strings varies with hash
# randomization.
os.makedirs('BD', exist_ok=True)  # the output directory may not exist yet
with open('BD/all_words.txt','w') as outf:
    outf.writelines(word+'\n' for word in sorted(sel_w))