In [1]:
from gensim.models import KeyedVectors
import logging
from time import time
from os.path import exists


def try_print(w2v, test_word):
    """Print the nearest neighbours of ``test_word`` with their similarity scores.

    Args:
        w2v: object exposing ``most_similar(word)`` that yields ``(word, score)``
            pairs (e.g. the vectors returned by ``load_and_pickle``).
        test_word: the key to look up.

    A missing key is reported as a warning instead of raising. Only ``KeyError``
    is treated as "not found": any other failure (including Ctrl-C) propagates,
    so real errors are no longer misreported as a missing word.
    """
    try:
        for word, score in w2v.most_similar(test_word):
            print(word, score)
    except KeyError:
        print("Warning: word '{}' not found.".format(test_word))
        
    
def load_and_pickle(w2v_fpath, binary=False):
    """Load word vectors, reusing or creating a ``.pkl`` copy next to the source file.

    If ``w2v_fpath + ".pkl"`` already exists it is loaded directly. Otherwise the
    vectors are read in word2vec format from ``w2v_fpath``, ``init_sims(replace=True)``
    is applied, neighbours of two probe words are printed as a sanity check, and
    the result is saved to the ``.pkl`` path.

    Args:
        w2v_fpath: path of the word2vec-format file.
        binary: forwarded to ``KeyedVectors.load_word2vec_format``.

    Returns:
        A ``(vectors, pkl_path)`` tuple.
    """
    started_at = time()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    pickle_path = w2v_fpath + ".pkl"

    if not exists(pickle_path):
        # No cached copy yet: convert from the word2vec file and cache it.
        vectors = KeyedVectors.load_word2vec_format(
            w2v_fpath,
            binary=binary,
            unicode_errors='ignore',
        )
        vectors.init_sims(replace=True)
        for probe_word in ("for", "для"):
            try_print(vectors, probe_word)
        vectors.save(pickle_path)
    else:
        vectors = KeyedVectors.load(pickle_path)

    elapsed = time() - started_at
    print(elapsed, "sec.")

    return vectors, pickle_path

# Locations of the original word-vector files; judging by the names, one English
# ("en") and one Russian ("ru") model. Both names are read again by the
# sense-vector build loop at the end of the notebook.
# NOTE(review): absolute, user-specific paths -- adjust before running elsewhere.
w2v_en_original_fpath = "/home/panchenko/tmp/GoogleNews-vectors-negative300.txt"
w2v_ru_original_fpath = "/home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v"

# Disabled: loading the two models through load_and_pickle (which also writes
# the ".pkl" copies). Left commented out, so w2v_en / w2v_ru are not defined.
# w2v_en, w2v_en_fpath = load_and_pickle(w2v_en_original_fpath)
# w2v_ru, w2v_ru_fpath = load_and_pickle(w2v_ru_original_fpath, binary=True)


In [16]:
# Load the sense vectors (from the ".pkl" copy when it already exists, otherwise
# from the text-format file, since binary defaults to False).
s2v, s2v_pkl = load_and_pickle("/home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors")

# Sanity check: print the nearest neighbours of one sense key.
try_print(s2v, "hV49368#0")


2018-06-07 17:11:13,304 : INFO : loading EuclideanKeyedVectors object from /home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors.pkl
2018-06-07 17:11:13,490 : INFO : loading syn0 from /home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors.pkl.syn0.npy with mmap=None
2018-06-07 17:11:13,568 : INFO : setting ignored attribute syn0norm to None
2018-06-07 17:11:13,569 : INFO : loaded /home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv-1000-sum-score-20.sense_vectors.pkl
2018-06-07 17:11:13,576 : INFO : precomputing L2-norms of word weight vectors


0.2653944492340088 sec.
hV45689#0 0.9192017316818237
sV45983#0 0.9028427004814148
hV43990#0 0.8921349048614502
sV48807#0 0.6990944743156433
sV48540#0 0.6922570466995239
hV47962#0 0.6613180041313171
hV43059#0 0.6459588408470154
hV49073#0 0.6402440667152405
sV43611#0 0.6287655234336853
sV46623#0 0.6096590161323547


In [4]:
# Show the "cluster" field of the first entry stored under the ID "hV44013".
# NOTE(review): `dsv` is only created in a later cell, so this cell fails on a
# fresh kernel when the notebook is run top to bottom.
dsv.pcz.data["hV44013"][0]["cluster"]

Counter({'вредить#1': 0.501205,
         'говорить#1': 0.26525,
         'красить#2': 0.386612,
         'навредить#1': 0.501205,
         'опустить#1': 0.256847,
         'ругать#1': 0.253083})

In [1]:
import codecs
import operator
from multiprocessing import Pool
from vector_representations.dense_sense_vectors import DenseSenseVectors
from traceback import format_exc


def generate_binary_hypers(output_dir, max_synsets=1, hyper_synset_max_size=10, hc_max=0):
    """Write binary (hyponym word, hypernym word) relations derived from the global ``dsv``.

    For every ID in ``dsv.pcz.data`` that contains ``"h"``, the matching ``"s"``
    entry (same ID with the first character replaced by ``"s"``) supplies the
    hyponym senses. Two kinds of tab-separated rows are written:

    * ``from-original-labels``: hyponym words paired with the first ``hc_max``
      senses of the ``"h"`` entry's own cluster. The inner loop stops as soon as
      the index equals ``hc_max``, so ``hc_max=0`` writes none of these rows.
    * ``from-vector-linkage``: hyponym words paired with the words of the most
      similar ``"s"`` entries returned by ``dsv.sense_vectors.most_similar``.

    Pairs whose two words are identical are skipped. A human-readable trace is
    written to ``<output file>.log``.

    Args:
        output_dir: prefix of the output path; it is concatenated as-is with the
            file name, so it must end with a path separator.
        max_synsets: maximum number of similar ``"s"`` entries used per ``"h"`` ID.
        hyper_synset_max_size: similar entries with more senses than this are skipped.
        hc_max: number of senses of the ``"h"`` cluster used for the
            ``from-original-labels`` rows.

    Returns:
        ``(bin_count, output_fpath)`` where ``bin_count`` is the number of
        relation rows actually written to ``output_fpath``.
    """
    output_fpath = output_dir + "vector-link-s%d-hmx%d-hc%d.csv" % (
        max_synsets, hyper_synset_max_size, hc_max)
    log_fpath = output_fpath + ".log"
    bin_count = 0

    # Context managers guarantee both files are flushed and closed however the
    # loop ends (normal completion, Ctrl-C, or an unexpected error).
    with codecs.open(output_fpath, "w", "utf-8") as out, \
            codecs.open(log_fpath, "w", "utf-8") as log:
        for i, h_id in enumerate(dsv.pcz.data):
            try:
                if i % 10000 == 0: print(i)

                if "h" not in h_id:
                    continue

                hypo_h_senses = dsv.pcz.data[h_id][0]["cluster"]
                # Score-sorted view of the cluster; used for the log only.
                ranked_h_senses = sorted(
                    hypo_h_senses.items(), key=operator.itemgetter(1), reverse=True)

                s_id = "s" + h_id[1:]
                hypo_senses = dsv.pcz.data[s_id][0]["cluster"]
                log.write("\n{}\t{}\n".format(
                    h_id, ", ".join(hypo_h_senses)
                ))
                log.write("{}\n".format(
                    ", ".join(["{}:{}".format(k, v) for k, v in ranked_h_senses])
                ))
                log.write("{}\t{}\n".format(
                    s_id, ", ".join(hypo_senses)
                ))

                # save relations from the hierarchical context
                # NOTE(review): the first hc_max senses are taken in the cluster's
                # own iteration order, not in the score-sorted order logged above
                # -- confirm this is intended.
                for hypo_sense in hypo_senses:
                    hypo_word = hypo_sense.split("#")[0]
                    for hc_num, hyper_sense in enumerate(hypo_h_senses):
                        if hc_num == hc_max: break
                        hyper_word = hyper_sense.split("#")[0]
                        if hypo_word != hyper_word:
                            out.write("{}\t{}\tfrom-original-labels\n".format(hypo_word, hyper_word))
                            # Count only rows that were really written.
                            bin_count += 1

                # save binary relations from a synset
                s_synsets = 0
                for rh_id, s in dsv.sense_vectors.most_similar(h_id + "#0"):
                    if "s" not in rh_id:
                        continue

                    hyper_senses = dsv.pcz.data[rh_id.split("#")[0]][0]["cluster"]
                    if len(hyper_senses) > hyper_synset_max_size: continue

                    rh_str = ", ".join(hyper_senses)
                    log.write("\t{}:{:.3f} {}\n".format(rh_id, s, rh_str))

                    for hypo_sense in hypo_senses:
                        hypo_word = hypo_sense.split("#")[0]
                        for hyper_sense in hyper_senses:
                            hyper_word = hyper_sense.split("#")[0]
                            if hypo_word != hyper_word:
                                out.write("{}\t{}\tfrom-vector-linkage\n".format(hypo_word, hyper_word))
                                bin_count += 1
                    s_synsets += 1

                    if s_synsets >= max_synsets: break
            except KeyboardInterrupt:
                # Stop early but still report what was written so far.
                break
            except Exception:
                # Best effort: report the failing entry and carry on with the rest.
                print("Error", i, h_id)
                print(format_exc())

    print("# binary relations:", bin_count)
    print("binary relations:", output_fpath)
    print("log of binary relations:", log_fpath)

    return (bin_count, output_fpath)
    

# Output location for the generated relation files and the input inventory
# ("pcz") file used to build the sense vectors.
# NOTE(review): absolute, user-specific paths -- adjust before running elsewhere.
output_dir = "/home/panchenko/tmp/vector-link/konvens/ru/"
pcz_fpath="/home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv"

# Build `dsv` only when it is not already defined in the kernel, so re-running
# this cell does not repeat the construction.
reload = False
try: dsv
except NameError: reload = True

if reload:
    # generate_binary_hypers reads this object as a global.
    dsv = DenseSenseVectors(
        pcz_fpath=pcz_fpath,
        word_vectors_obj=None,
        save_pkl=True,
        sense_dim_num=1000,
        norm_type="sum",
        weight_type="score",
        max_cluster_words=20)
 
# for max_top_synsets in range(1,10):
#     for max_hyper_synset_size in [3, 5, 10, 15, 20]:
#         for hc_max in [1, 2, 3, 0]: 
#             p = (output_dir, max_top_synsets, max_hyper_synset_size, hc_max)
# with terminating(Pool(32)) as pool:
#     for res in pool.imap_unordered(runp, todo):
#         print res
     
# Sequential sweep over the full parameter grid: 9 values of max_top_synsets
# x 5 synset-size limits x 4 hc_max values = 180 calls, each writing its own
# output file (the parameters are part of the file name).
for max_top_synsets in range(1,10):
    for max_hyper_synset_size in [3, 5, 10, 15, 20]:
        for hc_max in [1, 2, 3, 0]: 
            print("="*50)
            print("max number of synsets:", max_top_synsets)
            print("max hyper synset size:", max_hyper_synset_size)
            print("hc_max:", hc_max)
            generate_binary_hypers(output_dir, max_top_synsets, max_hyper_synset_size, hc_max)
            
            

Loading spacy model...
Loaded 72143 words from: /home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv.pkl


KeyboardInterrupt: 

In [None]:
# Browse entries of dsv.pcz.data whose ID contains `hs_type` and whose cluster
# has at least `min_size` senses: print the entry, its "s" counterpart and the
# most similar sense vectors with their clusters.
# Uses the print() function like the rest of the notebook (the previous
# Python 2 print statements are a SyntaxError on a Python 3 kernel).
hs_type = "h"
min_size = 3
n = 0
for i, hs_id in enumerate(dsv.pcz.data):
    synset_len = len(dsv.pcz.data[hs_id][0]["cluster"])
    if synset_len >= min_size and hs_type in hs_id:
        print("\n", hs_id, ", ".join(dsv.pcz.data[hs_id][0]["cluster"]))
        # The counterpart entry shares the ID, with the first character replaced by "s".
        s_id = "s" + hs_id[1:]
        print(s_id, ", ".join(dsv.pcz.data[s_id][0]["cluster"]))
        for rhs_id, s in dsv.sense_vectors.most_similar(hs_id + "#0"):
            # Sense-vector keys look like "<id>#<n>"; the part before "#" indexes pcz.data.
            rhs_str = ", ".join(dsv.pcz.data[rhs_id.split("#")[0]][0]["cluster"])
            print("\t%s:%.3f %s" % (rhs_id, s, rhs_str))
        n += 1
    # Stops once more than 100 entries have been shown (i.e. after the 101st).
    if n > 100: break

In [9]:
# List, in sorted order, every ID in dsv.pcz.data that contains the letter "h".
# NOTE(review): this displays the complete list; consider slicing the result
# (e.g. [:20]) to keep the stored output small.
sorted(k for k in dsv.pcz.data.keys() if "h" in k)      

['h1',
 'h1#1',
 'hA100',
 'hA10003',
 'hA10005',
 'hA10007',
 'hA10011',
 'hA10014',
 'hA10025',
 'hA10037',
 'hA10047',
 'hA10061',
 'hA10068',
 'hA10071',
 'hA10073',
 'hA10078',
 'hA10079',
 'hA10086',
 'hA10099',
 'hA1010',
 'hA10103',
 'hA10111',
 'hA10113',
 'hA10114',
 'hA10115',
 'hA10117',
 'hA10125',
 'hA1013',
 'hA10130',
 'hA10131',
 'hA10133',
 'hA10134',
 'hA10138',
 'hA1015',
 'hA10152',
 'hA10156',
 'hA1016',
 'hA10162',
 'hA10170',
 'hA10177',
 'hA10178',
 'hA10183',
 'hA10187',
 'hA10195',
 'hA10206',
 'hA10208',
 'hA10210',
 'hA10212',
 'hA10213',
 'hA10215',
 'hA10222',
 'hA10226',
 'hA10232',
 'hA10239',
 'hA10245',
 'hA10246',
 'hA10251',
 'hA10256',
 'hA1026',
 'hA10268',
 'hA10271',
 'hA10272',
 'hA10279',
 'hA10281',
 'hA10283',
 'hA10287',
 'hA10290',
 'hA10291',
 'hA10294',
 'hA10297',
 'hA10298',
 'hA10299',
 'hA10309',
 'hA10311',
 'hA10312',
 'hA10314',
 'hA1032',
 'hA10338',
 'hA10343',
 'hA10344',
 'hA10346',
 'hA10362',
 'hA10364',
 'hA10366',
 'hA1036

In [None]:
from glob import glob 
from vector_representations.build_sense_vectors import run

# Build sense vectors for every "*-sensegram.tsv" inventory of each language,
# using the word vectors that belong to that language.
for lang in ["ru", "en"]:
    sensegram_fpaths = "/home/panchenko/tmp/vector-link/konvens/{}/*-sensegram.tsv".format(lang)
    # Compare against `lang`: the bare literal "ru" is always truthy, which made
    # the English inventories use the Russian word vectors as well.
    w2v_fpath = w2v_ru_original_fpath if lang == "ru" else w2v_en_original_fpath

    for inventory_fpath in glob(sensegram_fpaths):

        run(inventory_fpath, w2v_fpath)
    
    


2018-06-07 18:26:33,085 : INFO : loading EuclideanKeyedVectors object from /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl


Input PCZ: /home/panchenko/tmp/vector-link/konvens/ru/rwn-joint-tfidf-sensegram.tsv
Input word vectors: /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v
Sparse: False
Type of vector normalization: sum
Weight type: score
Max. number of cluster words to use: 20
Sense dim. number (sparse only): 1000
Save pickle (sparse only): False


2018-06-07 18:26:49,651 : INFO : loading syn0 from /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl.syn0.npy with mmap=None
2018-06-07 18:26:56,527 : INFO : setting ignored attribute syn0norm to None
2018-06-07 18:26:56,529 : INFO : loaded /home/panchenko/tmp/all.norm-sz500-w10-cb0-it3-min5.w2v.pkl
2018-06-07 18:26:56,530 : INFO : precomputing L2-norms of word weight vectors
