In [1]:
#standard
import jsonlines
import collections
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
np.set_printoptions(suppress=True, formatter={'float_kind':'{:f}'.format})
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

#unstandard
import faiss
#if GPU available
res = faiss.StandardGpuResources()

#for printing out first keys in dict
from itertools import islice
def take(n, iterable):
    """Collect at most the first ``n`` elements of ``iterable`` into a list.

    Fewer than ``n`` elements come back when the iterable runs out early.
    """
    leading = islice(iterable, n)
    return [element for element in leading]

Loading faiss with AVX2 support.


In [None]:
def load_obj(name, directory='/fs/clip-quiz/dpeskov/modulation/'):
    """Load the object pickled in ``<directory>/<name>.pkl``.

    Args:
        name: File stem of the pickle, without the ``.pkl`` suffix.
        directory: Folder containing the pickle. Defaults to the path this
            cell used to hard-code, so ``load_obj(name)`` behaves as before.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    import os  # local import: the notebook's import cell does not import os

    path = os.path.join(directory, name + '.pkl')
    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)
# Load the pickled WikiData dictionary and print one (key, value) pair as a
# quick look at its structure.
loaded_d = load_obj("prop_val_dd")
print("Example WikiData:", take(1, loaded_d.items()))

In [3]:
#loop through WikiData and select items that have a given property
#default is US and German citizenship
def select_data(property1 = 'Q183', property2 = 'Q30'):
    """Partition the notebook-level ``loaded_d`` by its 'P27' entries.

    Every key whose entry list contains ``['P27', property1]`` is copied into
    the first returned dict, every key containing ``['P27', property2]`` into
    the second; a key may appear in both. Values are the key's full entry
    list. Matching entries are tallied (repeats included) and the two tallies
    plus the first five items of each dict are printed.

    Args:
        property1: 'P27' value selecting the first ("German") group.
        property2: 'P27' value selecting the second ("American") group.

    Returns:
        tuple: ``(germans, americans)`` dictionaries.
    """
    hits1 = hits2 = 0
    germans = {}
    americans = {}
    for title, entries in loaded_d.items():
        # Values of this title's 'P27' entries; repeats are kept so the
        # printed totals count every matching entry.
        p27_values = [entry[1] for entry in entries if entry[0] == 'P27']
        matches1 = sum(1 for value in p27_values if value == property1)
        matches2 = sum(1 for value in p27_values if value == property2)
        hits1 += matches1
        hits2 += matches2

        if matches1:
            germans[title] = entries
        if matches2:
            americans[title] = entries

    print(hits1, hits2)
    print("German:", take(5, germans.items()))
    print("Americans:", take(5, americans.items()))
    return (germans, americans)

In [4]:
def extract_values(dictionary):
    """Strip the property half of every (property, value) pair.

    Args:
        dictionary: Mapping whose values are iterables of 2-item pairs.

    Returns:
        list: One list per mapping value, in mapping order, holding only the
        second element of each pair.
    """
    return [[val for (_prop, val) in pairs] for pairs in dictionary.values()]

def join(values):
    """Turn each inner sequence of strings into one space-separated string.

    Args:
        values: Iterable of iterables of strings.

    Returns:
        list: One joined string per inner sequence, in input order.
    """
    sentences = []
    for tokens in values:
        sentences.append(' '.join(tokens))
    return sentences

In [None]:
%%time
# Split loaded_d into the two default groups (select_data defaults:
# 'Q183' -> germans, 'Q30' -> americans).
germans, americans = select_data()

# Keep only the value half of each (property, value) pair, one list per key.
# Row order follows the dict order of `americans` / `germans`.
american_vals = extract_values(americans)
german_vals = extract_values(germans)

# Same extraction over the full dictionary; used to fit a shared vocabulary.
all_vals = extract_values(loaded_d)

In [7]:
%%time
# Drop terms that occur too rarely or too frequently. With integer arguments
# min_df / max_df are document-count thresholds.
vectorizer = CountVectorizer(min_df=1000, max_df=1000000)
# Fit the vocabulary on ALL the data so columns are consistent across the
# per-group matrices built below.
vectorizer.fit(join(all_vals))

# #passing in vectorizer into a function causes kernel crash
# faiss requires dense float32 matrices. Cast while the matrix is still sparse
# and densify afterwards: the result is the same, but no dense matrix in the
# vectorizer's wider default dtype is ever materialised.
american_matrix = vectorizer.transform(join(american_vals))
american_matrix = american_matrix.astype('float32').toarray()

german_matrix = vectorizer.transform(join(german_vals))
german_matrix = german_matrix.astype('float32').toarray()

CPU times: user 4min 22s, sys: 18.5 s, total: 4min 40s
Wall time: 4min 40s


In [8]:
%%time

def faiss_search(base_matrix, search_matrix, k=6):
    """Find the ``k`` nearest rows of ``base_matrix`` for each row of ``search_matrix``.

    A flat L2 faiss index over ``base_matrix`` is built and moved to GPU 0
    using the notebook-level ``res`` GPU resources. It is queried twice: once
    with the first five base rows as a sanity check, then with
    ``search_matrix``. Intermediate results are printed.

    Args:
        base_matrix: 2-D collection of vectors to index. The calling cell
            passes dense float32 arrays (its comment says faiss requires them).
        search_matrix: 2-D collection of query vectors with the same row
            length as ``base_matrix``.
        k: Number of neighbours returned per query. Defaults to 6, the value
            that was previously hard-coded (the old comment wrongly said 4).

    Returns:
        tuple: ``(D, I)`` from ``index.search(search_matrix, k)``; the prints
        label ``D`` as distance values and ``I`` as neighbour indices.

    Raises:
        AssertionError: If the two matrices have different row lengths.
    """
    d = len(base_matrix[0])
    assert d == len(search_matrix[0]), \
        "base_matrix and search_matrix must have the same number of columns"

    print ("Using Dimensions:", d)

    index_flat = faiss.IndexFlatL2(d)   # build the index
    index = faiss.index_cpu_to_gpu(res, 0, index_flat)  # move it to GPU device 0
    print(index.is_trained)
    index.add(base_matrix)                  # add vectors to the index
    print(index.ntotal)

    # Sanity check: query with rows that are themselves in the index; each is
    # expected to come back as its own nearest neighbour.
    D, I = index.search(base_matrix[:5], k)
    print("Nearest Neighbors ", I)
    print()
    print("Distance Value ", D)

    D, I = index.search(search_matrix, k)     # actual search
    print("Nearest Neighbors, Start: ", I[:5])                   # neighbors of the 5 first queries
    print()
    print("Nearest Neighbors, End: ", I[-5:])
    return(D, I)

# Index the German vectors and query them with the American vectors, so row i
# of D / I holds the results for the i-th row of american_matrix.
D, I = faiss_search(german_matrix, american_matrix)

Using Dimensions: 1774
True
46081
Nearest Neighbors  [[    0  1945 41530  1327  3427  1814]
 [    1 12895 13656    17 15009 22989]
 [    2 21530 22176 18741 40433 31854]
 [    3 19994 16331  3652 25033  3982]
 [    4  1686  4508 21507 21782 11931]]

Distance Value  [[0.000000 12009611460608.000000 12009614606336.000000
  12009614606336.000000 12009614606336.000000 12009614606336.000000]
 [0.000000 15.000000 15.000000 17.000000 17.000000 17.000000]
 [0.000000 2.000000 3.000000 3.000000 3.000000 3.000000]
 [0.000000 4.000000 4.000000 4.000000 4.000000 5.000000]
 [0.000000 0.000000 0.000000 0.000000 0.000000 0.000000]]
Nearest Neighbors, Start:  [[17928 30095 11195 21820 13857 13234]
 [31195 11800 31790 20364 20556 35522]
 [    0  1945 41530  1327  3427  1814]
 [20240  9272 38762  1753 21619 28147]
 [29614 38196  4646 14657 40440 29550]]

Nearest Neighbors, End:  [[ 9099 41357 14349 15903 27524 32256]
 [30161   902  5526  4882  3358  4116]
 [30161   902  5526  4882  3358  4116]
 [30161   

In [9]:
# File stem under which the neighbour dictionary is saved.
EXPERIMENT_NAME = "german_nn"

# Row position -> dictionary key, for each group. This relies on the matrices
# having been built in the same key order as `americans` / `germans`.
inv_d_a, inv_d_g = {}, {}
for index, key in enumerate(americans.keys()):
    inv_d_a[index] = key
    
for index, key in enumerate(germans.keys()):
    inv_d_g[index] = key

# Translate every row of I (row = American query, entries = German row
# positions) back into keys.
# NOTE(review): an entry of I with no inv_d_g key would raise KeyError here.
nearest_n_dict = {}
for index, nearest_n in enumerate(I):
    nearest_n_dict[inv_d_a[index]] = [inv_d_g[item] for item in nearest_n]
    
def save_obj(obj, name, directory='/fs/clip-scratch/dpeskov/obj/'):
    """Pickle ``obj`` to ``<directory>/<name>.pkl``.

    Args:
        obj: Any picklable object.
        name: File stem of the pickle, without the ``.pkl`` suffix.
        directory: Destination folder. Defaults to the path this cell used to
            hard-code, so ``save_obj(obj, name)`` behaves as before.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    import os  # local import: the notebook's import cell does not import os

    path = os.path.join(directory, name + '.pkl')
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

# Persist the key -> neighbour-keys mapping as "<EXPERIMENT_NAME>.pkl".
save_obj(nearest_n_dict, EXPERIMENT_NAME)

In [50]:
# Print 30 query keys (positions 1000-1029 of ind_mat) with their neighbours.
# NOTE(review): `ind_mat` is only created in a cell that appears further down
# the notebook (the `if i[0] == 3` loop); on a fresh top-to-bottom run this
# cell raises NameError.
examples = ind_mat[1000:1030]
for ex in examples:
    print (inv_d_a[ex], nearest_n_dict[inv_d_a[ex]] )

Marvin Mandel ['Moses Alexander', 'Christopher Memminger', 'Emil Anneke', 'Siegfried Guggenheim', 'Curt Teichert', 'John Peter Altgeld']
Jay Munly ['Kurt Jahnke', 'Heiner Friedrich', 'Dirk Dirksen', 'Bernhard Tessmann', 'Philip William August, Count Palatine of Neuburg', 'Gottfried Michaelsen']
Bob Bergen ['Robert Lindemann', 'Robert Fegg', 'Paulus Roetter', 'Kurt Jahnke', 'Gudo Hoegel', 'Robert Wiebking']
Eli Perry ['Moses Alexander', 'Anthony Eickhoff', 'Curt Teichert', 'Gustavus Sessinghaus', 'Simon Bamberger', 'Kurt Jahnke']
Julie Ditty ['Stephanie Gehrlein', 'Ria Sabay', 'Kirstin Freye', 'Antonia Matic', 'Svenja Weidemann', 'Justine Ozga']
Mark S. Schweiker ['Nicholas J. Rusch', 'Emil Anneke', 'Curt Teichert', 'Moses Alexander', 'Heiner Friedrich', 'Shawn Bradley']
Sam Langford ['Jürgen Brähmer', 'Rüdiger May', 'Dominik Britsch', 'Kurt Jahnke', 'Hamza Touba', 'Dimitri Sartison']
Mike Budenholzer ['Hurl Beechum', 'Denis Wucherer', 'Lucca Staiger', 'Thorsten Leibenath', 'Cody Topper

In [56]:
# Spot-check the stored neighbour list for one key.
nearest_n_dict['Kobe Bryant']

['Demond Greene',
 'Sergio Kerusch',
 'Steffen Hamann',
 'Pascal Roller',
 'Hurl Beechum',
 'Sebastian Schmitt']

In [44]:
# Collect the row positions of D whose first column equals 3, and count them.
# NOTE(review): 3 is an unexplained magic number; it is compared against the
# first (nearest) distance of each query — confirm the intended threshold.
tots = 0
ind_mat = []
for index, i in enumerate(D):
    if i[0] == 3:
        tots+=1
        ind_mat.append(index)
print(tots)

79294


In [16]:
# Print the neighbour list of the first entry only (the loop exits at once).
for key, vals in nearest_n_dict.items():
    print(vals)
    break

['Oscar Holderer', 'Gerhard Neumann', 'Gerhard Fischer (inventor)', 'August Schrader', 'Felix Salm-Salm', 'Fritz Mueller']


In [76]:
# NOTE(review): `D[:5]` is evaluated and discarded — a notebook cell displays
# only its last expression, so only the take(...) result below is shown.
D[:5]
take(5, nearest_n_dict.items())

[('George Washington',
  ['Ivan Fioletov',
   'George Balanchine',
   'Ruslan Gelayev',
   'Bogdan Belsky',
   'Mikhail Batin',
   'Sergey Semyonovich Khabalov']),
 ('Larry Sanger',
  ['Prince Rostislav Romanov (born 1985)',
   'Vic Wild',
   'Leo II of Galicia',
   'Alexey Dyumin',
   'Maxim Lykov',
   'Aleksey Galkin']),
 (None,
  [None,
   'Vladimir Posner',
   'Anna Politkovskaya',
   'Sophie Shevardnadze',
   'Yulia Latynina',
   'Vladimir Putin']),
 ('Jenna Jameson',
  ['Irina Pantaeva',
   'Tatyana Kosmacheva',
   'Marina Aleksandrova',
   'Edelweiss (actress)',
   'Ludmilla Radchenko',
   'Anna Ukolova']),
 ('Bill Maher',
  ['Oleg Stefan',
   'Ivan Dobronravov',
   'Andrei Dementiev (actor)',
   'Maxim Munzuk',
   'Grigoriy Dobrygin',
   'Vitali Konyayev']),
 ('Joseph Brodsky',
  ['Andrey Krasko',
   'Roman Trakhtenberg',
   'Lev Gumilyov',
   'Elena Shvarts',
   'Yevgeny Rein',
   'Anatoly Naiman']),
 ('Gary Gygax',
  ['Ille Takhti',
   'Prince Rostislav Romanov (born 1985)',


In [None]:
%%time
#crashes for some reason

def dict_transform(dictionary, vocab):
    """Vectorise lists of tokens into a dense float32 matrix.

    Args:
        dictionary: Iterable of token lists (despite the name, the callers in
            this notebook pass the list-of-lists from ``extract_values``).
        vocab: Fitted vectorizer exposing ``transform``.

    Returns:
        Dense float32 array with one row per token list.
    """
    sparse_matrix = vocab.transform(join(dictionary))
    # Cast while still sparse, then densify: same result as
    # toarray().astype('float32'), but no dense matrix in the vectorizer's
    # wider dtype is allocated first.
    # NOTE(review): that allocation may be behind the "crashes for some
    # reason" remark on this cell — unconfirmed.
    dense = sparse_matrix.astype('float32').toarray()
    return dense

# Function-based variant of the inline transform/toarray cell; rebinds the
# same two matrix names.
american_matrix = dict_transform(american_vals, vectorizer)
german_matrix = dict_transform(german_vals, vectorizer)

In [None]:
# Row position -> key over the whole of loaded_d.
inv_d = {}
for index, key in enumerate(loaded_d.keys()):
    inv_d[index] = key

# The same mapping is used for both the query row and the neighbour indices.
# NOTE(review): this presumes I came from searching the full data set against
# itself, not the German/American search — verify before running.
nearest_n_dict = {}
for index, nearest_n in enumerate(I):
    nearest_n_dict[inv_d[index]] = [inv_d[item] for item in nearest_n]
    #print (inv_d[index])
    #print ("TOP 5: ", [inv_d[item] for item in nearest_n], '\n')

In [92]:
# Load the pickle named "all_data_nn". The directory read depends on which
# load_obj definition was executed last in the kernel.
test= load_obj("all_data_nn")

In [103]:
# Spot-check the entry stored under 'Germany'.
test['Germany']

['Germany',
 'Italy',
 'Australia',
 'International E-road network',
 'European Union',
 'Brazil']

In [None]:
%%time
# Pairwise cosine similarity between the rows of sparse_matrix, kept sparse
# (dense_output=False).
# NOTE(review): `sparse_matrix` is only assigned in cells further down the
# notebook; on a fresh top-to-bottom run this raises NameError.
similarities = cosine_similarity(sparse_matrix, dense_output = False)
#print('pairwise dense output:\n {}\n'.format(similarities))

In [None]:
# Column positions of the 10 largest similarities in row 99, best first.
# `similarities` is a scipy sparse matrix, which has no argsort method, so the
# single row is densified and flattened before sorting on negated scores.
idx = (-similarities[99].toarray().ravel()).argsort()[:10]
idx

In [2]:
%%time
final = {}
count = 0

# Stream the WikiData JSONL dump and group its rows by title:
# all_dict[title] is a list of [property, value] pairs in file order.
with jsonlines.open('/fs/clip-quiz/dpeskov/modulation/10-26-20-wikidata.jsonl') as reader:
    all_dict = {}
    count = 0

    for obj in reader:
        # setdefault creates the per-title list the first time a title is seen
        all_dict.setdefault(obj['title'], []).append([obj['property'], obj['value']])
        count += 1
        # progress marker every 100 million rows
        if not count % 100000000:
            print (count)

print("finished")
#
# tots, tots2 = 0, 0                
# for key in all_dict.keys():
#     #all_dict[key] = list(set(all_dict[key]))
#     for vals in all_dict[key]:
#         #print(vals[0])
#         if vals[0] =='P27' and vals[1] == 'Q183':
#             #print (key)
#             tots+=1
#         if vals[0] =='P27' and vals[1] == 'Q30':
#             tots2+=1
#             break
# print(tots, tots2)

100000000
200000000
300000000
400000000
500000000
finished
CPU times: user 50min 4s, sys: 2min 57s, total: 53min 1s
Wall time: 53min


In [None]:
def load_obj(name, directory='/fs/clip-scratch/dpeskov/obj/'):
    """Load the object pickled in ``<directory>/<name>.pkl``.

    Args:
        name: File stem of the pickle, without the ``.pkl`` suffix.
        directory: Folder containing the pickle. Defaults to the path this
            cell used to hard-code, so ``load_obj(name)`` behaves as before.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    import os  # local import: the notebook's import cell does not import os

    path = os.path.join(directory, name + '.pkl')
    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)
    
# Rebind loaded_d from the "prop_val_dd" pickle, using the load_obj defined
# in this cell.
loaded_d = load_obj("prop_val_dd")

In [None]:
# Number of keys in loaded_d.
len(loaded_d)

In [None]:
# For every key keep only the values whose first character is 'Q', one list
# per key in loaded_d order.
# NOTE(review): `val[0]` raises IndexError on an empty string, and TypeError
# if a value is not subscriptable — confirm the values are non-empty strings.
values = []
for item in loaded_d.values():
    vals_only = []
    for (prop, val) in item:
        if val[0] == 'Q':
            vals_only.append(val)
    values.append(vals_only)

In [None]:
# Show the container type of `values`. The dangling `faiss.` that followed was
# an incomplete expression (SyntaxError) that stopped the cell from running.
type(values)

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
# Fit a vocabulary where every individual value is treated as its own
# document: `values` (list of lists) is flattened first.
vectorizer = CountVectorizer()
flat_list = [item for sublist in values for item in sublist]
print(len(flat_list))
# `vocab` keeps whatever fit() returns; it is used later as
# `vocab.transform(...)`.
vocab = vectorizer.fit(flat_list)

In [None]:
# One space-separated string per key, built from its list of values.
joined = [' '.join(vals) for vals in values]

In [None]:
# Show the first five joined strings.
joined[:5]

In [None]:
%%time
# Rebind `vectorizer` to a fresh CountVectorizer and fit + transform the
# joined strings in one step. `vocab`, if assigned earlier, still refers to
# the previous vectorizer object.
vectorizer = CountVectorizer()
sparse_matrix = vectorizer.fit_transform(joined)

In [None]:
%%time
# Vectorise the joined strings with the vocabulary held in `vocab`.
transformed = vocab.transform(joined)

In [None]:
%%time
# Hand-build a CSR term-count matrix for the first five value lists.
docs = values[:5]
indptr = [0]          # row boundaries into `indices` / `data`
indices = []          # column id of every stored entry
data = []             # stored entry values (always 1 here)
vocabulary = {}       # term -> column id, assigned in order of first sight
for d in docs:
    for term in d:
        # setdefault returns the existing id, or assigns the next free one
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))

# NOTE(review): this rebinds `sparse_matrix` to a 5-row matrix, replacing any
# matrix stored under that name by an earlier cell.
sparse_matrix = csr_matrix((data, indices, indptr), dtype=int)

In [None]:
%%time
import sklearn.preprocessing as pp

def cosine_similarities(mat):
    """Return the product of the column-normalised ``mat`` transposed with itself.

    ``mat`` is converted with ``tocsc()``, each column is normalised
    (``axis=0``), and ``col_normed.T * col_normed`` is returned.

    NOTE(review): with scipy sparse *matrix* types ``*`` is a matrix product,
    so entry (i, j) would be the similarity between columns i and j (features),
    not between rows (documents) — confirm this is the intended axis, and that
    ``pp.normalize`` returns a type for which ``*`` is not element-wise.
    """
    col_normed_mat = pp.normalize(mat.tocsc(), axis=0)
    return col_normed_mat.T * col_normed_mat
# Rebind `similarities` using the helper above and print the result.
similarities = cosine_similarities(sparse_matrix)
print('pairwise dense output:\n {}\n'.format(similarities))

In [None]:
# NOTE(review): `numpy_array` is not assigned anywhere in the visible
# notebook; on a fresh kernel this raises NameError.
numpy_array[0]

In [None]:
%%time
# Count 'P27' entries equal to 'Q183' and 'Q30', and collect the keys that
# carry them into `ger` / `eng` (key -> full entry list).
# The previous version had a bare `ger` expression (a no-op, so the dicts were
# never filled) and a `break` only in the 'Q30' branch, which skipped any
# 'Q183' entry listed after a 'Q30' one. Both branches are now symmetric.
tots, tots2 = 0, 0
ger, eng = {}, {}
for key in loaded_d.keys():
    for vals in loaded_d[key]:
        if vals[0] =='P27' and vals[1] == 'Q183':
            tots+=1
            ger[key] = loaded_d[key]
        if vals[0] =='P27' and vals[1] == 'Q30':
            tots2+=1
            eng[key] = loaded_d[key]
print(tots, tots2)

In [None]:
# Dump all_dict to a JSONL file in the current working directory, passing its
# (key, value) items to write_all.
with jsonlines.open('dd_prop_val.jsonl', 'w') as writer:
    writer.write_all(all_dict.items())

In [None]:
import jsonlines
# Print only the first record of the JSONL file.
with jsonlines.open('/fs/clip-scratch/dpeskov/obj/prop_val_dd.jsonl') as reader:
        # NOTE(review): this rebinds all_dict to an empty dict and nothing is
        # added to it afterwards, so any previously built all_dict is lost.
        all_dict = {}
        for obj in reader:
            print(obj)
            break
        

In [None]:
# Spot-check one key; requires a populated all_dict (KeyError otherwise).
all_dict['Scotland']

In [None]:
# Print every record of the JSONL file in the working directory.
# NOTE(review): there is no limit, so a large file floods the cell output.
with jsonlines.open('dd_prop_val.jsonl') as reader:
        for obj in reader:
            print(obj)

In [None]:
# Spot-check one key; requires a populated all_dict (KeyError otherwise).
all_dict['Richard Wagner']

In [None]:
# Display all keys of all_dict (the output can be very large).
all_dict.keys()

In [3]:
def save_obj(obj, name, directory='/fs/clip-quiz/dpeskov/modulation/'):
    """Pickle ``obj`` to ``<directory>/<name>.pkl``.

    Args:
        obj: Any picklable object.
        name: File stem of the pickle, without the ``.pkl`` suffix.
        directory: Destination folder. Defaults to the path this cell used to
            hard-code, so ``save_obj(obj, name)`` behaves as before.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    import os  # local import: the notebook's import cell does not import os

    path = os.path.join(directory, name + '.pkl')
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

# Persist the title -> [[property, value], ...] dictionary as "prop_val_dd.pkl".
save_obj(all_dict, "prop_val_dd")

In [None]:
import gensim
# Load the binary word2vec vectors from disk, then show how many entries
# `model.vocab` yields when iterated.
model = gensim.models.KeyedVectors.load_word2vec_format('/fs/clip-quiz/dpeskov/modulation/GoogleNews-vectors-negative300.bin', binary=True)  
len([v for v in model.vocab])

In [None]:
# Lower-cased copy of every vocabulary entry (may contain duplicates).
word2vec_vocab = [v.lower() for v in model.vocab]

In [None]:
# Dict keyed by the lower-cased entries (all values None): de-duplicates them
# and gives hash-based membership tests.
word2vec_dict = dict.fromkeys(word2vec_vocab)

In [None]:
# Load the pickle named "binary_property_vector_sample". The directory read
# depends on which load_obj definition was executed last.
loaded_vec = load_obj("binary_property_vector_sample")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy
#ary = scipy.spatial.distance.cdist(loaded_vec[key], loaded_vec.values(), metric='euclidean')
from numpy import dot
from numpy.linalg import norm

# Cosine similarity between each of the first two keys of loaded_vec and
# every other key.
# Fix: `b` previously read loaded_vec[key], so every similarity compared a
# vector with itself; it now reads loaded_vec[key2].
# NOTE(review): cos_sim is computed but never stored (new_dict stays empty),
# and dot/norm presumably need dense 1-D vectors — confirm what loaded_vec holds.
new_dict = {}

count = 0
for key in loaded_vec.keys():
    count +=1
    if count > 2:
        break
    for key2 in loaded_vec.keys():
        if key != key2:
            a = loaded_vec[key]
            b = loaded_vec[key2]
            cos_sim = dot(a, b)/(norm(a)*norm(b))
            #ary = scipy.spatial.distance.cdist(loaded_vec[key], loaded_vec.values(), metric='euclidean')
            #print(cosine_similarity(loaded_vec[key], loaded_vec[key2]))
            

In [None]:
# Print a dense view of one stored vector.
# NOTE(review): `key` is not set in this cell; it is whatever value the last
# executed `for key in ...` loop left behind in the kernel.
print(scipy.sparse.csr_matrix.todense(loaded_vec[key]))

In [None]:
# Same vector as a dense array; `key` again comes from leftover kernel state.
loaded_vec[key].toarray()

In [None]:
# Try to pack all stored vectors into one numpy array.
np.array(list(loaded_vec.values()))

In [None]:
from scipy import sparse
import numpy as np
# NOTE(review): this hands a dict `values()` view straight to csr_matrix.
# That is probably not an accepted constructor input; stacking the individual
# matrices may be what was intended — confirm.
A_sparse = sparse.csr_matrix(loaded_vec.values())

In [None]:
# Count how many keys of loaded_d have their LAST whitespace-separated token
# in model.vocab (count) versus not (not_count). Falsy keys (e.g. None or '')
# are skipped and appear in neither total.
# NOTE(review): the comparison is case-sensitive against model.vocab; the
# lower-cased word2vec_vocab / word2vec_dict built elsewhere are not used.
count, not_count = 0, 0
for key in loaded_d.keys():
    #print(key.split()[0])
    if key:
        if key.split()[-1] in model.vocab:
            #print("FOUND", key)
            count +=1
        else:
            not_count +=1
            #print("NOT FOUND", key)
            #
        #if count >20:
        #    break
print(count, not_count)

In [None]:
def load_obj(name, directory='obj/'):
    """Load the object pickled in ``<directory>/<name>.pkl``.

    Args:
        name: File stem of the pickle, without the ``.pkl`` suffix.
        directory: Folder containing the pickle. Defaults to the relative
            ``obj/`` folder this cell used to hard-code, so ``load_obj(name)``
            behaves as before.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    import os  # local import: the notebook's import cell does not import os

    path = os.path.join(directory, name + '.pkl')
    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)
    
# Rebind loaded_d to the "propertydd" pickle. Cells below call `.keys()` on
# its values, so they are presumably dicts, unlike the [property, value]
# lists held under this name earlier.
loaded_d = load_obj("propertydd")

In [None]:
# Spot-check the entry stored under one key.
loaded_d['Adolf Hitler']

In [None]:
# Replace every value of loaded_d by the list of its keys.
# NOTE(review): this mutates loaded_d in place and is not idempotent — a
# second run fails because lists have no `.keys()`.
for key in loaded_d.keys():
    loaded_d[key] = list(loaded_d[key].keys())

# Unique set of all those keys across the whole dictionary.
values = []
for item in loaded_d.values():
    values.extend(item)
values = list(set(values))    

# Fit a vocabulary with each unique key treated as one document.
vectorizer = CountVectorizer()
vectorizer.fit(values)

# Trial run on the FIRST entry only (note the break): vectorise its list,
# print the dense form and shape, and overwrite the entry with the matrix.
for key,val in loaded_d.items():
    print(key,val)
    
    item = vectorizer.transform(val)
    print((item).todense())
    print((item).todense().shape)
    loaded_d[key] = item
    break

In [None]:
# Print the 16th element obtained by iterating the 'Scotland' entry
# (IndexError if it has fewer than 16).
print(list(loaded_d['Scotland'])[15])

In [None]:
# Copy the first 500000 entries of loaded_d into new_dict, each reduced to
# the list of its keys.
# NOTE(review): this needs the values of loaded_d to still be dicts; if the
# in-place conversion cell has already run, `.keys()` fails here.
count = 0
new_dict = {}
for key in loaded_d.keys():
    if count < 500000:
        new_dict[key] = list(loaded_d[key].keys())
        count+=1
    else:
        break
        
# Unique set of all keys appearing in the sample.
values = []
for item in new_dict.values():
    values.extend(item)
values = list(set(values))    

# Fit the vocabulary, then replace every list by its vectorised form:
# transform() receives the list itself, so each item becomes one row.
vectorizer = CountVectorizer()
vectorizer.fit(values)
#vector = vectorizer.transform(text)
for key,val in new_dict.items():
    item = vectorizer.transform(val)
    new_dict[key] = item

In [None]:
import numpy as np
# NOTE(review): per the numpy docs, savez_compressed appends ".npz" when the
# name lacks it, so this writes "binary_property_vector.npy.npz". loaded_d is
# passed as a single positional object; a dict would presumably be stored as
# an object array that needs allow_pickle=True to load — confirm.
np.savez_compressed("binary_property_vector.npy", loaded_d)

In [None]:
# Persist the sampled key -> vectorised matrix mapping. The destination
# folder depends on which save_obj definition was executed last.
save_obj(new_dict, "binary_property_vector_sample")

In [None]:
# NOTE(review): `hello` is not assigned anywhere in the visible notebook;
# this raises NameError on a fresh kernel.
print(hello)

In [None]:
# NOTE(review): `pd` is not imported anywhere in the visible notebook; add
# `import pandas as pd` to the import cell before running this.
df = pd.DataFrame.from_dict(all_dict)

In [None]:
# Apply np.array to each unit handed over by DataFrame.apply and display it.
df.apply(lambda x: np.array(x))

In [None]:
# NOTE(review): `c` is only assigned in commented-out code in the JSONL
# ingest cell (`#c = collections.Counter()`), so this raises NameError unless
# that line is restored.
c.most_common()