# Code for WikiData Based Adaptation
(partially redacted for anonymity.  Code as is requires large RAM for loading WikiData.  GPU makes Faiss search faster, but still manageable on CPU)

In [1]:
#standard
import jsonlines
import collections
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
np.set_printoptions(suppress=True, formatter={'float_kind':'{:f}'.format})
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import math
import copy

#unstandard
import wikipedia
import warnings
warnings.filterwarnings('ignore')
import faiss
#if GPU available
res = faiss.StandardGpuResources()


#for printing out first keys in dict
from itertools import islice
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

Loading faiss with AVX2 support.


In [2]:
def extract_values(dictionary):
    values = []
    for item in dictionary.values():
        vals_only = []
        for (prop, val) in item:
            
            #if val[0] == 'Q':
            vals_only.append(val)
        values.append(vals_only)
    return values

def extract_props(dictionary):
    props = []
    for item in dictionary.values():
        props_only = []
        for (prop, val) in item:
            
            #if val[0] == 'Q':
            props_only.append(prop)
        props.append(props_only)
    return props

def join(values):
    joined = [' '.join(vals) for vals in values]
    return joined

def load_obj(name ):
    with open('/fs/REDACTED/modulation/' + name + '.pkl', 'rb') as f:
        return pickle.load(f) 

def save_obj(obj, name ):
    with open('/fs/REDACTED/modulation/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

In [3]:
%%time
final = {}
count = 0
america, germany = [], []

with jsonlines.open('/fs/REDACTED/modulation/10-26-20-wikidata.jsonl') as reader:
        all_dict = {}
        count = 0
        
        for obj in reader:
            if obj['value'] == 'Q30':
                america.append(obj['title'])
            elif obj['value'] in ['Q183','Q32','Q40', 'Q39', 'Q347',
                                  'Q43287', 'Q27306', 'Q154195', 'Q41304', 'Q16957', 'Q713750', 
                                  'Q155570', 'Q12548','Q156199', 'Q151624', 'Q150981', 'Q159631', 
                                  'Q186320', 'Q20135', 'Q154741']:
                germany.append(obj['title']), 
            
            if obj['title'] not in all_dict.keys():
                all_dict[obj['title']] = list()
               
            all_dict[obj['title']].append([obj['property'],obj['value']])
            
            count+=1
            if count % 100000000 == 0:
                print (count)
 
print("finished") 

100000000
200000000
300000000
400000000
500000000
finished
CPU times: user 1h 4min 32s, sys: 4min 16s, total: 1h 8min 48s
Wall time: 1h 8min 47s


In [4]:
germany = set(germany)
america = set(america)
amer_dict = {}
for index, i in enumerate(america):
    amer_dict[i] = all_dict[i]
ger_dict = {}
for index, i in enumerate(america):
    ger_dict[i] = all_dict[i]

### Remove Overlap 

In [5]:
###### Avoid dual citizens
germany_variations = ['Q183','Q32','Q40', 'Q39', 'Q347',
                        'Q43287', 'Q27306', 'Q154195', 'Q41304', 'Q16957', 'Q713750', 
                        'Q155570', 'Q12548','Q156199', 'Q151624', 'Q150981', 'Q159631', 
                        'Q186320', 'Q20135', 'Q154741']

redacted_amer_dict = {}

for key,vals in amer_dict.items():
    german_flag = False
    for val in vals:
        if val[0] =='P27' and val[1] in germany_variations:
            german_flag = True
            break
    if not german_flag:
        redacted_amer_dict[key] = vals
        
redacted_ger_dict = {}
for key,vals in ger_dict.items():
    american_flag = False
    for val in vals:
        if val[0] =='P27' and val[1] =='Q30':
            american_flag = True
            break
    if not american_flag:
        redacted_ger_dict[key] = vals

In [6]:
%%time
#extract values from germans and americans
american_vals = extract_props(redacted_amer_dict) #formerly americans not amer_dict
print("americans", len(american_vals))
german_vals = extract_props(redacted_ger_dict)
print("germans", len(german_vals))

#combine vocab for vectorizer
all_vals = copy.deepcopy(american_vals)
all_vals.extend(german_vals)

americans 1029205
germans 675084
CPU times: user 15 s, sys: 456 ms, total: 15.5 s
Wall time: 15.5 s


### Vectorizer

In [9]:
%%time
#drop items that occur too rarely and too frequently
vectorizer = CountVectorizer(max_features = 1000)
# #create vocab on ALL the data, so it's consistent across sub-matrices
vectorizer.fit(join(all_vals)) #UPDATE all_vals to join germans and americans

# #passing in vectorizer into a function causes kernel crash
# #faiss requires dense matrices in float32
american_matrix = vectorizer.transform(join(american_vals))
american_matrix = american_matrix.toarray().astype('float32')

german_matrix = vectorizer.transform(join(german_vals))
german_matrix = german_matrix.toarray().astype('float32')

CPU times: user 32.1 s, sys: 4.4 s, total: 36.5 s
Wall time: 36.5 s


In [8]:
with open('/fs/REDACTED/modulation/america.txt', 'r') as f:
    data = f.read()
amer_veale_noc_list = data.split('\n')[:35]
amer_wiki_list = data.split('\n')[35:]

with open('/fs/REDACTED/modulation/germany.txt', 'r') as f:
    data = f.read()
ger_veale_noc_list = data.split('\n')[:20]
ger_wiki_list = data.split('\n')[20:]

In [10]:
#process the adaptation source and get their WikiData features

cnt=0
only_entities, amer_keys, ger_keys = [], [], []

for e in amer_entities:
    if e.strip() in amer_dict.keys():
        #print (e, ger_dict[e.strip()])
        only_entities.append(amer_dict[e.strip()])
        amer_keys.append(e.strip())
        
temp = []
for item in only_entities:
    valz = ([val[0] for val in item if val[0].startswith('P')])
    temp.append(valz)
    
america_entity_matrix = vectorizer.transform(join(temp))
america_entity_matrix = america_entity_matrix.toarray().astype('float32')

only_entities, ger_keys = [], []

cnt=0
for e in ger_entities:
    if e.strip() in ger_dict.keys():
        #print (e, ger_dict[e.strip()])
        only_entities.append(ger_dict[e.strip()])
        ger_keys.append(e.strip())
    else:    
        try:
            page = wikipedia.page(e.strip())
            page = page.title
            #page = re.sub(r'\(.*?\)', '', page)
            #page = page.strip()
            if page in ger_dict.keys():
                print('Found', page)
                cnt+=1
                only_entities.append(ger_dict[page])
                ger_keys.append(page)
        
        except:
            pass
              
        
temp = []
for item in only_entities:
    valz = ([val[0] for val in item if val[0].startswith('P')])
    temp.append(valz)
    
germany_entity_matrix = vectorizer.transform(join(temp))
germany_entity_matrix = germany_entity_matrix.toarray().astype('float32')

Found Germany
Found Switzerland
Found 24 (TV series)
Found Terminator: The Sarah Connor Chronicles
Found Hamburger
Found Germany
Found Luxembourg


### Faiss Search

In [11]:
%%time

def faiss_search(base_matrix, search_matrix):
    assert len(base_matrix[0]) == len(search_matrix[0])
        
    print ("Using Dimensions:", len(base_matrix[0]))
    d= len(base_matrix[0])

    index = faiss.IndexFlatL2(d)   # build the index  #change to index_flat if gpu
    #index = faiss.index_cpu_to_gpu(res, 0, index_flat) add if GPU
    print(index.is_trained)
    index.add(base_matrix)                  # add vectors to the index
    print(index.ntotal)

    k = 100                             # we want to see 100 nearest neighbors
    D, I = index.search(base_matrix[:5], k) # sanity check
    print("Nearest Neighbors ", I)
    print()
    print("Distance Value ", D)
    
    D, I = index.search(search_matrix, k)     # actual search
    print("Nearest Neighbors, Start: ", I[:5])                   # neighbors of the 5 first queries
    print()
    print("Nearest Neighbors, End: ", I[-5:])
    return(D, I)

D, I = faiss_search(american_matrix, germany_entity_matrix )
#D, I = faiss_search(german_matrix, american_matrix, )

Using Dimensions: 946
True
1029205
Nearest Neighbors  [[ 745784 1027325  756258  460071  742539  657431 1002839  837585  513671
   478412   22940  228222  263121   42517  889247  745852  660127  866333
   915013  571905  553352  479066  512423  331037  297586  189567  408768
   289135  167297  404013  825950  651302  930701  374385  939857  817234
   574676  959048   97378  905177  791865  738486  598066  578903  526844
   664853  498464  506865  461882  455037  288753  179666       0   54578
   130964   20779   51654   54769   54835   54962  132864   52174   52593
    51193  132945  130829  130302  122039   49827  117468   49953   16711
   118462   49965   15878   15179  116389   44484  115441  133408  121685
   116620  115543   56507  108246   42535    1637  109626  107449   13893
   102174  105963   40624   41644  111111  107153   42753   39956   10342
    96798]
 [  16116   26405   18888    6408      96   32389   18997   55011    9247
     7974     424   70915   68350   30431   647

Nearest Neighbors, Start:  [[ 896510  292616  308001  306058  285421 1024191 1007187  478825   63508
   221252  254598  329853  110150  995295  407883   61805  612412  874393
   294931  723860  439967  805952  851190  980673  327138 1003151  644086
   573031  787427   60125   24671  334210   74129   12420  782595  474994
   773662  371262  331137  925394  599489  998054  398972  567318  138228
   477521  586752  257075  874590  280774  938799  964525  394174  253483
   705923  418567  794214  164293  739471  422018  547376  881115  330537
   412156  456644  721750  669255   77029  172009   37786  418549  784340
    19764  867723  629213  912152  337275  953223  270652  657323   54157
   375265 1028808  346030  181422  905434   15007  222452  255288  736696
   345562  953224  449342  237989  400465  834803  170150  226151  271460
   982977]
 [ 684339  173362  409057  366281  910308  855785  764095  722825  674017
   283259  995880  998967   27182  993683  375674  610650  562490  875199


In [12]:
#INVERSE THE DICTIONARY AND VIEW RESULTS
EXPERIMENT_NAME = "germany_props_nn"

inv_d_a, inv_d_g = {}, {}


for index, key in enumerate(ger_keys): #americans.keys() optionally inverse
    inv_d_a[index] = key
    
for index, key in enumerate(redacted_amer_dict.keys()): #germans.keys()
    inv_d_g[index] = key

nearest_n_dict = {}
for index, nearest_n in enumerate(I):
    nearest_n_dict[inv_d_a[index]] = [inv_d_g[item] for item in nearest_n]
    

save_obj(nearest_n_dict, EXPERIMENT_NAME)

In [15]:
nearest_n_dict['Marlene Dietrich'][:10]

['George Tabori',
 'Nicolas Cage',
 'Carol Burnett',
 'Benicio del Toro',
 'Dick Van Dyke',
 'Nicole Kidman',
 'Tito Puente',
 'Ruby Dee',
 'Ben Affleck',
 'Milo≈° Forman']

In [None]:
wikidata_germany = load_obj('germany_props_nn')
wikidata_america = load_obj('america_props_nn')

veale_de, wiki_de, veale_us, wiki_us = [], [], [], []
for i,keys in wikidata_germany.items():
    if i in ger_veale_noc_list:
        veale_de.append([i, keys])
    elif i in ger_wiki_list:
        wiki_de.append([i, keys])
        
for i,keys in wikidata_america.items():
    if i in amer_veale_noc_list:
        veale_us.append([i, keys])
    elif i in amer_wiki_list:
        wiki_us.append([i, keys])

In [None]:
def write_file(name,data):
    with open(name, 'w') as f:
        for i in data:
            formatted = []
            count = 1
            title = re.sub(r'\(.*?\)', '', i[0])
            formatted.append('_'.join(title.split()))
            for item in i[1]:
                item = re.sub(r'\(.*?\)', '', item)
                formatted.extend(['_'.join(item.split()), str(1/count)])
                count+=1
            f.write('\t'.join(formatted)+'\n')

write_file('/fs/clip-quiz/REDACTED/wikidata_us_veale.txt', veale_us)
write_file('/fs/clip-quiz/REDACTED/wikidata_us_wiki.txt', wiki_us)
write_file('/fs/clip-quiz/REDACTED/wikidata_de_veale.txt', veale_de)
write_file('/fs/clip-quiz/REDACTED/wikidata_de_wiki.txt', wiki_de)