<a href="https://colab.research.google.com/github/AlbezJelt/compass-aligned-graph-embeddings/blob/main/notebooks/Cade.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install folium==0.2.1
!pip install git+https://github.com/valedica/gensim.git
!pip install -U cade

!wget https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/wiki_walks_from_dbpedia.txt
!wget https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/wikidata_walks_final.txt
!cat wiki_walks_from_dbpedia.txt wikidata_walks_final.txt >> compass.txt

In [2]:
from cade.cade import CADE
from gensim.models.word2vec import Word2Vec
from scipy.spatial.distance import cosine
from tqdm import tqdm

import collections
import warnings
import json
import requests
from IPython.display import clear_output

warnings.filterwarnings('ignore')

In [3]:
#load dictionary with labels
with requests.get("https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/wikidata_label_dictionary.json", "rt") as req:
  label_dict = json.loads(req.text)

In [4]:
class CustomCADE(CADE):
    def __init__(self, vocab, *args, **kwargs):
        CADE.__init__(self, *args, **kwargs)
        self.gvocab = vocab

    def train_model(self, sentences):
        model = None
        if self.compass == None or self.init_mode != "copy":
            model = Word2Vec(sg=self.sg, size=self.size, alpha=self.static_alpha, iter=self.static_iter,
                                negative=self.negative,
                                window=self.window, min_count=self.min_count, workers=self.workers)
            # Modified trim rule to load a custom dictionary for the compass
            trim_rule = self.internal_trimming_rule if self.compass != None or len(self.gvocab) != 0 else None
            model.build_vocab(sentences, trim_rule=trim_rule)
        if self.compass != None:
            model = self.initialize_from_compass(model)
        model.train(sentences, total_words=sum([len(s) for s in sentences]), epochs=model.iter, compute_loss=True)
        return model

In [5]:
import re
def create_vocabulary(compass_file:str, frequency:int, frequency_not:int):
    with open(compass_file, 'rt') as f:
        compass_corpus = map(lambda x: x.replace('\n', ''), f.readlines())
        compass_corpus = map(lambda x: x.split(' '), compass_corpus)
        compass_corpus = (item for l in compass_corpus for item in l)
        counter = collections.Counter(compass_corpus)
        same_as = [k for (k, v) in counter.items() if v >= frequency and re.match("^Q\d+$", k)]
        not_same_as = [j for (j, f) in counter.items() if f >= frequency_not and not re.match("^Q\d+$", j)]
        vocab = set(not_same_as + same_as)
        return list(vocab)

In [6]:
#train compass
# Vocabulary creation
compass_vocab = create_vocabulary('compass.txt', 5, 20)
aligner = CustomCADE(size=30, window=3, vocab=compass_vocab)
aligner.train_compass("compass.txt", overwrite=False)

Training the compass from scratch.


In [7]:
model1 = aligner.train_slice("wiki_walks_from_dbpedia.txt", save=True)
model2 = aligner.train_slice("wikidata_walks_final.txt", save=True)

Training embeddings: slice wiki_walks_from_dbpedia.txt.
Initializing embeddings from compass.
Training embeddings: slice wikidata_walks_final.txt.
Initializing embeddings from compass.


# not owl:sameAs filtro manuale


In [None]:
with open('wiki_walks_from_dbpedia.txt', 'rt') as f:
    compass_corpus = map(lambda x: x.replace('\n', ''), f.readlines())
    compass_corpus = list(map(lambda x: x.split(' '), compass_corpus))
    compass_entities = (item for l in compass_corpus for item in l)
    not_same_as = [j for j in compass_entities if not re.match("^Q\d+$", j)]
    counter_nsa = collections.Counter(not_same_as)

In [None]:
from pprint import pprint
keeped_entity = []
checked_items = list(enumerate([(k, v) for (k, v) in counter_nsa.items() if v >=20]))
for i, (entity, count) in checked_items:
  print(f"{i}/{len(checked_items)} - Most similar entities for {entity}:")
  pprint([label_dict[e] for (e, s) in model2.wv.similar_by_vector(model1.wv[entity], topn=20)])
  mantieni = input("Keep the entity? (S/N, default S) ")
  if mantieni.lower() == 's' or mantieni == '':
    keeped_entity.append(entity)
  clear_output(wait=True)

42/43 - Most similar entities for Category:Television_series_by_ITV_Studios:
['information system',
 'techopedia.com',
 'electrical connector',
 'push-button',
 'mouse button',
 'electronics',
 'plastic',
 'left mouse button',
 'taxonomic rank',
 'spatial arrangement',
 'Nintendo Entertainment System',
 'n-tuple',
 'Unix-like operating system',
 'machine',
 'computer case',
 'form',
 'film',
 'Motherboard',
 'Nintendo',
 'Philips']
Keep the entity? (S/N, default S)n


In [None]:
with open('keeped_not_same_as_entity.txt', 'wt') as f:
  f.writelines(map(lambda x: f"{x}\n", keeped_entity))

# Caricamento vocaboli già filtrati

In [14]:
!wget https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/keeped_not_same_as_entity.txt

--2022-02-18 18:03:51--  https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/keeped_not_same_as_entity.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 439 [text/plain]
Saving to: ‘keeped_not_same_as_entity.txt’


2022-02-18 18:03:51 (12.2 MB/s) - ‘keeped_not_same_as_entity.txt’ saved [439/439]



In [18]:
with open('keeped_not_same_as_entity.txt', 'rt') as f:
    nsa = map(lambda x: x.replace('\n', ''), f.readlines())
    keeped_entity = list(map(lambda x: x.split(' '), nsa))
    keeped_entity = (item for l in keeped_entity for item in l)

<generator object <genexpr> at 0x7f4af5877e50>

# Train dei modelli finali

In [19]:
compass_vocab = create_vocabulary('compass.txt', 5, float('inf'))
compass_vocab.extend(keeped_entity)
aligner = CustomCADE(size=30, window=3, vocab=compass_vocab)
aligner.train_compass("compass.txt", overwrite=False)
model1 = aligner.train_slice("wiki_walks_from_dbpedia.txt", save=True)
model2 = aligner.train_slice("wikidata_walks_final.txt", save=True)

Training the compass from scratch.
Training embeddings: slice wiki_walks_from_dbpedia.txt.
Initializing embeddings from compass.
Training embeddings: slice wikidata_walks_final.txt.
Initializing embeddings from compass.


# Valutazione dei match, ordinati per conteggio totale delle entità

In [None]:
with open('compass.txt', 'rt') as f:
    compass_corpus = map(lambda x: x.replace('\n', ''), f.readlines())
    compass_corpus = list(map(lambda x: x.split(' '), compass_corpus))
    compass_entities = [item for l in compass_corpus for item in l]
    counter = collections.Counter(compass_entities)

In [None]:
# Entity shared by model1 and model2 vocabulary, ordered by total count in compass.txt
counter_shared = [(k, v) for (k, v) in counter.most_common(len(counter.keys())) if k in model1.wv and k in model2.wv]

In [None]:
similarity_matches_m1_to_m2 = [
    (k, v) 
    for (k, v) 
    in counter_shared 
    if k in (
        e
        for (e, similarity)
        in model2.wv.similar_by_vector(model1[k], topn=5)
    )
]

similarity_matches_m1_to_m2

[('Q11168', 72), ('Q248', 10), ('Q388', 10), ('Q1384', 5)]

In [None]:
similarity_matches_m2_to_m1 = [
    (k, v) 
    for (k, v) 
    in counter_shared 
    if k in (
        e
        for (e, similarity)
        in model1.wv.similar_by_vector(model2[k], topn=5)
    )
]

similarity_matches_m2_to_m1

[('Q349', 927),
 ('Q11410', 129),
 ('Q782919', 122),
 ('Q5830907', 119),
 ('Q362', 112),
 ('Q1194970', 51),
 ('Q9135', 16),
 ('Q173799', 6),
 ('Q127856', 5)]

# Valutazione dei match, basata su distanza nel grafo

In [None]:
entities_around = []
# Iter on every shared entity
for e in tqdm([k for (k, v) in counter_shared]):
    # Extract sentences with entity e
    sentences = [sentence for sentence in compass_corpus if e in sentence]
    e_in_s = []
    # Iter over every sentence extracted
    for s in sentences:
        # Filter out entity from sentence if not in model1 and model2
        # sf = [e for e in s if e in model1.wv and e in model2.wv]
        sf = s
        # Find indices for entity e
        indices = [i for i, value in enumerate(sf) if value == e]  
        # Iter over every indices founded  
        for i in indices:
            left = sf[max(i-2, 0):i] # Extract 2 entity from left
            right = sf[i:min(i+2, len(s))] # Extract 2 entity from right
            e_in_s = e_in_s + [l for l in left if not l == e] + [r for r in right if not r == e] # Combine the found entities
    # Each item is a tuple (entity, counter of entity with distance 2)
    entities_around.append((e, collections.Counter(e_in_s)))

100%|██████████| 186/186 [00:01<00:00, 105.94it/s]


In [None]:
# Now we can compare this lists with similarities
# Example
from pprint import pprint
pprint(entities_around[30])
pprint(model1.wv.similar_by_vector(entities_around[30][0]))

('Q484876',
 Counter({'Q94933': 51,
          'Q735267': 50,
          'Category:Positions_of_authority': 50,
          'Q6609399': 2,
          'Q7414': 1,
          'Q106075980': 1,
          'Q2996165': 1,
          'Q1961128': 1,
          'Q1777832': 1,
          'Q1404417': 1,
          'Q167037': 1,
          'Q6196402': 1,
          'Q133080': 1,
          'Q865588': 1,
          'Q1255921': 1,
          'Q5156251': 1,
          'Q5467169': 1,
          'Q5829580': 1,
          'Q6270693': 1}))
[('Q735267', 0.9858945608139038),
 ('Q94933', 0.9853776097297668),
 ('Category:Bank_robbery_in_fiction', 0.9402717351913452),
 ('Q17452', 0.8712246417999268),
 ('Q1853722', 0.8618626594543457),
 ('Q43134', 0.8591406941413879),
 ('Q5', 0.8497217297554016),
 ('Q15149723', 0.8457735180854797),
 ('Q2608796', 0.8359093070030212),
 ('Q5324150', 0.8292506337165833)]


# Valutazione per distanza delle parole

In [8]:
!wget https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/models/wiki_walks_from_dbpedia.model
!wget https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/models/wikidata_walks_final.model

--2022-02-18 17:58:49--  https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/models/wiki_walks_from_dbpedia.model
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 311522 (304K) [application/octet-stream]
Saving to: ‘wiki_walks_from_dbpedia.model’


2022-02-18 17:58:49 (9.16 MB/s) - ‘wiki_walks_from_dbpedia.model’ saved [311522/311522]

--2022-02-18 17:58:49--  https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/models/wikidata_walks_final.model
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting respons

In [9]:
model1 = Word2Vec.load("wiki_walks_from_dbpedia.model")
model2 = Word2Vec.load("wikidata_walks_final.model")

In [57]:
db_list=[]
with open('wiki_walks_from_dbpedia.txt', 'rt') as f:
    dbpedia = map(lambda x: x.replace('\n', ''), f.readlines())
    dbpedia = list(map(lambda x: x.split(' '), dbpedia))
    for l in dbpedia: 
      for item in l:
        if not re.match("^Q\d+$", item) and item in keeped_entity:
          db_list.append(item)
        elif re.match("^Q\d+$", item):
          db_list.append(item)

In [73]:
wiki_list=[]
with open('wikidata_walks_final.txt', 'rt') as f:
    wiki = map(lambda x: x.replace('\n', ''), f.readlines())
    wiki = list(map(lambda x: x.split(' '), wiki))
    for l in wiki:
      for item in l:
        wiki_list.append(item)

In [94]:
neighbour_wiki = []
neighbour_db = []
for index in range(0, len(wiki_list)):
  if wiki_list[index] == "Q300920":
    for ind in [index-3, index-2, index-1, index+1, index+2, index+3]:
      neighbour_wiki.append(wiki_list[ind])

for index in range(0, len(db_list)):
  if db_list[index] == "Q300920":
    for ind in [index-3, index-2, index-1, index+1, index+2, index+3]:
      neighbour_db.append(db_list[ind])

counter_wiki = collections.Counter(neighbour_wiki)
counter_db = collections.Counter(neighbour_db)


In [95]:
wiki_words=[]
db_words=[]
for l in counter_wiki.most_common(10):
  wiki_words.append(l[0])

for l in counter_db.most_common(10):
  db_words.append(l[0])

similar = []
for db in db_words:
  for wiki in wiki_words:
    sim = 1 - cosine(model1[db],model2[wiki])
    similar.append([label_dict[db], label_dict[wiki], sim])

In [96]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.0.2-py3-none-any.whl (149 kB)
[?25l[K     |██▏                             | 10 kB 18.8 MB/s eta 0:00:01[K     |████▍                           | 20 kB 16.1 MB/s eta 0:00:01[K     |██████▋                         | 30 kB 7.8 MB/s eta 0:00:01[K     |████████▊                       | 40 kB 8.6 MB/s eta 0:00:01[K     |███████████                     | 51 kB 4.4 MB/s eta 0:00:01[K     |█████████████▏                  | 61 kB 5.0 MB/s eta 0:00:01[K     |███████████████▎                | 71 kB 5.5 MB/s eta 0:00:01[K     |█████████████████▌              | 81 kB 5.2 MB/s eta 0:00:01[K     |███████████████████▊            | 92 kB 5.8 MB/s eta 0:00:01[K     |█████████████████████▉          | 102 kB 4.9 MB/s eta 0:00:01[K     |████████████████████████        | 112 kB 4.9 MB/s eta 0:00:01[K     |██████████████████████████▎     | 122 kB 4.9 MB/s eta 0:00:01[K     |████████████████████████████▍   | 133 kB 4.9 MB/s eta 0:00:0

In [97]:
import xlsxwriter
similar.insert(0,["Dbpedia", "Wikidata", "Similarity"])

with xlsxwriter.Workbook('similar.xlsx') as workbook:
    worksheet = workbook.add_worksheet()

    for row_num, data in enumerate(similar):
        worksheet.write_row(row_num, 0, data)