In [47]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm

In [48]:
wiki_es_2017 = pd.read_csv("enwiki.wikilink_graph.2017-03-01.csv/wikilink_graph.2017-03-01.csv", on_bad_lines = 'warn',delimiter="\t")

In [49]:
wiki_es_2017.shape

(150743638, 4)

In [50]:
import pickle
# Loading the seed 
with open('subsample_nodes.pkl', 'rb') as f:
    seed_nodes = pickle.load(f)

In [51]:
seed_nodes = set(map(eval, seed_nodes))

In [52]:
len(seed_nodes)

1000000

In [53]:
# Sampling all the edges that are present in our random sample of nodes
edges = wiki_es_2017[(wiki_es_2017['page_id_from'].isin(seed_nodes)) & (wiki_es_2017['page_id_to'].isin(seed_nodes))]

In [54]:
edges.shape

(13595337, 4)

In [55]:
edges.head()

Unnamed: 0,page_id_from,page_title_from,page_id_to,page_title_to
3,12,Anarchism,28357259,19th century philosophy
7,12,Anarchism,839656,Adolf Brand
8,12,Anarchism,2731583,Adolf Hitler
9,12,Anarchism,192008,Adolphe Thiers
11,12,Anarchism,30758,Age of Enlightenment


In [56]:
pageid_to_pagetitle = dict()
for page_id, page_title in zip(edges['page_id_from'], edges['page_title_from']):
    pageid_to_pagetitle[page_id] = page_title

In [57]:
for page_id, page_title in zip(edges['page_id_to'], edges['page_title_to']):
    pageid_to_pagetitle[page_id] = page_title

In [58]:
len(pageid_to_pagetitle)

1000000

In [59]:
pageid_to_pagetitle

{12: 'Anarchism',
 25: 'Autism',
 39: 'Albedo',
 290: 'A',
 303: 'Alabama',
 307: 'Abraham Lincoln',
 308: 'Aristotle',
 309: 'An American in Paris',
 316: 'Academy Award for Best Production Design',
 324: 'Academy Awards',
 336: 'Altruism',
 340: 'Alain Connes',
 344: 'Allan Dwan',
 358: 'Algeria',
 573: 'Alchemy',
 579: 'Alien',
 580: 'Astronomer',
 586: 'ASCII',
 590: 'Austin (disambiguation)',
 593: 'Animation',
 594: 'Apollo',
 599: 'Afroasiatic languages',
 600: 'Andorra',
 615: 'American Football Conference',
 620: 'Animal Farm',
 621: 'Amphibian',
 624: 'Alaska',
 628: 'Aldous Huxley',
 630: 'Ada',
 633: 'Algae',
 640: 'Appellate procedure in the United States',
 649: 'Arraignment',
 651: 'America the Beautiful',
 656: 'Acid',
 659: 'American National Standards Institute',
 666: 'Alkali metal',
 668: 'Argument form',
 670: 'Alphabet',
 673: 'Atomic number',
 676: 'Andrei Tarkovsky',
 677: 'Ambiguity',
 683: 'Adventure',
 689: 'Asia',
 690: 'Aruba',
 691: 'Articles of Confederat

In [60]:
pageid_to_nodeid = {k:idx for idx,k in enumerate(pageid_to_pagetitle.keys())}

In [68]:
nodeid_to_pageid = {v:k for k,v in pageid_to_nodeid.items()}

In [61]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
embeddings = model.encode(list(pageid_to_pagetitle.values()))

In [62]:
x = torch.tensor(embeddings)

In [63]:
x.size()

torch.Size([1000000, 384])

In [64]:
id_from = [pageid_to_nodeid[i] for i in edges['page_id_from']] 
id_to = [pageid_to_nodeid[i] for i in edges['page_id_to']] 

In [65]:
edge_index = torch.tensor([id_from,id_to])

In [66]:
edge_index

tensor([[     0,      0,      0,  ..., 926146, 926146, 926146],
        [619578,  83144, 174547,  ..., 159732,   5859,  32182]])

In [67]:
pageid_to_nodeid[28357259]

619578

In [69]:
titles = [[pageid_to_pagetitle[nodeid_to_pageid[i]]] for i in range(1_000_000)]

In [73]:
titles

[['Anarchism'],
 ['Autism'],
 ['Albedo'],
 ['A'],
 ['Alabama'],
 ['Abraham Lincoln'],
 ['Aristotle'],
 ['An American in Paris'],
 ['Academy Award for Best Production Design'],
 ['Academy Awards'],
 ['Altruism'],
 ['Alain Connes'],
 ['Allan Dwan'],
 ['Algeria'],
 ['Alchemy'],
 ['Alien'],
 ['Astronomer'],
 ['ASCII'],
 ['Austin (disambiguation)'],
 ['Animation'],
 ['Apollo'],
 ['Afroasiatic languages'],
 ['Andorra'],
 ['American Football Conference'],
 ['Animal Farm'],
 ['Amphibian'],
 ['Alaska'],
 ['Aldous Huxley'],
 ['Ada'],
 ['Algae'],
 ['Appellate procedure in the United States'],
 ['Arraignment'],
 ['America the Beautiful'],
 ['Acid'],
 ['American National Standards Institute'],
 ['Alkali metal'],
 ['Argument form'],
 ['Alphabet'],
 ['Atomic number'],
 ['Andrei Tarkovsky'],
 ['Ambiguity'],
 ['Adventure'],
 ['Asia'],
 ['Aruba'],
 ['Articles of Confederation'],
 ['Atlantic Ocean'],
 ['Arthur Schopenhauer'],
 ['Angola'],
 ['Demographics of Angola'],
 ['Android (robot)'],
 ['Albert Einst

In [74]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=edge_index, titles=titles)

In [75]:
file_path = "Wiki_en_small_MHRWS_1M_wtitles.pt"
torch.save(data, file_path)