In [1]:
import csv

import numpy as np

In [3]:
# Output directory: a `wiki` dataset folder inside the data folder of the DGL
# knowledge-graph-embedding app (https://github.com/dmlc/dgl/tree/master/apps/kg).
# NOTE(review): no cell in this notebook creates the folder, so it presumably must exist already.
to_save_dir = '../../dgl/apps/kg/data/wiki/'

In [4]:
# Directory holding the raw `wikidata_20190805.*.csv` input files read below.
raw_dir = '../data/raw/'

In [9]:
# Entity table settings: read the item CSV, keep columns 0 and 3 and write them
# tab-separated to `entities.dict`, skipping one leading line (presumably the header).
# The writer function below parses the first kept column as the integer item id.
entity_input_file = raw_dir + 'wikidata_20190805.item.csv'
entity_output_file = to_save_dir + 'entities.dict'
entity_keep_cols = [0, 3]
entity_text_format = "{}\t{}\n"
entity_skip_lines = 1
entity_dict = {}  # filled by the writer: original item id -> value of the second kept column

In [10]:
# Relation table settings: read the property CSV, keep columns 0 and 1 and write them
# tab-separated to `relations.dict`, skipping one leading line (presumably the header).
# The writer function below parses the first kept column as the integer property id.
edge_input_file = raw_dir + 'wikidata_20190805.property.csv'
edge_output_file = to_save_dir + 'relations.dict'
edge_keep_cols = [0, 1]
edge_text_format = "{}\t{}\n"
edge_skip_lines = 1 
edge_dict = {}  # filled by the writer: original property id -> value of the second kept column

In [12]:
def write_to_file_with_format_and_dict(input_file, output_file, keep_cols, text_format, skip_lines = 0, dictionary = None):
    """Re-index the rows of a CSV file and record an id -> value lookup.

    Every usable row of ``input_file`` is written to ``output_file`` as
    ``text_format.format(new_index, *remaining_kept_columns)``, where
    ``new_index`` counts the written rows starting at 0. The first kept
    column is parsed as an integer id and ``dictionary[id]`` is set to the
    value of the second kept column.

    Rows that are blank, too short to contain every column in ``keep_cols``,
    or that have an empty value in any kept column are skipped and do not
    consume an index.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to read.
    output_file : str
        Path of the text file to write (overwritten if it exists).
    keep_cols : sequence of int
        Non-negative column indices to keep; needs at least two entries, the
        first one being the integer id column.
    text_format : str
        Format string with one placeholder per entry of ``keep_cols``.
    skip_lines : int
        Number of leading CSV rows to discard (e.g. a header).
    dictionary : dict or None
        Updated in place with ``id -> second kept column``. A new dict is
        created when omitted.

    Returns
    -------
    dict
        The populated ``dictionary``.

    Raises
    ------
    ValueError
        If the id column of a usable row is not an integer.
    """
    if dictionary is None:
        dictionary = {}
    highest_col = max(keep_cols)

    next_index = 0
    # newline='' lets the csv module handle line endings inside quoted fields itself.
    with open(input_file, 'r', newline='') as file_inp, open(output_file, 'w') as file_out:
        reader = csv.reader(file_inp)
        for _ in range(skip_lines):
            # The default avoids StopIteration when the file has fewer rows than skip_lines.
            next(reader, None)
        for row in reader:
            # Blank lines come back as [] and truncated rows lack some kept columns.
            if len(row) <= highest_col:
                continue
            kept = [row[col] for col in keep_cols]
            # Validate before touching kept[0]: a missing id must skip the row, not crash.
            if not all(kept):
                continue
            original_key = int(kept[0])
            file_out.write(text_format.format(next_index, *kept[1:]))
            dictionary[original_key] = kept[1]
            next_index += 1
    return dictionary

In [13]:
# Write relations.dict and fill `edge_dict` in place
# (original property id -> value of the second kept column).
edge_mapping_dict = write_to_file_with_format_and_dict(
    input_file=edge_input_file,
    output_file=edge_output_file,
    keep_cols=edge_keep_cols,
    text_format=edge_text_format,
    skip_lines=edge_skip_lines,
    dictionary=edge_dict,
)

In [14]:
# Write entities.dict and fill `entity_dict` in place
# (original item id -> value of the second kept column).
entity_mapping_dict = write_to_file_with_format_and_dict(
    input_file=entity_input_file,
    output_file=entity_output_file,
    keep_cols=entity_keep_cols,
    text_format=entity_text_format,
    skip_lines=entity_skip_lines,
    dictionary=entity_dict,
)

In [15]:
# Number of distinct item ids kept from the item CSV.
len(entity_dict)

8059838

In [16]:
# Number of distinct property ids kept from the property CSV.
len(edge_dict)

6490

In [15]:
# Peek at the triplet CSV: one header line, then rows of integer ids.
!head ../data/wikipages_triplets.csv

source_item_id,edge_property_id,target_item_id,el_rank
1,793,323,1
1,793,837317,1
1,793,1208634,1
1,793,1079826,1
1,793,273508,1
1,793,1079806,1
1,793,3491753,1
1,31,36906466,1
1,910,5551050,1


In [17]:
def write_to_file_with_format_for_triplet(input_file, output_file, output_file2, output_file3, text_format,
                                          skip_lines = 0, triplet_head_idx = 0, triplet_tail_idx = 2,
                                          triplet_edge_idx = 1, entity_lookup = None, edge_lookup = None,
                                          seed = None):
    """Randomly split a CSV of (head, edge, tail) id triplets across three text files.

    A triplet is kept only when its head and tail ids are keys of the entity
    lookup and its edge id is a key of the edge lookup. For every kept
    triplet an integer is drawn uniformly from [0, 200): 0 sends the line to
    ``output_file2``, 1 sends it to ``output_file3`` and every other value
    sends it to ``output_file``. The written line is
    ``text_format.format(entity_value_of_head, edge_value, entity_value_of_tail)``.

    Parameters
    ----------
    input_file : str
        Path of the CSV file holding the triplets.
    output_file, output_file2, output_file3 : str
        Paths of the three split files (overwritten if they exist).
    text_format : str
        Format string with three placeholders.
    skip_lines : int
        Number of leading CSV rows to discard (e.g. a header).
    triplet_head_idx, triplet_tail_idx, triplet_edge_idx : int
        Column positions of the head, tail and edge ids.
    entity_lookup, edge_lookup : dict or None
        Id -> value lookups. When omitted, the notebook-level ``entity_dict``
        and ``edge_dict`` are used, which is what existing calls rely on.
    seed : int or None
        When given, the split is drawn from a private generator seeded with
        this value so it can be reproduced. When omitted, NumPy's global
        generator is used as before.

    Returns
    -------
    (dict, dict)
        ``final_ent_dict`` and ``final_rel_dict``: the entries of the entity
        and edge lookups that occur in at least one kept triplet.

    Raises
    ------
    ValueError
        If an id column of a non-blank row is not an integer.
    """
    # Fall back to the notebook-level dictionaries so existing calls keep working.
    if entity_lookup is None:
        entity_lookup = entity_dict
    if edge_lookup is None:
        edge_lookup = edge_dict
    # Unseeded calls keep drawing from NumPy's global generator, exactly as before.
    draw = np.random.randint if seed is None else np.random.RandomState(seed).randint

    final_ent_dict = {}
    final_rel_dict = {}
    # newline='' lets the csv module handle line endings inside quoted fields itself.
    with open(input_file, 'r', newline='') as file_inp, open(output_file, 'w') as file_out, \
            open(output_file2, 'w') as file_out2, open(output_file3, 'w') as file_out3:
        reader = csv.reader(file_inp)
        for _ in range(skip_lines):
            # The default avoids StopIteration when the file has fewer rows than skip_lines.
            next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to parse.
                continue
            triplet_head = int(row[triplet_head_idx])
            triplet_edge = int(row[triplet_edge_idx])
            triplet_tail = int(row[triplet_tail_idx])
            if (triplet_head not in entity_lookup or triplet_tail not in entity_lookup
                    or triplet_edge not in edge_lookup):
                continue

            bucket = draw(200)
            if bucket > 1:
                write_file_out = file_out
            elif bucket == 0:
                write_file_out = file_out2
            else:
                write_file_out = file_out3
            write_file_out.write(text_format.format(
                entity_lookup[triplet_head], edge_lookup[triplet_edge], entity_lookup[triplet_tail]))

            final_ent_dict[triplet_head] = entity_lookup[triplet_head]
            final_ent_dict[triplet_tail] = entity_lookup[triplet_tail]
            final_rel_dict[triplet_edge] = edge_lookup[triplet_edge]
    return final_ent_dict, final_rel_dict

In [18]:
# Triplet source file and the three split files written into the DGL data folder.
# Alternative input kept for reference: raw_dir + 'wikidata_20190805.qpq_item_statements.csv'
triplet_input_file = '../data/wikipages_triplets.csv'
triplet_output_file = to_save_dir + 'train.txt'
triplet_output_file_2 = to_save_dir + 'valid.txt'
triplet_output_file_3 = to_save_dir + 'test.txt'
triplet_text_format = "{}\t{}\t{}\n"
triplet_skip_lines = 1

In [19]:
# Split the triplets into the train/valid/test files; the two returned dicts hold only
# the entities and relations that occur in at least one kept triplet.
final_ent_dict, final_rel_dict = write_to_file_with_format_for_triplet(
    input_file=triplet_input_file,
    output_file=triplet_output_file,
    output_file2=triplet_output_file_2,
    output_file3=triplet_output_file_3,
    text_format=triplet_text_format,
    skip_lines=triplet_skip_lines,
)

In [20]:
# Largest original entity id among the entities that occur in a kept triplet.
max(final_ent_dict)

66126524

In [21]:
# Number of distinct entities that occur in at least one kept triplet.
len(final_ent_dict)

5850119

In [22]:
# Overwrite relations.dict with only the relations that survived the triplet filter,
# numbered consecutively from 0; final_rel_mapping maps that new index back to the original id.
final_rel_mapping = dict(enumerate(final_rel_dict))
with open(edge_output_file, 'w') as file_out:
    for i, key in final_rel_mapping.items():
        val = final_rel_dict[key]
        file_out.write("{}\t{}\n".format(i, val))

In [23]:
# Overwrite entities.dict with only the entities that survived the triplet filter,
# numbered consecutively from 0; final_ent_mapping maps that new index back to the original id.
final_ent_mapping = dict(enumerate(final_ent_dict))
with open(entity_output_file, 'w') as file_out:
    for i, key in final_ent_mapping.items():
        val = final_ent_dict[key]
        file_out.write("{}\t{}\n".format(i, val))

In [28]:
# Show the new index -> original entity id mapping.
# NOTE(review): this displays the whole dict, which has one entry per kept entity;
# consider showing only a small slice instead.
final_ent_mapping

{0: 1,
 1: 323,
 2: 837317,
 3: 1208634,
 4: 1079826,
 5: 273508,
 6: 1079806,
 7: 3491753,
 8: 5551050,
 9: 133327,
 10: 221392,
 11: 6999,
 12: 18343,
 13: 79925,
 14: 2504088,
 15: 185674,
 16: 5457948,
 17: 136407,
 18: 11412,
 19: 1133705,
 20: 497745,
 21: 1139177,
 22: 7439451,
 23: 15241043,
 24: 23054687,
 25: 17863945,
 26: 602358,
 27: 2708714,
 28: 338,
 29: 3695190,
 30: 100,
 31: 8307877,
 32: 30,
 33: 2079909,
 34: 54072,
 35: 5456707,
 36: 4947808,
 37: 34600,
 38: 6602,
 39: 1492,
 40: 4970,
 41: 617,
 42: 3141,
 43: 1867,
 44: 243293,
 45: 41621,
 46: 1524,
 47: 34820,
 48: 311975,
 49: 7115475,
 50: 941023,
 51: 771,
 52: 720766,
 53: 15126384,
 54: 941404,
 55: 794927,
 56: 1332459,
 57: 1758000,
 58: 894564,
 59: 49143,
 60: 49142,
 61: 49111,
 62: 49199,
 63: 54170,
 64: 54154,
 65: 54156,
 66: 54151,
 67: 54172,
 68: 49196,
 69: 1758615,
 70: 739428,
 71: 990957,
 72: 1664345,
 73: 1190137,
 74: 48843524,
 75: 15076397,
 76: 1837807,
 77: 27963431,
 78: 902346,
 

In [24]:
import pickle

In [25]:
# Destination folder for the pickled knowledge-graph lookup tables written by the cells below.
folder = '../../AC297r_2019_Kensho/data/knowledge_graph_data/'

In [29]:
def write_dict_to_pickle(file, dictionary):
    """Pickle ``dictionary`` into the file at path ``file``.

    The file is opened in binary write mode, so an existing file is
    overwritten. The highest pickle protocol available is used.
    """
    with open(file, 'wb') as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(dictionary)

In [36]:
# Relation lookup: index used in relations.dict -> original property id.
file_rel = folder + 'idx2id_edge.pickle'
write_dict_to_pickle(file_rel, final_rel_mapping)

In [37]:
# Entity lookup: index used in entities.dict -> original item id.
file_ent = folder + 'idx2id_entity.pickle'
write_dict_to_pickle(file_ent, final_ent_mapping)

In [38]:
# Relation lookup: original property id -> text value kept from the property CSV.
file_rel_text = folder + 'id2text_edge.pickle'
write_dict_to_pickle(file_rel_text, final_rel_dict)

In [39]:
# Entity lookup: original item id -> text value kept from the item CSV.
file_ent_text = folder + 'id2text_entity.pickle'
write_dict_to_pickle(file_ent_text, final_ent_dict)

In [None]:
# Inverse of final_ent_mapping: original item id -> index used in entities.dict.
file_id2idx = folder + 'id2idx_entity.pickle'
id2idx = {value: key for key, value in final_ent_mapping.items()}

# Go through the shared helper so this file uses the same pickle protocol
# as the other lookup tables written above.
write_dict_to_pickle(file_id2idx, id2idx)

Now the files are all ready to start training the knowledge graph embedding.

Inside the `dgl/apps/kg` folder of the DGL repository (https://github.com/dmlc/dgl/tree/master/apps/kg)

Please execute the following lines to start training:

```
export DGLBACKEND="pytorch"

python train.py --model DistMult --dataset wiki --batch_size 1024 --neg_sample_size 256 --hidden_dim 2000 --gamma 500.0 --lr 0.1 --max_step 100000 --batch_size_eval 16 --gpu 0 -adv --save_emb emb > log.txt &

```
You may want to try out other parameters by changing `--model`, `--batch_size` and `--max_step`

After the embeddings are trained, please move the embedding files from the `emb` folder to `data/knowledge_graph_data`.