In [40]:
import json
import pickle

## Import data map:

In [41]:
# Directory that holds the FB15k-237 split files and the entity metadata file.
path = "../wikidata_tensors/datasets_knowledge_embedding/FB15k-237/"

# Load the mapping from entity id to its metadata record.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(path + 'entity2wikidata.json', encoding='utf-8') as json_file:
    data_map = json.load(json_file)

In [42]:
# Show one record to see which fields are stored for an entity id.
data_map['/m/010016']

{'alternatives': ['Denton, Texas'],
 'description': 'city in Texas, United States',
 'label': 'Denton',
 'wikidata_id': 'Q128306',
 'wikipedia': 'https://en.wikipedia.org/wiki/Denton,_Texas'}

In [43]:
# Number of entity ids present in the metadata map.
len(data_map)

14515

## Read train, valid, test data:

In [44]:
def file_process(fname, list_of_triples, set_of_entity, set_of_rel, json_map):
    """
    Read (subject, relation, object) lines from ``fname`` and collect them by label.

    Each non-blank line is split on whitespace into three fields. Subject and
    object ids are translated to ``json_map[id]['label']``; entities are therefore
    identified by label, so distinct ids that share a label become the same name.
    A triple is skipped entirely when either id (or its 'label' field) is
    missing from ``json_map``; nothing from a skipped triple is recorded.

    Args:
        fname: path of the text file to read.
        list_of_triples: list extended in place with (sub_label, rel, obj_label).
        set_of_entity: set updated in place with the labels of kept triples.
        set_of_rel: set updated in place with the relations of kept triples.
        json_map: dict mapping an entity id to a dict that has a 'label' key.

    Returns:
        The number of lines read from the file, including skipped and blank
        ones (0 for an empty file).

    Raises:
        ValueError: if a non-blank line does not have exactly three fields.
    """
    n_lines = 0  # stays 0 when the file is empty

    with open(fname) as f:
        for n_lines, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line, e.g. an extra newline at the end of the file.
                continue
            sub, rel, obj = fields

            # Resolve both labels before touching any output container, so a
            # triple with one unknown entity leaves no trace in set_of_entity.
            try:
                sub_label = json_map[sub]['label']
                obj_label = json_map[obj]['label']
            except KeyError:
                # Some subjects/objects are not in json_map; skip those triples.
                continue

            set_of_entity.add(sub_label)
            set_of_entity.add(obj_label)
            set_of_rel.add(rel)
            list_of_triples.append((sub_label, rel, obj_label))

    return n_lines

### Train:

In [45]:
# Containers that file_process fills in place for the train split.
list_of_triples_train, set_of_entity_train, set_of_rel_train = [], set(), set()

file_process(
    path + 'train.txt',
    list_of_triples_train,
    set_of_entity_train,
    set_of_rel_train,
    data_map,
)
# Sizes: kept triples, distinct entity names, distinct relations.
print(len(list_of_triples_train), len(set_of_entity_train), len(set_of_rel_train))

267189 14206 235


### Valid:

In [46]:
# Containers that file_process fills in place for the valid split.
list_of_triples_valid, set_of_entity_valid, set_of_rel_valid = [], set(), set()

file_process(
    path + 'valid.txt',
    list_of_triples_valid,
    set_of_entity_valid,
    set_of_rel_valid,
    data_map,
)
# Sizes: kept triples, distinct entity names, distinct relations.
print(len(list_of_triples_valid), len(set_of_entity_valid), len(set_of_rel_valid))

17087 9689 221


### Test:

In [47]:
# Containers that file_process fills in place for the test split.
list_of_triples_test, set_of_entity_test, set_of_rel_test = [], set(), set()

file_process(
    path + 'test.txt',
    list_of_triples_test,
    set_of_entity_test,
    set_of_rel_test,
    data_map,
)
# Sizes: kept triples, distinct entity names, distinct relations.
print(len(list_of_triples_test), len(set_of_entity_test), len(set_of_rel_test))

19929 10215 222


## Check entity and relation sets:

### Remove duplicate triples within each split:

#### Train:

In [48]:
# Collapse duplicate train triples by turning the list into a set.
train = set(list_of_triples_train)

len(train)

266655

#### Valid:

In [49]:
# Collapse duplicate valid triples by turning the list into a set.
valid = set(list_of_triples_valid)

len(valid)

17080

#### Test:

In [50]:
# Collapse duplicate test triples by turning the list into a set.
test = set(list_of_triples_test)

len(test)

19922

### Remove triples shared between train/valid/test:

#### Train/Valid:

In [51]:
# How many triples occur in both train and valid?
len(train & valid)

83

In [52]:
# Drop from train every triple that also occurs in valid (builds a new set).
train = train - valid
len(train)

266572

In [53]:
# Sanity check: train and valid should no longer share any triple.
len(train & valid)

0

#### Train/Test:

In [54]:
# How many triples occur in both train and test?
len(train & test)

87

In [55]:
# Drop from train every triple that also occurs in test (builds a new set).
train = train - test
len(train)

266485

In [56]:
# Sanity check: train and test should no longer share any triple.
len(train & test)

0

#### Valid/Test:

In [57]:
# How many triples occur in both valid and test?
len(valid & test)

9

In [58]:
# Drop from valid every triple that also occurs in test (builds a new set).
valid = valid - test
len(valid)

17071

In [59]:
# Sanity check: valid and test should no longer share any triple.
len(valid & test)

0

#### Is any triple shared by all three splits?

In [60]:
# Triples present in train, valid and test at the same time.
len(train & valid & test)

0

#### Make lists of train/valid/test:

In [61]:
# Back to a list; element order is whatever the set iteration yields (not sorted).
list_of_triples_train = [*train]
len(list_of_triples_train)

266485

In [62]:
# Back to a list; element order is whatever the set iteration yields (not sorted).
list_of_triples_valid = [*valid]
len(list_of_triples_valid)

17071

In [63]:
# Back to a list; element order is whatever the set iteration yields (not sorted).
list_of_triples_test = [*test]
len(list_of_triples_test)

19922

### Remove "cold" triples, i.e. those whose entities or relations do not appear in train:

#### train/test:

In [64]:
# Entity names collected from the test file that were not collected from the train file.
diff_ent_test_train = set_of_entity_test - set_of_entity_train
diff_ent_test_train

{'2008 Tour de France',
 '2009 Tour de France',
 'Australian Greens',
 'Australian Labor Party',
 'James E. Sullivan Award',
 'Kosi Zone',
 'Lasker-DeBakey Clinical Medical Research Award',
 'Nazism',
 'Northern Dancer',
 'Ocean Software',
 'Ontario New Democratic Party',
 'Padma Vibhushan',
 'Pakistan Peoples Party',
 'Pulitzer Prize for Music',
 'South Island',
 'Spinefarm Records',
 'Tour de France',
 'United National Party',
 'Vice President of the United States',
 'association football manager',
 'asteroid',
 'commissioner',
 'curator',
 'geometry',
 'infectious disease',
 'natural history',
 'organization',
 'productivity'}

In [65]:
# Relations collected from the test file that were not collected from the train file.
diff_rel_test_train = set_of_rel_test - set_of_rel_train
diff_rel_test_train

set()

In [66]:
# Keep a test triple only if its subject, relation and object are all absent
# from the "not collected from train" sets (single pass over the list).
list_of_triples_test_updated = [
    triple
    for triple in list_of_triples_test
    if triple[0] not in diff_ent_test_train
    and triple[1] not in diff_rel_test_train
    and triple[2] not in diff_ent_test_train
]
len(list_of_triples_test_updated)

19902

#### train/valid:

In [67]:
# Entity names collected from the valid file that were not collected from the train file.
diff_ent_valid_train = set_of_entity_valid - set_of_entity_train
diff_ent_valid_train

{'Daffy Duck',
 'Governor of Massachusetts',
 'New York State Senate',
 'SkyTeam',
 'South Island',
 'mercenary',
 'museum',
 'planet'}

In [68]:
# Relations collected from the valid file that were not collected from the train file.
diff_rel_valid_train = set_of_rel_valid - set_of_rel_train
diff_rel_valid_train

set()

In [69]:
# Keep a valid triple only if its subject, relation and object are all absent
# from the "not collected from train" sets (single pass over the list).
list_of_triples_valid_updated = [
    triple
    for triple in list_of_triples_valid
    if triple[0] not in diff_ent_valid_train
    and triple[1] not in diff_rel_valid_train
    and triple[2] not in diff_ent_valid_train
]
len(list_of_triples_valid_updated)

17065

## Let's look at the data:

In [70]:
# Number of distinct entity names collected while reading the train file.
len(set_of_entity_train)

14206

In [71]:
# Number of distinct relations collected while reading the train file.
len(set_of_rel_train)

235

## Create mapping from names to numerical identifiers:

In [72]:
# Fix one order for the train entities; the position in this list is the
# entity's numerical identifier. Sorting makes the identifiers reproducible
# across runs (the iteration order of a set of strings changes with hash
# randomization). key=str keeps the sort well-defined even if a label is
# not a string.
ind_to_entity = sorted(set_of_entity_train, key=str)

# Inverse lookup built from the same list, so both structures always agree:
# entity name -> numerical identifier.
entity_to_ind = {elem: ind for ind, elem in enumerate(ind_to_entity)}

In [73]:
# Fix one order for the train relations; the position in this list is the
# relation's numerical identifier. Sorting makes the identifiers reproducible
# across runs (the iteration order of a set of strings changes with hash
# randomization).
ind_to_relation = sorted(set_of_rel_train)

# Inverse lookup built from the same list, so both structures always agree:
# relation name -> numerical identifier.
rel_to_ind = {elem: ind for ind, elem in enumerate(ind_to_relation)}

In [74]:
def create_list_of_inds(list_of_triples, rel_to_ind, entity_to_ind):
    """
    Translate name triples into index triples.

    Positions 0 and 2 of every triple are looked up in ``entity_to_ind`` and
    position 1 in ``rel_to_ind``. A name missing from its map raises KeyError.
    Returns a new list of (subject_ind, relation_ind, object_ind) tuples in
    the same order as the input.
    """
    answer_list = []
    for triple in list_of_triples:
        sub_ind = entity_to_ind[triple[0]]
        rel_ind = rel_to_ind[triple[1]]
        obj_ind = entity_to_ind[triple[2]]
        answer_list.append((sub_ind, rel_ind, obj_ind))
    return answer_list

### Train triples:

In [75]:
# Train triples expressed as (entity index, relation index, entity index).
inds_list_train = create_list_of_inds(
    list_of_triples=list_of_triples_train,
    rel_to_ind=rel_to_ind,
    entity_to_ind=entity_to_ind,
)
len(inds_list_train)

266485

### Valid triples:

In [76]:
# Filtered valid triples expressed as (entity index, relation index, entity index).
inds_list_valid = create_list_of_inds(
    list_of_triples=list_of_triples_valid_updated,
    rel_to_ind=rel_to_ind,
    entity_to_ind=entity_to_ind,
)
len(inds_list_valid)

17065

### Test triples:

In [77]:
# Filtered test triples expressed as (entity index, relation index, entity index).
inds_list_test = create_list_of_inds(
    list_of_triples=list_of_triples_test_updated,
    rel_to_ind=rel_to_ind,
    entity_to_ind=entity_to_ind,
)
len(inds_list_test)

19902

### Save data triples:

In [78]:
# Output directory for the pickled artifacts.
# NOTE: this rebinds `path`, which the earlier cells use as the input
# directory; re-running those cells after this one would look for the raw
# files here instead.
path = "Link_Prediction_Data/FB15K237/"

# (object to save, file name) pairs: index triples per split, the
# index -> name lists, and the name -> index maps.
artifacts_to_save = [
    (inds_list_train, 'train_triples'),
    (inds_list_valid, 'valid_triples'),
    (inds_list_test, 'test_triples'),
    (ind_to_entity, 'entity_list'),
    (ind_to_relation, 'relation_list'),
    (entity_to_ind, 'entity_map'),
    (rel_to_ind, 'relation_map'),
]

for payload, file_name in artifacts_to_save:
    # The context manager closes (and thereby flushes) each file right after
    # it is written, instead of leaving the handle open.
    with open(path + file_name, 'wb') as out_file:
        pickle.dump(payload, out_file)