In [1]:
import json
import pickle

## Import data map:

In [2]:
# Directory that holds entity2wikidata.json and the train/valid/test triple files.
path = "../wikidata_tensors/datasets_knowledge_embedding/FB15K/"

# JSON text is UTF-8; pass the encoding explicitly so non-ASCII labels decode the
# same way on every platform instead of depending on the locale default.
with open(path + 'entity2wikidata.json', encoding='utf-8') as json_file:
    data_map = json.load(json_file)

In [3]:
# Look at one record: each entity id maps to a dict that carries, among other
# fields, the human-readable 'label' used in the rest of the notebook.
data_map['/m/010016']

{'alternatives': ['Denton, Texas'],
 'description': 'city in Texas, United States',
 'label': 'Denton',
 'wikidata_id': 'Q128306',
 'wikipedia': 'https://en.wikipedia.org/wiki/Denton,_Texas'}

In [4]:
# Number of entity ids that have a record in the map.
len(data_map)

14515

## Read train, valid, test data:

In [5]:
def file_process(fname, list_of_triples, set_of_entity, set_of_rel, json_map):
    """Read a triple file and collect label-based triples, entities and relations.

    Every non-blank line of ``fname`` is split on whitespace into
    ``(sub, rel, obj)``. ``sub`` and ``obj`` are replaced by
    ``json_map[<id>]['label']``; the relation string is kept unchanged.
    The three output containers are updated in place.

    A triple is skipped as a whole when either of its entities has no label in
    ``json_map``: nothing is added to any container for that line.

    NOTE(review): from here on entities are identified by their label, so two
    ids that share the same label become one entity -- confirm this is intended.

    Args:
        fname: Path of the text file, one ``sub rel obj`` triple per line.
        list_of_triples: List that receives ``(sub_label, rel, obj_label)`` tuples.
        set_of_entity: Set that receives the labels of kept triples.
        set_of_rel: Set that receives the relations of kept triples.
        json_map: Mapping ``entity id -> record`` where the record has a ``'label'`` key.

    Returns:
        int: Number of lines read from the file (skipped lines included);
        0 for an empty file.

    Raises:
        ValueError: If a non-blank line does not contain exactly three fields.
    """
    line_count = 0  # stays 0 when the file has no lines at all

    with open(fname, encoding='utf-8') as f:
        for line_count, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            sub, rel, obj = fields

            # Resolve BOTH labels before touching any container, so a triple that
            # is skipped does not leave a half-recorded entity behind.
            try:
                sub_label = json_map[sub]['label']
                obj_label = json_map[obj]['label']
            except KeyError:
                # Some ids are missing from json_map; such triples are dropped.
                continue

            set_of_entity.add(sub_label)
            set_of_entity.add(obj_label)
            set_of_rel.add(rel)
            list_of_triples.append((sub_label, rel, obj_label))

    return line_count

### Train:

In [6]:
# Start from empty containers so that re-running the cell does not accumulate
# entries from a previous execution.
list_of_triples_train, set_of_entity_train, set_of_rel_train = [], set(), set()

file_process(
    fname=path + 'train.txt',
    list_of_triples=list_of_triples_train,
    set_of_entity=set_of_entity_train,
    set_of_rel=set_of_rel_train,
    json_map=data_map,
)
# Sizes: kept triples, distinct entity labels, distinct relations.
print(*(len(container) for container in (
    list_of_triples_train, set_of_entity_train, set_of_rel_train,
)))

460984 14240 1172


### Valid:

In [7]:
# Start from empty containers so that re-running the cell does not accumulate
# entries from a previous execution.
list_of_triples_valid, set_of_entity_valid, set_of_rel_valid = [], set(), set()

file_process(
    fname=path + 'valid.txt',
    list_of_triples=list_of_triples_valid,
    set_of_entity=set_of_entity_valid,
    set_of_rel=set_of_rel_valid,
    json_map=data_map,
)
# Sizes: kept triples, distinct entity labels, distinct relations.
print(*(len(container) for container in (
    list_of_triples_valid, set_of_entity_valid, set_of_rel_valid,
)))

47837 12831 828


### Test:

In [8]:
# Start from empty containers so that re-running the cell does not accumulate
# entries from a previous execution.
list_of_triples_test, set_of_entity_test, set_of_rel_test = [], set(), set()

file_process(
    fname=path + 'test.txt',
    list_of_triples=list_of_triples_test,
    set_of_entity=set_of_entity_test,
    set_of_rel=set_of_rel_test,
    json_map=data_map,
)
# Sizes: kept triples, distinct entity labels, distinct relations.
print(*(len(container) for container in (
    list_of_triples_test, set_of_entity_test, set_of_rel_test,
)))

56396 13090 850


## Check entity and relation sets:

### Remove duplicate triples within each split:

#### Train:

In [9]:
# A set keeps one copy of every (subject, relation, object) tuple.
train = set(list_of_triples_train)

len(train)

459708

#### Valid:

In [10]:
# A set keeps one copy of every (subject, relation, object) tuple.
valid = set(list_of_triples_valid)

len(valid)

47823

#### Test:

In [11]:
# A set keeps one copy of every (subject, relation, object) tuple.
test = set(list_of_triples_test)

len(test)

56370

### Remove triples shared between the train/valid/test splits:

#### Train/Valid:

In [12]:
# How many triples occur in both train and valid?
len(train & valid)

245

In [13]:
# Drop from train every triple that is also present in valid.
train = train - valid
len(train)

459463

In [14]:
# Sanity check: train and valid must no longer share any triple.
len(train & valid)

0

#### Train/Test:

In [15]:
# How many triples occur in both train and test?
len(train & test)

254

In [16]:
# Drop from train every triple that is also present in test.
train = train - test
len(train)

459209

In [17]:
# Sanity check: train and test must no longer share any triple.
len(train & test)

0

#### Valid/Test:

In [18]:
# How many triples occur in both valid and test?
len(valid & test)

32

In [19]:
# Drop from valid every triple that is also present in test.
valid = valid - test
len(valid)

47791

In [20]:
# Sanity check: valid and test must no longer share any triple.
len(valid & test)

0

#### Intersection of all three splits:

In [21]:
# Triples common to all three splits at once.
len(train & valid & test)

0

#### Make lists of train/valid/test:

In [22]:
# Replace the raw train list by the de-duplicated set, as a list again.
list_of_triples_train = [*train]
len(list_of_triples_train)

459209

In [23]:
# Replace the raw valid list by the de-duplicated set, as a list again.
list_of_triples_valid = [*valid]
len(list_of_triples_valid)

47791

In [24]:
# Replace the raw test list by the de-duplicated set, as a list again.
list_of_triples_test = [*test]
len(list_of_triples_test)

56370

### Remove "cold" triples (entities or relations that never occur in train):

#### train/test:

In [25]:
# Entity labels collected from the test file that were not collected from the train file.
# NOTE(review): set_of_entity_train was filled while reading train.txt, i.e. before the
# triples shared with valid/test were removed from train -- an entity whose only train
# triples were removed would still count as seen here. Confirm whether that matters.
diff_ent_test_train = set_of_entity_test - set_of_entity_train
diff_ent_test_train

{'Lasker-DeBakey Clinical Medical Research Award'}

In [26]:
# Relations collected from the test file that were not collected from the train file.
diff_rel_test_train = set_of_rel_test - set_of_rel_train
diff_rel_test_train

{'/architecture/type_of_museum/museums',
 '/base/patronage/patron/related_client./base/patronage/patron_client_relationship/client',
 '/base/thoroughbredracing/thoroughbred_racehorse/color',
 '/biology/organism/organism_type',
 '/biology/organism_classification/organisms_of_this_type',
 '/comic_books/comic_book_character/regular_featured_appearances',
 '/film/film_character/portrayed_in_films./film/performance/special_performance_type',
 '/film/film_character/portrayed_in_films_dubbed./film/dubbing_performance/film',
 '/royalty/royal_line/monarchs_from_this_line'}

In [27]:
# Keep a test triple only if its subject and object are not in diff_ent_test_train
# and its relation is not in diff_rel_test_train. Done in a single pass, without
# intermediate lists; the original order of the kept triples is preserved.
list_of_triples_test_updated = [
    triple
    for triple in list_of_triples_test
    if triple[0] not in diff_ent_test_train
    and triple[1] not in diff_rel_test_train
    and triple[2] not in diff_ent_test_train
]
len(list_of_triples_test_updated)

56361

#### train/valid:

In [28]:
# Entity labels collected from the valid file that were not collected from the train file.
diff_ent_valid_train = set_of_entity_valid - set_of_entity_train
diff_ent_valid_train

set()

In [29]:
# Relations collected from the valid file that were not collected from the train file.
diff_rel_valid_train = set_of_rel_valid - set_of_rel_train
diff_rel_valid_train

{'/fictional_universe/fictional_job_title/fictional_characters_with_this_job./fictional_universe/fictional_employment_tenure/employee',
 '/royalty/royal_line/monarchs_from_this_line'}

In [30]:
# Keep a valid triple only if its subject and object are not in diff_ent_valid_train
# and its relation is not in diff_rel_valid_train. Done in a single pass, without
# intermediate lists; the original order of the kept triples is preserved.
list_of_triples_valid_updated = [
    triple
    for triple in list_of_triples_valid
    if triple[0] not in diff_ent_valid_train
    and triple[1] not in diff_rel_valid_train
    and triple[2] not in diff_ent_valid_train
]
len(list_of_triples_valid_updated)

47788

## Let's look at the data:

In [31]:
# First two train triples, as (subject label, relation, object label).
list_of_triples_train[:2]

[('marriage',
  '/people/marriage_union_type/unions_of_this_type./people/marriage/spouse',
  'James Spader'),
 ('National Heads-Up Poker Championship', '/tv/tv_program/genre', 'sport')]

In [32]:
# First two valid triples that survived the cold-triple filter.
list_of_triples_valid_updated[:2]

[('keyboard instrument',
  '/music/performance_role/regular_performances./music/group_membership/group',
  'Grateful Dead'),
 ('David Gilmour', '/music/group_member/instruments_played', 'bass guitar')]

In [33]:
# First two test triples that survived the cold-triple filter.
list_of_triples_test_updated[:2]

[('Chazz Palminteri',
  '/award/award_winner/awards_won./award/award_honor/award_winner',
  'Channing Tatum'),
 ('Ursula K. Le Guin',
  '/people/person/places_lived./people/place_lived/location',
  'Berkeley')]

In [34]:
# Three entity labels; a set has no defined order, so which three show up is arbitrary.
list(set_of_entity_train)[:3]

['Northridge', 'Croatian American', 'Bob Clampett']

In [35]:
# Three relation names; a set has no defined order, so which three show up is arbitrary.
list(set_of_rel_train)[:3]

['/people/person/children',
 '/location/statistical_region/rent50_1./measurement_unit/dated_money_value/currency',
 '/base/locations/countries/continent']

In [36]:
# Number of distinct entity labels collected from the train file.
len(set_of_entity_train)

14240

In [37]:
# Number of distinct relations collected from the train file.
len(set_of_rel_train)

1172

## Create mapping from names to numerical identifiers:

In [38]:
# Index -> entity label. The set is materialised exactly once and sorted, so the
# numbering does not depend on set iteration order (which changes between kernel
# sessions because string hashing is randomised) and is reproducible across runs.
# Assumes every label is a string, so that the labels are mutually comparable.
ind_to_entity = sorted(set_of_entity_train)

# Entity label -> index, derived from the very same list so that
# ind_to_entity[entity_to_ind[label]] == label always holds.
entity_to_ind = {elem: ind for ind, elem in enumerate(ind_to_entity)}

In [39]:
# Index -> relation name. The set is materialised exactly once and sorted, so the
# numbering does not depend on set iteration order (which changes between kernel
# sessions because string hashing is randomised) and is reproducible across runs.
ind_to_relation = sorted(set_of_rel_train)

# Relation name -> index, derived from the very same list so that
# ind_to_relation[rel_to_ind[name]] == name always holds.
rel_to_ind = {elem: ind for ind, elem in enumerate(ind_to_relation)}

In [40]:
def create_list_of_inds(list_of_triples, rel_to_ind, entity_to_ind):
    """Translate name-based triples into index-based triples.

    Args:
        list_of_triples: Iterable of triples; positions 0 and 2 are entity
            names, position 1 is a relation name.
        rel_to_ind: Mapping from relation name to its index.
        entity_to_ind: Mapping from entity name to its index.

    Returns:
        list: One ``(subject index, relation index, object index)`` tuple per
        input triple, in input order.

    Raises:
        KeyError: If an entity or relation name is missing from its mapping.
    """
    encoded = []
    for triple in list_of_triples:
        subject_ind = entity_to_ind[triple[0]]
        relation_ind = rel_to_ind[triple[1]]
        object_ind = entity_to_ind[triple[2]]
        encoded.append((subject_ind, relation_ind, object_ind))
    return encoded

### Train triples:

In [41]:
# Encode the train triples with the entity / relation index maps.
inds_list_train = create_list_of_inds(
    list_of_triples=list_of_triples_train,
    rel_to_ind=rel_to_ind,
    entity_to_ind=entity_to_ind,
)
len(inds_list_train)

459209

### Valid triples:

In [42]:
# Encode the filtered valid triples with the entity / relation index maps.
inds_list_valid = create_list_of_inds(
    list_of_triples=list_of_triples_valid_updated,
    rel_to_ind=rel_to_ind,
    entity_to_ind=entity_to_ind,
)
len(inds_list_valid)

47788

### Test triples:

In [43]:
# Encode the filtered test triples with the entity / relation index maps.
inds_list_test = create_list_of_inds(
    list_of_triples=list_of_triples_test_updated,
    rel_to_ind=rel_to_ind,
    entity_to_ind=entity_to_ind,
)
len(inds_list_test)

56361

### Save data triples:

In [44]:
# NOTE: `path` is rebound here, from the input data directory to the output directory.
path = "Link_Prediction_Data/FB15K/"

# Output file name -> object to pickle (written in this order).
artifacts = {
    'train_triples': inds_list_train,
    'valid_triples': inds_list_valid,
    'test_triples': inds_list_test,
    'entity_list': ind_to_entity,
    'relation_list': ind_to_relation,
    'entity_map': entity_to_ind,
    'relation_map': rel_to_ind,
}

for file_name, payload in artifacts.items():
    # The context manager flushes and closes each file as soon as it is written,
    # instead of leaving seven open handles to the garbage collector.
    with open(path + file_name, 'wb') as out_file:
        pickle.dump(payload, out_file)