In [1]:
import os

In [2]:
%run common.ipynb

In [4]:
def explore_training_entities_counts(dataset_path):
    """Print the number of entities and samples found in the training split.

    Args:
        dataset_path: Dataset location handed to
            ``get_training_samples_path`` to resolve the training file.

    Returns:
        None. The two counts are written to standard output.
    """
    train_path = get_training_samples_path(dataset_path)

    # Entities are reported before the samples are loaded, so the first line
    # is already printed should the second load fail.
    entities = load_entities_of_samples(train_path)
    print(f"Training entities count: {len(entities)}")

    samples = load_matching_samples(train_path, entities)
    print(f"Training samples count: {len(samples)}")

    
def compare_entities_counts_with_training_set(
    dataset_path, samples_filename, dataset_name
):
    """Print entity/sample counts for one split and for its part unseen in training.

    Four lines are printed:

    1. the number of entities in the split file,
    2. the number of samples ``load_matching_samples`` returns for them,
    3. the number of entities that occur in the split but not in the
       training file, with one example,
    4. the number of samples ``load_matching_samples`` returns for those
       entities, with one example.

    When a split contains no entity unseen in training, the example is
    printed as ``None`` instead of raising ``StopIteration``.

    Args:
        dataset_path: Directory holding the dataset files; it is joined with
            ``samples_filename`` and passed to ``get_training_samples_path``.
        samples_filename: File name of the split inside ``dataset_path``.
        dataset_name: Label used as the prefix of every printed line.

    Returns:
        None. All results are written to standard output.
    """
    samples_path = os.path.join(dataset_path, samples_filename)
    dataset_entities = load_entities_of_samples(samples_path)
    print(f"{dataset_name} entities count: {len(dataset_entities)}")
    dataset_samples = load_matching_samples(
        samples_path, dataset_entities
    )
    print(f"{dataset_name} samples count: {len(dataset_samples)}")

    training_samples_path = get_training_samples_path(dataset_path)
    training_entities = load_entities_of_samples(training_samples_path)
    # Entities of this split that never occur in the training file.
    pure_dataset_entities = dataset_entities.difference(training_entities)
    # A split fully covered by training has no such entity, so give next()
    # a default rather than letting it raise StopIteration.
    entity_example = next(iter(pure_dataset_entities), None)
    print(
        f"{dataset_name} entities count (excluded training entities): "
        f"{len(pure_dataset_entities)}, (example: "
        f"{entity_example})"
    )

    pure_dataset_samples = load_matching_samples(
        samples_path, pure_dataset_entities
    )
    # Same guard: no unseen entities normally means no matching samples.
    sample_example = next(iter(pure_dataset_samples), None)
    print(
        f"{dataset_name} samples count (excluded training entities): "
        f"{len(pure_dataset_samples)} (example: "
        f"{sample_example})"
    )


def explore_entities_counts(dataset_path):
    """Print an entity-count report for a dataset and each of its splits.

    The report consists of the dataset path, the total entity count returned
    by ``load_entities_of_dataset``, the training counts printed by
    ``explore_training_entities_counts``, and then the validation and test
    comparisons printed by ``compare_entities_counts_with_training_set``.
    Sections are separated by a blank line.

    Args:
        dataset_path: Dataset location passed on to every helper.

    Returns:
        None. The report is written to standard output.
    """
    print(f"Dataset path: '{dataset_path}'")
    entities = load_entities_of_dataset(dataset_path)
    print(f"Total entities count: {len(entities)}")
    print()
    explore_training_entities_counts(dataset_path)

    # The comparison helper builds the split path itself from the file name,
    # so no path is precomputed here.
    splits = (
        (VALIDATION_DATASET_FILENAME, "Validation"),
        (TEST_DATASET_FILENAME, "Test"),
    )
    for samples_filename, split_label in splits:
        print()
        compare_entities_counts_with_training_set(
            dataset_path,
            samples_filename,
            dataset_name=split_label,
        )

## WN18RR

In [6]:
# Report entity counts for WN18RR: total, training, and the validation/test
# entities and samples that do not occur in the training split.
explore_entities_counts(WN18RR_DATASET_PATH)

Dataset path: '../data/WN18RR'
Total entities count: 40943

Training entities count: 40559
Training samples count: 86835

Validation entities count: 5173
Validation samples count: 3034
Validation entities count (excluded training entities): 198, (example: 10260706)
Validation samples count (excluded training entities): 210 (example: ('08687525', '_instance_hypernym', '08685677'))

Test entities count: 5323
Test samples count: 3134
Test entities count (excluded training entities): 209, (example: 00486557)
Test samples count (excluded training entities): 210 (example: ('01831519', '_member_meronym', '01833283'))


In [7]:
# NOTE(review): explore_relations_counts is not defined in any cell shown in
# this notebook (execution counts skip In [3] and In [5]); presumably it comes
# from common.ipynb or a removed cell. Confirm this runs after Restart & Run All.
explore_relations_counts(WN18RR_DATASET_PATH)

Dataset path: '../data/WN18RR'
Total relations count: 11
Training relations count: 11
Validation relations count: 11
Test relations count: 11


## FB15K-237

In [8]:
# Report entity counts for FB15K-237: total, training, and the validation/test
# entities and samples that do not occur in the training split.
explore_entities_counts(FB15K_DATASET_PATH)

Dataset path: '../data/FB15K-237'
Total entities count: 14541

Training entities count: 14505
Training samples count: 272115

Validation entities count: 9809
Validation samples count: 17535
Validation entities count (excluded training entities): 8, (example: /m/01lk31)
Validation samples count (excluded training entities): 9 (example: ('/m/01x4x4', '/common/topic/webpage./common/webpage/category', '/m/08mbj5d'))

Test entities count: 10348
Test samples count: 20466
Test entities count (excluded training entities): 29, (example: /m/07hn5)
Test samples count (excluded training entities): 28 (example: ('/m/02rxd26', '/time/event/instance_of_recurring_event', '/m/07hn5'))


In [9]:
# NOTE(review): explore_relations_counts is not defined in any cell shown in
# this notebook; presumably it comes from common.ipynb or a removed cell.
# Confirm this runs after Restart & Run All.
explore_relations_counts(FB15K_DATASET_PATH)

Dataset path: '../data/FB15K-237'
Total relations count: 237
Training relations count: 237
Validation relations count: 223
Test relations count: 224
