Descriptive statistics and sanity checks for generated synthetic temporal knowledge graphs (TKGs)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

from collections import defaultdict
from itertools import combinations, product

import os
import random
import re

In [2]:
from temporalpattern import TemporalPattern

# Load synthetic TKG

In [3]:
# Root directory that contains one sub-directory per experiment.
data_dir = '/nas/ckgfs/users/eboxer/synthetic_tkg_patterns'

# Names of the experiment sub-directories to inspect.
experiments = [
    # Sweep over the entity / relation distribution settings.
    'EntDistr-Unif_RelDistr-Unif',
    'EntDistr-Unif_RelDistr-Long',
    'EntDistr-Long_RelDistr-Unif',
    'EntDistr-Long_RelDistr-Long',
    # Sweep over the 1-hop / 2-hop / 3-hop settings.
    '1hop-25_2hop-100_3hop-400',
    '1hop-50_2hop-100_3hop-200',
    '1hop-200_2hop-100_3hop-50',
    '1hop-400_2hop-100_3hop-25',
]

# Full path of every experiment, in the same order as `experiments`.
experiment_dirs = [os.path.join(data_dir, name) for name in experiments]

In [4]:
def describe_run(run_path: str):
    """Collect raw descriptive data for a single run directory.

    Reads ``pattern2id.txt`` plus the ``train.txt`` / ``valid.txt`` /
    ``test.txt`` edge lists from ``run_path``. All four files are parsed as
    tab-separated tables whose column names are supplied here.

    Args:
        run_path: Directory holding the four files of one run.

    Returns:
        An 8-tuple ``(pat_ents, pat_rels, n_train, pats_train, n_valid,
        pats_valid, n_test, pats_test)`` where

        * ``pat_ents`` / ``pat_rels`` are Series of the first+third and the
          second element of every triple of every pattern (repeats kept),
        * ``n_<split>`` is the number of rows of that split's edge list,
        * ``pats_<split>`` is the ``pattern`` column of that split.
    """
    edge_columns = ['head', 'rel', 'tail', 't', 'pattern']

    def load_table(file_name, column_names):
        # Every file of a run shares the same tab-separated layout.
        return pd.read_csv(
            os.path.join(run_path, file_name), sep='\t', names=column_names)

    # Entities and relations that occur inside the pattern definitions.
    pattern_table = load_table('pattern2id.txt', ['pattern', 'n_hops', 'id'])
    entities, relations = [], []
    for pattern_label in pattern_table['pattern']:
        temporal_pattern = TemporalPattern()
        temporal_pattern.from_label(pattern_label)
        for triple in temporal_pattern.__triples__():
            # Positions 0 and 2 are collected as entities, position 1 as the relation.
            entities.extend((triple[0], triple[2]))
            relations.append(triple[1])
    summary = [pd.Series(entities), pd.Series(relations)]

    # Row count and raw pattern column of every split, in train/valid/test order.
    for file_name in ('train.txt', 'valid.txt', 'test.txt'):
        edges = load_table(file_name, edge_columns)
        summary.extend((edges.shape[0], edges['pattern']))

    return tuple(summary)

In [5]:
# One accumulator list per output column; every run appends exactly one entry to each.
experiment_names, runs = [], []
pats_ents, pats_rels = [], []
n_trains, n_valids, n_tests = [], [], []
pats_trains, pats_valids, pats_tests = [], [], []

# Visit run_0 .. run_9 of every experiment, experiment by experiment.
for (experiment, experiment_dir), run_idx in product(
        zip(experiments, experiment_dirs), range(10)):
    run_name = f'run_{run_idx}'
    run_path = os.path.join(experiment_dir, run_name)
    (pat_ents, pat_rels,
     n_train, pats_train,
     n_valid, pats_valid,
     n_test, pats_test) = describe_run(run_path)

    experiment_names.append(experiment)
    runs.append(run_name)
    pats_ents.append(pat_ents)
    pats_rels.append(pat_rels)
    n_trains.append(n_train)
    n_valids.append(n_valid)
    n_tests.append(n_test)
    pats_trains.append(pats_train)
    pats_valids.append(pats_valid)
    pats_tests.append(pats_test)

# One row per (experiment, run); the pat_* / pats_* cells each hold a whole Series.
df_experiments = pd.DataFrame(dict(
    experiment=experiment_names,
    runs=runs,
    pat_ents=pats_ents,
    pat_rels=pats_rels,
    n_train=n_trains,
    n_valid=n_valids,
    n_test=n_tests,
    pats_train=pats_trains,
    pats_valid=pats_valids,
    pats_test=pats_tests,
))

In [6]:
# Preview the first three runs.
df_experiments.head(3)

Unnamed: 0,experiment,runs,pat_ents,pat_rels,n_train,n_valid,n_test,pats_train,pats_valid,pats_test
0,EntDistr-Unif_RelDistr-Unif,run_0,0 4576 1 2802 2 2609 3 ...,0 125 1 39 2 25 3 25 4 ...,1527907,188454,193505,0 [-1] 1 [-1] 2 [-1...,0 [-1] 1 [-1] 2 [-1] 3...,0 [-1] 1 [-1] 2 [-1] 3...
1,EntDistr-Unif_RelDistr-Unif,run_1,0 2329 1 3379 2 2717 3 ...,0 154 1 177 2 113 3 177 4 ...,1527032,188743,193564,0 [-1] 1 [-1] 2 [-1...,0 [-1] 1 [-1] 2 [-1] 3...,0 [-1] 1 [-1] 2 [-1] 3...
2,EntDistr-Unif_RelDistr-Unif,run_2,0 4226 1 2191 2 4243 3 ...,0 92 1 45 2 170 3 170 4 ...,1528017,188313,193535,0 [-1] 1 [-1] 2 [-1...,0 [-1] 1 [-1] 2 [-1] 3...,0 [-1] 1 [-1] 2 [-1] 3...


## Distribution over patterns

In [7]:
# Per-run statistics of how often each pattern label occurs in each split.
#
# The label counts of a split are computed once and reused for all three
# statistics (the columns hold on the order of a million rows per run).

# String forms under which the -1 label can show up in the counts index:
# '-1' / '-1.0' if the column holds numbers, '[-1]' if read_csv left the raw text.
# NOTE(review): -1 is presumably the "edge belongs to no pattern" marker - confirm.
_NO_PATTERN_LABELS = ('-1', '-1.0', '[-1]')


def _most_common_excluding_no_pattern(counts):
    """Return the largest count among labels other than -1 (0 if there is none).

    ``counts`` is a ``value_counts()`` result, i.e. sorted by descending count.
    The -1 label is removed by name rather than by position, so the result is
    correct even when -1 is not the most frequent label.
    """
    is_no_pattern = counts.index.astype(str).isin(_NO_PATTERN_LABELS)
    real_counts = counts[~is_no_pattern]
    return real_counts.iloc[0] if len(real_counts) else 0


_SPLITS = ('train', 'valid', 'test')
# (column-name fragment, reducer over the label counts of one run).
# NOTE(review): least-common and mean are taken over ALL labels, -1 included,
# exactly as before; only the most-common statistic excludes -1.
_PATTERN_STATS = (
    ('most_common', _most_common_excluding_no_pattern),
    ('least_common', lambda counts: counts.iloc[-1]),
    ('mean', lambda counts: counts.mean()),
)

# explode() only expands list-like cells; string cells pass through unchanged.
_pattern_counts = {
    split: [pats.explode().value_counts() for pats in df_experiments[f'pats_{split}']]
    for split in _SPLITS
}

# Statistic-major order keeps the resulting column order unchanged.
for _stat_name, _reduce in _PATTERN_STATS:
    for _split in _SPLITS:
        df_experiments.loc[:, f'n_{_stat_name}_pattern_{_split}'] = pd.Series(
            [_reduce(counts) for counts in _pattern_counts[_split]],
            index=df_experiments.index,
        )


In [8]:
# Average every per-run pattern statistic over the runs of each experiment.
# Columns are ordered split by split: most common, mean, least common.
pattern_stat_columns = [
    f'n_{stat}_pattern_{split}'
    for split in ('train', 'valid', 'test')
    for stat in ('most_common', 'mean', 'least_common')
]
df_experiments.groupby(['experiment']).agg(
    {column: ['mean'] for column in pattern_stat_columns}
)

Unnamed: 0_level_0,n_most_common_pattern_train,n_mean_pattern_train,n_least_common_pattern_train,n_most_common_pattern_valid,n_mean_pattern_valid,n_least_common_pattern_valid,n_most_common_pattern_test,n_mean_pattern_test,n_least_common_pattern_test
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean
experiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1hop-200_2hop-100_3hop-50,7240.0,4377.088134,35.3,932.8,547.981578,1.0,1020.2,568.499928,1.9
1hop-25_2hop-100_3hop-400,22504.5,3263.229924,8.3,2653.8,410.390496,1.4,3716.4,421.207333,1.9
1hop-400_2hop-100_3hop-25,5579.5,3080.863241,16.0,705.3,385.897117,1.5,890.1,397.624845,2.1
1hop-50_2hop-100_3hop-200,19424.5,4464.971023,23.0,2401.8,556.847617,1.5,2748.1,576.889691,1.9
EntDistr-Long_RelDistr-Long,11981.7,5053.296259,49.1,1486.9,629.429228,1.3,1657.8,649.201951,2.1
EntDistr-Long_RelDistr-Unif,11884.0,5053.453796,44.5,1469.1,628.827765,1.2,1654.1,647.650629,2.8
EntDistr-Unif_RelDistr-Long,11804.2,5056.46573,47.7,1446.2,632.523394,1.3,1654.0,650.612265,1.5
EntDistr-Unif_RelDistr-Unif,12132.1,5052.820904,41.2,1492.4,631.901591,1.6,1689.4,647.704095,1.8


## Distribution over entities/relations in patterns

In [9]:
# Per-run statistics of how often each entity / relation occurs in the patterns.
# value_counts() sorts by descending count, so the first entry is the most
# frequent label and the last entry the least frequent one.
value_count_stats = (
    ('most_common', lambda counts: counts.iloc[0]),
    ('mean', lambda counts: counts.mean()),
    ('least_common', lambda counts: counts.iloc[-1]),
)
for source_column, kind in (('pat_ents', 'entity'), ('pat_rels', 'relation')):
    for stat_name, summarise in value_count_stats:
        df_experiments.loc[:, f'n_{stat_name}_{kind}'] = df_experiments[source_column].apply(
            lambda labels: summarise(labels.value_counts()))

In [10]:
# Average every per-run entity / relation statistic over the runs of each experiment.
entity_relation_stat_columns = [
    f'n_{stat}_{kind}'
    for kind in ('entity', 'relation')
    for stat in ('most_common', 'mean', 'least_common')
]
df_experiments.groupby(['experiment']).agg(
    {column: ['mean'] for column in entity_relation_stat_columns}
)

Unnamed: 0_level_0,n_most_common_entity,n_mean_entity,n_least_common_entity,n_most_common_relation,n_mean_relation,n_least_common_relation
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean
experiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1hop-200_2hop-100_3hop-50,8.6,2.260976,1.0,13.9,4.77775,1.0
1hop-25_2hop-100_3hop-400,10.1,2.43839,1.0,21.4,9.754899,1.4
1hop-400_2hop-100_3hop-25,8.9,2.30802,1.0,16.4,6.205165,1.0
1hop-50_2hop-100_3hop-200,8.7,2.309185,1.0,15.7,6.082329,1.0
EntDistr-Long_RelDistr-Long,29.3,3.767608,1.0,33.5,6.166991,1.0
EntDistr-Long_RelDistr-Unif,22.1,3.782325,1.0,13.4,4.722379,1.0
EntDistr-Unif_RelDistr-Long,7.9,2.259198,1.0,29.4,5.991074,1.0
EntDistr-Unif_RelDistr-Unif,8.2,2.264876,1.0,13.9,4.747815,1.0


## Number of edges per split

In [11]:
# Summary statistics of the split sizes across all runs of all experiments.
split_size_columns = ['n_train', 'n_valid', 'n_test']
df_experiments[split_size_columns].describe()

Unnamed: 0,n_train,n_valid,n_test
count,80.0,80.0,80.0
mean,1570801.0,193813.9125,199195.6875
std,65257.23,8085.687379,8315.851072
min,1524716.0,187949.0,193157.0
25%,1527387.0,188452.75,193670.25
50%,1534572.0,189325.5,194605.5
75%,1585720.0,195916.25,201304.25
max,1721519.0,212694.0,218646.0
