In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy.stats as st
import numpy as np
import scipy.stats as st
from cycler import cycler
import matplotlib as mpl


# Evaluation Times

In [None]:
# Evaluation setup: the dataset and query shape to analyse, and the
# estimators to compare (each name is also its results sub-folder).
dataset, query_type = "swdf", "star"
methods = [
    'GNCE', 'LMKG', 'lss',
    'wj', 'impr', 'jsub', 'cset',
]

In [None]:
# Human-readable legend names, listed in the same order as `methods`.
method_labels = [
    'GNCE', 'LMKG', 'LSS', 'Wanderjoin',
    'impr', 'jsub', 'CSET',
]

In [None]:
# When True the plotting cells write their figures to disk as PDF files.
save_plots = True

In [None]:
# When True, load_approach drops every query whose prediction is not
# strictly positive (this covers the 0 and -1 placeholder predictions).
exclude_invalid = True

In [None]:
## Filtering out -1.0 and 0.0 predictions:


def _mean_or_nan(values):
    """Return the mean of `values`, or NaN (without a warning) when it is empty."""
    values = np.asarray(values)
    return np.mean(values) if values.size else np.nan


def _mean_with_interval(values, confidence=0.95):
    """Return ``(mean, lower, upper)`` of a Student-t interval around the mean of `values`."""
    values = np.asarray(values)
    mean = _mean_or_nan(values)
    if values.size < 2:
        # The standard error is undefined for fewer than two samples.
        return mean, np.nan, np.nan
    lower, upper = st.t.interval(confidence, values.size - 1, loc=mean, scale=st.sem(values))
    return mean, lower, upper


def load_approach(dataset, query_type, approach,
                  base_dir="/home/tim/Datasets", query_sizes=(2, 3, 5, 8),
                  drop_invalid=None):
    """Load the stored results of one approach and summarise its timings per query size.

    Reads ``preds.npy``, ``sizes.npy`` and ``pred_times.npy`` (plus
    ``pred_times_total.npy`` for GNCE, LMKG and lss) from
    ``{base_dir}/{dataset}/Results/{query_type}/{approach}/``.

    Parameters
    ----------
    dataset, query_type, approach : str
        Path components selecting the results folder.
    base_dir : str
        Root folder that contains the datasets.
    query_sizes : sequence of int
        Values of ``sizes`` to report on; every returned list has one entry
        per element, in this order.
    drop_invalid : bool or None
        If true, only queries with ``preds > 0`` are kept. ``None`` falls back
        to the notebook-level ``exclude_invalid`` flag.

    Returns
    -------
    runtimes : list
        Mean of ``pred_times`` per query size.
    loading_times : list
        Mean of ``pred_times_total - pred_times`` per query size for GNCE,
        LMKG and lss; zeros for every other approach.
    mins, maxs : list
        Lower / upper bound of the 95% Student-t confidence interval of the
        mean of ``pred_times`` per query size.

    Groups without samples yield NaN; groups with a single sample yield NaN
    interval bounds.
    """
    if drop_invalid is None:
        # Default to the notebook-wide switch, looked up at call time.
        drop_invalid = exclude_invalid

    result_dir = f"{base_dir}/{dataset}/Results/{query_type}/{approach}"
    preds = np.load(f"{result_dir}/preds.npy")
    sizes = np.load(f"{result_dir}/sizes.npy")
    exec_times = np.load(f"{result_dir}/pred_times.npy")

    # Only these approaches store a separate total time per query.
    has_total_times = approach in ("GNCE", "LMKG", "lss")
    exec_times_total = np.load(f"{result_dir}/pred_times_total.npy") if has_total_times else None

    if drop_invalid:
        # A single comparison removes 0, -1 and any other non-positive prediction.
        valid = preds > 0
        sizes = sizes[valid]
        exec_times = exec_times[valid]
        if has_total_times:
            exec_times_total = exec_times_total[valid]

    if has_total_times:
        overhead = exec_times_total - exec_times
        loading_times = [_mean_or_nan(overhead[sizes == size]) for size in query_sizes]
    else:
        loading_times = [0] * len(query_sizes)

    runtimes, mins, maxs = [], [], []
    for size in query_sizes:
        # The interval is computed once and both of its bounds are reused.
        mean, lower, upper = _mean_with_interval(exec_times[sizes == size])
        runtimes.append(mean)
        mins.append(lower)
        maxs.append(upper)

    return runtimes, loading_times, mins, maxs
    

In [None]:
# Run load_approach once per method and keep its four outputs in parallel
# lists, each indexed like `methods`.
data, data_loading, min_values, max_values = [], [], [], []

for method in methods:
    runtimes, loading, lower, upper = load_approach(dataset, query_type, method)
    data += [runtimes]          # mean execution time per query size
    data_loading += [loading]   # mean loading time per query size
    min_values += [lower]       # lower confidence bound per query size
    max_values += [upper]       # upper confidence bound per query size

In [None]:

# Shared bar styling: the tab10 palette for fill colours and an endlessly
# repeating sequence of hatch patterns, consumed with next(styles).
colormap = mpl.cm.tab10.colors

bar_cycle = cycler(hatch=['//////', '-----', '...', 'oooo', 'xxx', '**', 'OOO', '\\\\'])
styles = bar_cycle()

In [None]:
# Grouped bar chart of the mean execution time per query size: one bar per
# method, with the confidence bounds from load_approach drawn as error bars.
X_Axis = np.arange(len(data[0]))
labels = methods

bar_width = 0.1
# Shift each group to the left so its bars are roughly centred on the tick.
min_offset = (len(data) // 2) * bar_width

# Restart the hatch cycle so that re-running this cell always produces the
# same hatches instead of continuing where the shared iterator was left.
styles = bar_cycle()

fig, ax = plt.subplots()
for i, (means, lower, upper) in enumerate(zip(data, min_values, max_values)):
    # Hatches are drawn in the bar's own colour.
    plt.rcParams['hatch.color'] = colormap[i]
    means = np.asarray(means)
    # Row 0: distance from the mean down to the lower bound, row 1: up to the upper bound.
    errs = np.vstack([means - np.asarray(lower), np.asarray(upper) - means])
    ax.bar(X_Axis - min_offset + i * bar_width, means,
           bar_width, label=labels[i],
           color=colormap[i], alpha=0.7,
           yerr=errs,
           capsize=5,
           ecolor='darkgrey',
           **next(styles))

ax.legend(prop={'size': 6})
ax.set_yscale("log")
ax.set_xticks(X_Axis)
ax.set_xticklabels(["2", "3", "5", "8"])
ax.set_xlabel("Query Size")
ax.set_ylabel("Mean Execution Time [ms]")
# A log axis cannot start at 0, so only the upper limit is fixed.
ax.set_ylim(top=6500)
if save_plots:
    fig.savefig(dataset + "_" + query_type + "_execution_times.pdf")

## Code for splitting bars 

In [None]:
# Stacked variant of the execution-time chart: for every method the mean
# execution time (with error bars) forms the lower segment and the mean
# loading time is stacked on top of it as a separately hatched segment.
X_Axis = np.arange(len(data[0]))

labels = method_labels
labels2 = [method + '  loading' for method in method_labels]

bar_width = 0.1
# Shift each group to the left so its bars are roughly centred on the tick.
min_offset = (len(data) // 2) * bar_width

# Restart the hatch cycle so the hatches do not depend on how often other
# plotting cells have already advanced the shared iterator.
styles = bar_cycle()

fig, ax = plt.subplots()

for i, (means, loading, lower, upper) in enumerate(
        zip(data, data_loading, min_values, max_values)):
    # Hatches are drawn in the bar's own colour.
    plt.rcParams['hatch.color'] = colormap[i]
    means = np.asarray(means)
    positions = X_Axis - min_offset + i * bar_width
    # Row 0: distance from the mean down to the lower bound, row 1: up to the upper bound.
    errs = np.vstack([means - np.asarray(lower), np.asarray(upper) - means])

    ax.bar(positions, means,
           bar_width, label=labels[i],
           color=colormap[i], alpha=0.7,
           yerr=errs,
           capsize=5,
           ecolor='darkgrey',
           **next(styles))
    # Same colour as the segment below; only the next hatch tells them apart.
    ax.bar(positions, loading,
           bar_width, label=labels2[i],
           color=colormap[i], alpha=0.7,
           bottom=means,
           **next(styles))

ax.legend(prop={'size': 6}, ncol=2)
ax.set_yscale("log")
ax.set_xticks(X_Axis)
ax.set_xticklabels(["2", "3", "5", "8"])
ax.set_xlabel("Query Size")
ax.set_ylabel("Mean Execution Time [ms]")
# A log axis cannot start at 0, so only the upper limit is fixed.
ax.set_ylim(top=6500)
if save_plots:
    print("Saving Execution plot..")
    fig.savefig(f"/home/tim/Datasets/plots/{dataset}_{query_type}_execution_times.pdf")

# Total Training Times per atom

In [None]:
# Root folder of all datasets, and the dataset whose training timings are
# analysed in the cells below.
base_folder, dataset = '/home/tim/Datasets', 'yago'

In [None]:
# ProNE embedding cost used for lss: the Yago run took 51216.82 seconds in
# total for 13000080 embeddings. The factor 1000 turns seconds into
# milliseconds.
prone_embedding_time_per_atom = (51216.82 / 13_000_080) * 1000

In [None]:
# Methods included in the training-time comparison.
methods = [
    'GNCE', 'LMKG', 'lss',
    'wj', 'impr', 'jsub', 'cset',
]

# Filled from each method's training_timing.json in the next cell.
total_runtimes_per_atom = list()

# Embedding time per atom, aligned with `methods`. The GNCE value is
# hard-coded (its provenance is not recorded in this notebook).
total_embedding_times_per_atom = [
    6.404609663545492,              # GNCE
    0,                              # LMKG
    prone_embedding_time_per_atom,  # lss
    0, 0, 0, 0,                     # wj, impr, jsub, cset
]

In [None]:
# Read the per-atom training time of every method from its timing report.
# The list is rebuilt from scratch so that re-running this cell cannot append
# duplicate entries, and the parsed JSON gets its own name so that the `data`
# list used by the execution-time plots is not overwritten with a dict.
total_runtimes_per_atom = []
for method in methods:
    timing_path = f'{base_folder}/{dataset}/Results/training_timing/{method}/training_timing.json'
    with open(timing_path, 'r') as f:
        training_timing = json.load(f)
    total_runtimes_per_atom.append(training_timing['total_training_time_per_atom'])

In [None]:
# Display the embedding time per atom for each entry of `methods`.
total_embedding_times_per_atom

In [None]:
# Display the training time per atom collected from the timing reports.
total_runtimes_per_atom

In [None]:
# Hatched bar chart of the per-atom training time of every method on a
# logarithmic y-axis.
# NOTE: the two plotting cells further down save to this same file path.
colormap = mpl.cm.tab10.colors
bar_cycle = cycler(hatch=['//////', '-----', '...', 'oooo', 'xxx', '**', 'OOO', '\\\\'])
styles = bar_cycle()

bars = plt.bar(methods, total_runtimes_per_atom, color=colormap)

# Give each bar the next hatch pattern from the cycle.
for patch in bars:
    patch.set_hatch(next(styles)['hatch'])

plt.yscale("log")

# Axis labels and title
plt.title('Total Runtimes Per Atom by Method')
plt.xlabel('Method')
plt.ylabel('Runtime per atom [ms]')

if save_plots:
    print(f"Saving Training plot..")
    plt.savefig(f"/home/tim/Datasets/plots/{dataset}_{query_type}_training_times.pdf")


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from cycler import cycler

# Stacked training-time chart: the embedding time per atom forms the grey
# base segment and the training time per atom is stacked on top of it.
colormap = mpl.cm.tab10.colors
# The hatch cycle is created here but not applied to any bar in this cell.
bar_cycle = cycler(hatch=['//////', '-----', '...', 'oooo', 'xxx', '**', 'OOO', '\\\\'])
styles = bar_cycle()

plt.rcParams['hatch.color'] = 'grey'

bars1 = plt.bar(
    methods, total_embedding_times_per_atom,
    color='gray', alpha=0.5, label='Embedding',
)
bars2 = plt.bar(
    methods, total_runtimes_per_atom,
    bottom=total_embedding_times_per_atom,
    color=colormap, alpha=0.7,
)

plt.yscale("log")
# Only the lower y-limit is fixed; the upper one stays automatic.
plt.ylim(bottom=0.0001)

plt.xlabel('Method')
plt.ylabel('Training Time per atom [ms]')
plt.legend()

# Either write the figure to disk or show it, depending on save_plots.
if not save_plots:
    plt.show()
else:
    print(f"Saving Training plot..")
    plt.savefig(f"/home/tim/Datasets/plots/{dataset}_{query_type}_training_times.pdf")


In [None]:
# Training-time chart with hatched bars: the training time per atom is the
# base segment and the embedding time per atom is stacked on top in grey.
# plt, mpl and cycler come from the import cell at the top of the notebook.
epsilon = 1e-9  # Floor substituted for zero embedding times
colormap = mpl.cm.tab10.colors
bar_cycle = cycler('hatch', ['//////', '-----', '...', 'oooo', 'xxx', '**', 'OOO', '\\\\'])
styles = bar_cycle()

# Replace zero embedding times by the floor (presumably because a log axis
# cannot represent 0). Applying it again on a re-run leaves the values unchanged.
total_embedding_times_per_atom = [max(epsilon, x) for x in total_embedding_times_per_atom]

fig, ax = plt.subplots()
bars1 = ax.bar(methods, total_runtimes_per_atom, color=colormap)
bars2 = ax.bar(methods, total_embedding_times_per_atom, color='gray', alpha=0.5,
               bottom=total_runtimes_per_atom)
# The training bars are drawn exactly once: painting an identical opaque set
# over bars1 would cover the hatches applied below.

ax.set_yscale("log")

# Give each training bar the next hatch pattern from the cycle.
for bar, style in zip(bars1, styles):
    bar.set_hatch(style['hatch'])

# Axis labels and title
ax.set_xlabel('Method')
ax.set_ylabel('Runtime per atom [ms]')
ax.set_title('Total Runtimes Per Atom by Method')

# Either write the figure to disk or show it, depending on save_plots.
if save_plots:
    print("Saving Training plot..")
    fig.savefig(f"/home/tim/Datasets/plots/{dataset}_{query_type}_training_times.pdf")
else:
    plt.show()
