In [None]:
from benchmark import Benchmarker
import os

# Folders containing the relevant benchmarks. Both paths are relative, so they
# are resolved against the notebook's current working directory.
align_benchmark = os.path.join('Benchmark', 'Align Benchmark')
integration_benchmark = os.path.join('Benchmark', 'Real Benchmark')

# Create the benchmarker and run the benchmarks.
# NOTE(review): None is passed as the second argument and `integration_benchmark`
# is not used in this cell - presumably this skips the 'Real Benchmark' run;
# confirm against Benchmarker.RunBenchmarks.
benchmarker = Benchmarker()
benchmarker.RunBenchmarks(align_benchmark, None)

In [None]:
import pickle

# Persist the benchmarker object to disk.
# Write-only binary mode ('wb') is sufficient: nothing is read back through
# this handle, so the read/write '+' flag is not needed.
# NOTE: pickle stores class instances by reference to their defining module,
# so loading this file later requires the `benchmark` module to be importable.
# Only unpickle files that come from a trusted source.
with open("benchmarker.bin", 'wb') as f:
    pickle.dump(benchmarker, f)

In [None]:
# Visualize the runtime of ALITE across multiple datasets.
# Uses the `benchmarker` created in the first cell; presumably it plots the
# durations recorded by RunBenchmarks - confirm against Benchmarker.
benchmarker.VisualizeDuration()

In [None]:
# Visualize the runtime of ALITE versus the number of input tuples in the database.
# NOTE(review): inputTuples=True presumably selects the input-tuple count as the
# independent variable - confirm against Benchmarker.VisualizeRuntimePerTuple.
benchmarker.VisualizeRuntimePerTuple(inputTuples=True)

In [None]:
# Visualize the runtime of ALITE versus the size of the full disjunction.
# NOTE(review): inputTuples=False presumably switches the independent variable
# from the input-tuple count to the full-disjunction size - confirm against
# Benchmarker.VisualizeRuntimePerTuple.
benchmarker.VisualizeRuntimePerTuple(inputTuples=False)

In [None]:
# Visualize the silhouette scores calculated while finding the optimum number
# of column clusters. Uses the `benchmarker` created in the first cell.
benchmarker.VisualizeSilhouetteScores()

In [None]:
# Visualize the predicted number of columns in the full disjunction vs. the
# actual number of columns, as a scatter plot with a regression line requested.
# NOTE(review): 'pre' and 'act' look like keys for the predicted and actual
# column counts - confirm against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('pre', 'act', scatter=True, with_reg=True)

In [None]:
# Visualize the number of input tables vs. the clustering precision.
# NOTE(review): 'tc' presumably means table count and 'p' precision - confirm
# against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('tc', 'p', scatter=True, with_reg=True)

In [None]:
# Visualize the total column count of all input tables vs. the clustering precision.
# NOTE(review): 'max' presumably refers to the total (maximum possible) column
# count and 'p' to precision - confirm against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('max', 'p', scatter=True, with_reg=True)

In [None]:
# Visualize the number of columns in the full disjunction vs. the clustering precision.
# NOTE(review): 'act' presumably is the actual column count and 'p' precision -
# confirm against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('act', 'p', scatter=True, with_reg=True)

In [None]:
# Visualize the number of input tables vs. the clustering recall.
# NOTE(review): 'tc' presumably means table count and 'r' recall - confirm
# against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('tc', 'r', scatter=True, with_reg=True)

In [None]:
# Visualize the total column count of all input tables vs. the clustering recall.
# NOTE(review): 'max' presumably refers to the total (maximum possible) column
# count and 'r' to recall - confirm against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('max', 'r', scatter=True, with_reg=True)

In [None]:
# Visualize the number of columns in the full disjunction vs. the clustering recall.
# NOTE(review): 'act' presumably is the actual column count and 'r' recall -
# confirm against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('act', 'r', scatter=True, with_reg=True)

In [None]:
# Visualize the precision-recall curve of the column clustering.
# Unlike the other statistics cells, both the scatter and regression options
# are turned off here.
# NOTE(review): 'r' and 'p' presumably select recall and precision - confirm
# against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('r', 'p', scatter=False, with_reg=False)

In [None]:
# Calculate the average clustering statistics for all datasets.
#
# For every dataset key in benchmarker.ClusterQuality:
#   * ClusterQuality[dataset] is unpacked into 8 values; the last four are used
#     as (precision, recall, accuracy, f1).
#   * ClusterParameters[dataset] is unpacked into 5 values; the last two are
#     used as (predicted, actual) column counts.
# A dataset missing from ClusterParameters raises KeyError, and an `actual`
# of 0 raises ZeroDivisionError when the ratio is computed.
avg_precision = 0
avg_recall = 0
avg_accuracy = 0
avg_f1 = 0
avg_ratio = 0       # ratio between predicted and actual number of columns in the full disjunction
samples = 0
for dataset in benchmarker.ClusterQuality:
    _, _, _, _, precision, recall, accuracy, f1 = benchmarker.ClusterQuality[dataset]
    _, _, _, predicted, actual = benchmarker.ClusterParameters[dataset]
    avg_precision += precision
    avg_recall += recall
    avg_accuracy += accuracy
    avg_f1 += f1
    avg_ratio += (predicted / actual)
    samples += 1

if samples:
    # Turn the running sums into means.
    avg_precision /= samples
    avg_recall /= samples
    avg_accuracy /= samples
    avg_f1 /= samples
    avg_ratio /= samples

    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average F1 Score: {avg_f1}")
    print(f"Average Predicted vs. Actual Column Count Ratio: {avg_ratio}")
else:
    # No datasets were recorded, so there is nothing to average. Dividing by
    # zero here used to abort the cell with ZeroDivisionError; mark the
    # averages as NaN instead so later use cannot mistake them for real scores.
    avg_precision = avg_recall = avg_accuracy = avg_f1 = avg_ratio = float("nan")
    print("No cluster statistics available; run the benchmarks first.")