In [None]:
# Install all packages
# NOTE(review): `sys` is not referenced in any of the visible cells — confirm whether it is still needed.
import sys

# Upgrade pip to the latest version (%pip installs into the running kernel's environment)
%pip install --upgrade pip

# Install the project dependencies listed in requirements.txt
%pip install -r requirements.txt

In [None]:
from benchmark import Benchmarker
from database import RelationalDatabase
import os

# Locations of the two benchmark suites; both live under the 'Benchmark' folder
# and are given as relative paths.
align_benchmark, integration_benchmark = (
    os.path.join('Benchmark', suite_folder)
    for suite_folder in ('Selected Align Benchmark', 'Selected Real Benchmark')
)

# Single Benchmarker instance shared by all the cells below.
benchmarker = Benchmarker()


In [None]:
# Run all align benchmarks found under the align benchmark folder.
# NOTE(review): the second argument is None here and the first is None in the
# real-benchmark cell, so presumably a None folder is skipped — confirm against
# Benchmarker.RunBenchmarks.
benchmarker.RunBenchmarks(align_benchmark, None)

In [None]:
# Run all real (integration) benchmarks found under the integration benchmark folder.
# NOTE(review): the first argument is None here and the second is None in the
# align-benchmark cell, so presumably a None folder is skipped — confirm against
# Benchmarker.RunBenchmarks.
benchmarker.RunBenchmarks(None, integration_benchmark)

In [None]:

# Optional: collect clustering-quality statistics for a hand-picked subset of
# the align benchmarks only.
selected = [
    'pubs',
    'us_politicians',
    'wholesale_markets',
]
for dataset_name in selected:
    # every dataset gets its own database, loaded from its sub-folder
    database = RelationalDatabase()
    dataset_path = os.path.join(align_benchmark, dataset_name)
    database.LoadFromFolder(dataset_path)
    benchmarker.ClusteringQualityStatistics(database, dataset_name)


In [None]:

# Optional: benchmark a hand-picked subset of the real (integration) datasets only.
selected = [
    'chicago_parks',
    'stockport-contracts',
]
for dataset_name in selected:
    # every dataset gets its own database, loaded from its sub-folder
    database = RelationalDatabase()
    dataset_path = os.path.join(integration_benchmark, dataset_name)
    database.LoadFromFolder(dataset_path)
    # NOTE(review): 'ALITE' presumably names the algorithm to benchmark — confirm against Benchmarker.Benchmark2
    benchmarker.Benchmark2(database, dataset_name, 'ALITE')

In [None]:
# Visualize the runtime of ALITE across multiple datasets.
# NOTE(review): presumably plots durations recorded by earlier benchmark runs on
# this same `benchmarker`, so a benchmark cell must have been executed first — confirm.
benchmarker.VisualizeDuration()

In [None]:
# Visualize the runtime of ALITE versus the number of input tuples in the database.
# NOTE(review): inputTuples=True presumably selects the input tuple count for the
# comparison — confirm against Benchmarker.VisualizeRuntimePerTuple.
benchmarker.VisualizeRuntimePerTuple(inputTuples=True)

In [None]:
# Visualize the silhouette scores calculated while finding the optimum number of column clusters.
# NOTE(review): 'archives' is presumably a dataset name; it is not in the hand-picked
# `selected` lists, so its scores presumably come from the run-all align cell — confirm.
benchmarker.VisualizeSilhouetteScores('archives')

In [None]:
# Visualize the predicted number of columns in the full disjunction vs. the actual number of columns.
# NOTE(review): 'pre' / 'act' presumably select the predicted and actual column counts, and
# scatter / with_reg presumably request a scatter plot with a regression line —
# confirm against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('pre', 'act', scatter=True, with_reg=True)

In [None]:
# Visualize the duration of column clustering vs. the number of columns in the original database.
# NOTE(review): 'max' / 'dur' presumably select the original column count and the clustering
# duration, and scatter / with_reg presumably request a scatter plot with a regression line —
# confirm against Benchmarker.VisualizeClusterStatistics.
benchmarker.VisualizeClusterStatistics('max', 'dur', scatter=True, with_reg=True)

In [None]:
# Calculate the average clustering statistics for all datasets.
# Every metric is an unweighted mean over the datasets present in
# benchmarker.ClusterQuality.
avg_precision = 0
avg_recall = 0
avg_accuracy = 0
avg_f1 = 0
avg_ratio = 0       # ratio between predicted and actual number of columns in the full disjunction
samples = 0
for dataset in benchmarker.ClusterQuality:
    # only the last four fields of the quality record are used here
    _, _, _, _, precision, recall, accuracy, f1 = benchmarker.ClusterQuality[dataset]
    # the last two fields of the parameter record are used as the predicted and actual column counts
    _, _, _, predicted, actual = benchmarker.ClusterParameters[dataset]
    avg_precision += precision
    avg_recall += recall
    avg_accuracy += accuracy
    avg_f1 += f1
    avg_ratio += (predicted / actual)
    samples += 1

if samples == 0:
    # Nothing has been recorded yet (e.g. this cell was run on a fresh kernel before
    # any benchmark cell); dividing by zero samples would raise ZeroDivisionError.
    print("No clustering statistics recorded; run a benchmark before computing averages.")
else:
    avg_precision /= samples
    avg_recall /= samples
    avg_accuracy /= samples
    avg_f1 /= samples
    avg_ratio /= samples

    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average F1 Score: {avg_f1}")
    print(f"Average Predicted vs. Actual Column Count Ratio: {avg_ratio}")
