In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt

### TPC-H

In [None]:
base_dir = "results_tpch"


def load_execution_times(results_dir):
    """Collect every timing sample stored under ``results_dir``.

    The directory tree is walked looking for files named
    ``execution_times.txt``. The last three components of the directory that
    holds such a file are read as
    ``<warehouse size>/<scaling factor>/<query number>``, and every non-blank
    line of the file is parsed as one float sample.

    Parameters
    ----------
    results_dir : str
        Root directory to scan. ``os.walk`` yields nothing for a directory
        that does not exist, so a missing root produces an empty list.

    Returns
    -------
    list of dict
        One record per sample with the keys "Execution Time", "Query Number",
        "Scaling Factor (SF)" and "Warehouse Size".

    Raises
    ------
    ValueError
        If a non-blank line cannot be parsed as a float.
    """
    records = []
    for root, _dirs, files in os.walk(results_dir):
        if "execution_times.txt" not in files:
            continue

        # Directory layout: .../<size class>/<scaling factor>/<query number>
        path_parts = root.split(os.sep)
        size_class = path_parts[-3]
        sf = path_parts[-2]
        query_number = path_parts[-1]

        file_path = os.path.join(root, "execution_times.txt")
        with open(file_path, 'r') as f:
            for line in f:
                value = line.strip()
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no sample; float("") would raise ValueError.
                if not value:
                    continue
                records.append({
                    "Execution Time": float(value),
                    "Query Number": query_number,
                    "Scaling Factor (SF)": sf,
                    "Warehouse Size": size_class
                })
    return records


data = load_execution_times(base_dir)

# Explicit columns keep the frame's schema stable even when no samples were
# found, so column lookups on an empty result do not raise KeyError.
df = pd.DataFrame(
    data,
    columns=[
        "Execution Time",
        "Query Number",
        "Scaling Factor (SF)",
        "Warehouse Size",
    ],
)


In [None]:
# Grid of box plots: one row per warehouse size, one column per scaling
# factor, each panel grouping execution times by query number.
size_classes = ['XS', 'S', 'M', 'L']
sfls = ['SF_1', 'SF_10', 'SF_100', 'SF_1000']
fig, axes = plt.subplots(4, 4, figsize=(20, 20), sharey=False)

bottom_row = len(size_classes) - 1

# axes.flat walks the grid row by row, so the flat position maps back to
# (row, col) with a divmod by the number of columns.
for position, ax in enumerate(axes.flat):
    row, col = divmod(position, len(sfls))
    size_class = size_classes[row]
    sf = sfls[col]

    matches_size = df["Warehouse Size"] == size_class
    matches_sf = df["Scaling Factor (SF)"] == sf
    subset = df[matches_size & matches_sf]

    has_samples = not subset.empty
    if has_samples:
        subset.boxplot(
            column="Execution Time",
            by="Query Number",
            ax=ax,
            grid=True,
            patch_artist=False,
        )

    # Set after boxplot() so this title is the one left on the panel.
    ax.set_title(f"{size_class} - {sf}")
    if col == 0:
        ax.set_ylabel("Execution Time (s)")
    if row == bottom_row:
        ax.set_xlabel("Query Number")

fig.suptitle("TPC-H Benchmark Across Warehouse Sizes and Scaling Factors", fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=0.95)

plt.show()

In [None]:
# Descriptive statistics of the execution time for every
# (warehouse size, scaling factor, query number) combination present in `df`.
summary_data = []

size_classes = ['XS', 'S', 'M', 'L']
sfls = ['SF_1', 'SF_10', 'SF_100', 'SF_1000']

for size_class in size_classes:
    for sf in sfls:
        subset_sf = df[(df["Warehouse Size"] == size_class) & (df["Scaling Factor (SF)"] == sf)]

        for query in subset_sf["Query Number"].unique():
            subset_query = subset_sf[subset_sf["Query Number"] == query]
            if subset_query.empty:
                continue

            stats = subset_query["Execution Time"].describe()
            summary_data.append({
                "Warehouse Size": size_class,
                "Scaling Factor": sf,
                "Query Number": query,
                "Count": int(stats["count"]),
                "Mean": round(stats["mean"], 2),
                # NaN when the group holds a single sample.
                "Std Dev": round(stats["std"], 2),
                "Min": round(stats["min"], 2),
                "25%": round(stats["25%"], 2),
                "50% (Median)": round(stats["50%"], 2),
                "75%": round(stats["75%"], 2),
                "Max": round(stats["max"], 2)
            })

summary_index = ["Warehouse Size", "Scaling Factor", "Query Number"]
summary_columns = summary_index + [
    "Count", "Mean", "Std Dev", "Min", "25%", "50% (Median)", "75%", "Max",
]

# Naming the columns up front keeps set_index() working when no combination
# matched: an empty record list would otherwise give a column-less frame and
# set_index() would raise KeyError.
summary_df = pd.DataFrame(summary_data, columns=summary_columns).set_index(summary_index)

In [None]:
# Bare expression: the notebook renders the full summary table as this cell's output.
summary_df

### Implementations

In [None]:
# Timings of the Python implementation, read as a single headerless column
# named "execution_time".
python_times_path = 'implementations/results/Python/execution_times.txt'
python_results = pd.read_csv(python_times_path, header=None, names=['execution_time'])

In [None]:
# Timings of the SQL implementation, read as a single headerless column
# named "execution_time".
sql_times_path = 'implementations/results/SQL/execution_times.txt'
sql_results = pd.read_csv(sql_times_path, header=None, names=['execution_time'])

In [None]:
# Per-query timings of the SQL implementation, loaded from JSON into a DataFrame.
sql_per_query_path = 'implementations/results/SQL/per_query_times.json'
sql_queries_results = pd.read_json(sql_per_query_path)

In [None]:
# Box plots of the SQL and Python execution times, drawn side by side.
series_by_label = {
    'SQL': sql_results['execution_time'],
    'Python': python_results['execution_time'],
}
data = list(series_by_label.values())

plt.figure(figsize=(8, 5))
plt.boxplot(data)

# Boxes are drawn at positions 1..N, in the order the series were passed.
tick_positions = list(range(1, len(data) + 1))
plt.xticks(tick_positions, list(series_by_label), fontsize=12)
plt.ylabel('Execution Time (s)', fontsize=12)
plt.title('Naive Bayes Training Performance Comparison', fontsize=14, fontweight='bold')
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# One box per column of the per-query SQL timings frame.
plt.figure(figsize=(10, 5))

# DataFrame.boxplot draws on the current axes and returns them by default.
query_ax = sql_queries_results.boxplot(grid=False)

query_ax.set_title('SQL Implementation - Query Performance', fontsize=14)
query_ax.set_ylabel('Execution time (s)', fontsize=10)
plt.xticks(rotation=45)
# Only horizontal grid lines are switched back on after grid=False above.
query_ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()