In [None]:
import json
import duckdb as ddb
import pandas as pd
import pingouin as pg
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Tag that selects which "<SF>_metrics.json" file is read
# (presumably the benchmark scale factor -- confirm).
SF = "s10"
metrics_path = f"{SF}_metrics.json"
with open(metrics_path, "r") as metrics_file:
    metrics = json.load(metrics_file)

In [None]:
def calculate_key_metrics_from_query_metrics(metric):
    """Flatten per-run query metrics and aggregate them with DuckDB.

    Parameters
    ----------
    metric : iterable of dict
        One mapping per benchmark run, keyed by query name. Each value is a
        mapping that must provide ``time``, ``data_scanned``,
        ``planning_time`` and ``execution_time``. The position of a run in
        the iterable becomes its ``run_number``.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``key_metrics_per_query``
            One row per ``query_type`` with the averages of the four
            metrics and the standard deviation of ``time``.
        ``key_metrics_per_group``
            One row per (``query_group``, ``run_number``) with the average
            ``time`` and ``data_scanned``; ``query_group`` is the second
            character of ``query_type``.
        ``key_metrics_arr``
            One row per ``query_type`` with the individual ``time`` and
            ``data_scanned`` values collected into lists, ordered by
            ``run_number``.
        ``metrics_df``
            The flattened, rescaled observations (one row per run/query).
    """
    # NOTE: the DataFrame must stay bound to the local name `metrics_df`;
    # the SQL below refers to it by that name in its FROM clauses.
    metrics_df = pd.DataFrame(
        [
            {"run_number": run_number, "query_type": query_type, **values}
            for run_number, run in enumerate(metric)
            for query_type, values in run.items()
        ]
    )

    # Rescale the raw values: times by 1/1000, data_scanned by 1/1_000_000
    # (presumably ms -> s and bytes -> MB -- confirm against the producer).
    metrics_df["time"] = metrics_df["time"] / 1000
    metrics_df["data_scanned"] = metrics_df["data_scanned"] / 1000000
    metrics_df["planning_time"] = metrics_df["planning_time"] / 1000
    metrics_df["execution_time"] = metrics_df["execution_time"] / 1000

    key_metrics_per_query = ddb.sql("""
        SELECT query_type,
        avg(time):: DECIMAL(6,2) as avg_time,
        avg(data_scanned):: DECIMAL(6,2) as avg_data_scanned,
        avg(planning_time):: DECIMAL(6,2) as avg_planning_time,
        avg(execution_time):: DECIMAL(6,2) as avg_execution_time,
        stddev(time):: DECIMAL(6,2) as stddev_time
        FROM
        metrics_df
        GROUP BY query_type
        ORDER BY query_type asc
        """).df()

    key_metrics_per_group = ddb.sql("""
        SELECT query_type[2:2] as query_group, run_number,
        avg(time):: DECIMAL(6,2) as avg_time,
        avg(data_scanned):: DECIMAL(6,2) as avg_data_scanned
        FROM
        metrics_df
        GROUP BY query_type[2:2], run_number
        ORDER BY query_type[2:2], run_number asc
        """).df()

    # list() is order-sensitive: without an ORDER BY inside the aggregate the
    # element order is not guaranteed. Ordering by run_number makes element i
    # of every list belong to run i, so lists can be compared position-wise.
    key_metrics_arr = ddb.sql("""
        SELECT query_type,
        list(time ORDER BY run_number) as time,
        list(data_scanned ORDER BY run_number) as data_scanned
        FROM
        metrics_df
        GROUP BY query_type
        ORDER BY query_type asc
        """).df()
    return key_metrics_per_query, key_metrics_per_group, key_metrics_arr, metrics_df

In [None]:
# Per-engine, per-scenario lists of run metrics: only the values of each
# scenario mapping are kept, its keys are dropped.
hive_metrics = metrics["hive"]
iceberg_metrics = metrics["iceberg"]

queries_full_hive = [*hive_metrics["queries_full"].values()]
queries_incremental_hive = [*hive_metrics["queries_incremental"].values()]

queries_full_iceberg = [*iceberg_metrics["queries_full"].values()]
queries_incremental_iceberg = [*iceberg_metrics["queries_incremental"].values()]
queries_rewrite_iceberg = [*iceberg_metrics["queries_rewrite"].values()]

## Full load metrics

In [None]:
# Aggregate the Hive full-load runs; the per-query summary is the cell output.
(
    per_query_hive_full,
    per_group_hive_full,
    per_query_arr_hive_full,
    metrics_hive_full,
) = calculate_key_metrics_from_query_metrics(queries_full_hive)
per_query_hive_full

In [None]:
# Aggregate the Iceberg full-load runs; the per-query summary is the cell output.
(
    per_query_iceberg_full,
    per_group_iceberg_full,
    per_query_arr_iceberg_full,
    metrics_iceberg_full,
) = calculate_key_metrics_from_query_metrics(queries_full_iceberg)
per_query_iceberg_full

In [None]:
# Column-wise totals of the full-load per-query summaries (Iceberg, then Hive).
iceberg_full_totals = per_query_iceberg_full.sum()
hive_full_totals = per_query_hive_full.sum()
iceberg_full_totals, hive_full_totals

### Plotting query times

In [None]:
# Full load: average run time per query, Hive next to Iceberg.
queries = per_query_hive_full["query_type"]  # tick labels, one per query

hive_mean = per_query_hive_full["avg_time"]
hive_stdev = per_query_hive_full["stddev_time"]
iceberg_mean = per_query_iceberg_full["avg_time"]
iceberg_stdev = per_query_iceberg_full["stddev_time"]

# Error-bar half-width: 1.96 standard deviations of the run times.
# NOTE(review): no division by sqrt(number of runs) is applied, so this is the
# spread of individual runs rather than a confidence interval of the mean --
# confirm which one is intended.
confidence_95 = 1.96
hive_conf = confidence_95 * hive_stdev
iceberg_conf = confidence_95 * iceberg_stdev

x = np.arange(len(queries))  # one slot per query
width = 0.35                 # bar width

fig, ax = plt.subplots(figsize=(10, 6))

# One bar group per engine, shifted half a bar width left/right of the slot.
bar_groups = [
    ax.bar(x + shift, mean, width, yerr=conf, label=label, capsize=5, alpha=0.8, error_kw={'alpha': 0.5})
    for shift, mean, conf, label in (
        (-width / 2, hive_mean, hive_conf, 'Hive'),
        (width / 2, iceberg_mean, iceberg_conf, 'Iceberg'),
    )
]

# Write each bar's height (one decimal) slightly above the bar.
for group in bar_groups:
    for bar in group:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height + 1, f"{height:.1f}", ha='center', va='bottom', fontsize=10)

# Axis labels, ticks and legend.
ax.set_xlabel('Query', fontsize=12)
ax.set_ylabel('Time (s)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(queries, rotation=45, ha='right', fontsize=10)
ax.legend(fontsize=12)

plt.tight_layout()
plt.show()


### Applying t-tests for each query

In [None]:
# Two-sample t-test (pingouin, correction=True) of Iceberg vs Hive run times,
# one result row per query. Row i of both frames is assumed to describe the
# same query (both come back sorted by query_type) -- confirm if the query
# sets can differ.
results = []
for i in range(len(per_query_arr_hive_full)):
    hive_row = per_query_arr_hive_full.iloc[i]
    iceberg_row = per_query_arr_iceberg_full.iloc[i]
    # Mean of the element-wise Iceberg/Hive time ratios for this query.
    mean = (iceberg_row["time"] / hive_row["time"]).mean()
    # The test used to be computed twice with identical arguments; once is enough.
    result = pg.ttest(list(iceberg_row["time"]), list(hive_row["time"]), correction=True)
    results.append({"query_type": hive_row["query_type"], **result.iloc[0], "mean": mean})
pd.DataFrame(results)

### One-sample t-tests on averaged and raw values

In [None]:
# Full load: per-query Iceberg/Hive ratio of average times, tested against 1.
avg_time_ratio_full = per_query_iceberg_full["avg_time"] / per_query_hive_full["avg_time"]
print(avg_time_ratio_full.mean())
pg.ttest(list(avg_time_ratio_full), 1, correction=True)

In [None]:
# Full load: per-query Iceberg/Hive ratio of average data scanned, tested against 1.
avg_scanned_ratio_full = per_query_iceberg_full["avg_data_scanned"] / per_query_hive_full["avg_data_scanned"]
print(avg_scanned_ratio_full.mean())
pg.ttest(list(avg_scanned_ratio_full), 1, correction=True)

In [None]:
# Full load: row-by-row Iceberg/Hive ratio of the raw times, tested against 1.
raw_time_ratio_full = metrics_iceberg_full["time"] / metrics_hive_full["time"]
print(raw_time_ratio_full.mean())
pg.ttest(list(raw_time_ratio_full), 1, correction=True)

In [None]:
# Full load: row-by-row Iceberg/Hive ratio of the raw data scanned, tested against 1.
raw_scanned_ratio_full = metrics_iceberg_full["data_scanned"] / metrics_hive_full["data_scanned"]
print(raw_scanned_ratio_full.mean())
pg.ttest(list(raw_scanned_ratio_full), 1, correction=True)

## Incremental load

In [None]:
# Aggregate the Hive incremental-load runs; the per-query summary is the cell output.
(
    per_query_hive_incremental,
    per_group_hive_incremental,
    per_query_arr_hive_incremental,
    metrics_hive_incremental,
) = calculate_key_metrics_from_query_metrics(queries_incremental_hive)
per_query_hive_incremental

In [None]:
# Aggregate the Iceberg incremental-load runs; the per-query summary is the cell output.
(
    per_query_iceberg_incremental,
    per_group_iceberg_incremental,
    per_query_arr_iceberg_incremental,
    metrics_iceberg_incremental,
) = calculate_key_metrics_from_query_metrics(queries_incremental_iceberg)
per_query_iceberg_incremental

In [None]:
# Aggregate the Iceberg rewrite runs; the per-query summary is the cell output.
(
    per_query_iceberg_rewrite,
    per_group_iceberg_rewrite,
    per_query_arr_iceberg_rewrite,
    metrics_iceberg_rewrite,
) = calculate_key_metrics_from_query_metrics(queries_rewrite_iceberg)
per_query_iceberg_rewrite

In [None]:
# Column-wise totals of the per-query summaries:
# Iceberg incremental, Hive incremental, Iceberg rewrite.
iceberg_incremental_totals = per_query_iceberg_incremental.sum()
hive_incremental_totals = per_query_hive_incremental.sum()
iceberg_rewrite_totals = per_query_iceberg_rewrite.sum()
iceberg_incremental_totals, hive_incremental_totals, iceberg_rewrite_totals

### Plotting Iceberg v Hive

In [None]:
# Incremental load: average run time per query, Hive next to Iceberg.
queries = per_query_hive_incremental["query_type"]  # tick labels, one per query

hive_mean = per_query_hive_incremental["avg_time"]
hive_stdev = per_query_hive_incremental["stddev_time"]
iceberg_mean = per_query_iceberg_incremental["avg_time"]
iceberg_stdev = per_query_iceberg_incremental["stddev_time"]

# Error-bar half-width: 1.96 standard deviations of the run times.
# NOTE(review): no division by sqrt(number of runs) is applied, so this is the
# spread of individual runs rather than a confidence interval of the mean --
# confirm which one is intended.
confidence_95 = 1.96
hive_conf = confidence_95 * hive_stdev
iceberg_conf = confidence_95 * iceberg_stdev

x = np.arange(len(queries))  # one slot per query
width = 0.35                 # bar width

fig, ax = plt.subplots(figsize=(10, 6))

# One bar group per engine, shifted half a bar width left/right of the slot.
bar_groups = [
    ax.bar(x + shift, mean, width, yerr=conf, label=label, capsize=5, alpha=0.8, error_kw={'alpha': 0.5})
    for shift, mean, conf, label in (
        (-width / 2, hive_mean, hive_conf, 'Hive'),
        (width / 2, iceberg_mean, iceberg_conf, 'Iceberg'),
    )
]

# Write each bar's height (one decimal) slightly above the bar.
for group in bar_groups:
    for bar in group:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height + 1, f"{height:.1f}", ha='center', va='bottom', fontsize=10)

# Axis labels, ticks and legend.
ax.set_xlabel('Query', fontsize=12)
ax.set_ylabel('Time (s)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(queries, rotation=45, ha='right', fontsize=10)
ax.legend(fontsize=12)

plt.tight_layout()
plt.show()


In [None]:
# Two-sample t-test (pingouin, correction=True) of Iceberg vs Hive incremental
# run times; only the T statistic, the p-value and the mean ratio are kept.
results = []
for i in range(len(per_query_arr_hive_incremental)):
    hive_row = per_query_arr_hive_incremental.iloc[i]
    iceberg_row = per_query_arr_iceberg_incremental.iloc[i]
    # Mean of the element-wise Iceberg/Hive time ratios for this query.
    mean = (iceberg_row["time"] / hive_row["time"]).mean()
    result = pg.ttest(list(iceberg_row["time"]), list(hive_row["time"]), correction=True)
    row = {
        "query_type": hive_row["query_type"],
        "T": result["T"]["T-test"],
        "p-val": result["p-val"]["T-test"],
        "mean": mean,
    }
    results.append(row)
pd.DataFrame(results)

In [None]:
# Incremental load: per-query Iceberg/Hive ratio of average times, tested against 1.
avg_time_ratio_incremental = per_query_iceberg_incremental["avg_time"] / per_query_hive_incremental["avg_time"]
print(avg_time_ratio_incremental.mean())
pg.ttest(list(avg_time_ratio_incremental), 1, correction=True)

In [None]:
# Incremental load: per-query Iceberg/Hive ratio of average data scanned, tested against 1.
avg_scanned_ratio_incremental = (
    per_query_iceberg_incremental["avg_data_scanned"] / per_query_hive_incremental["avg_data_scanned"]
)
print(avg_scanned_ratio_incremental.mean())
pg.ttest(list(avg_scanned_ratio_incremental), 1, correction=True)

In [None]:
# Incremental load: row-by-row Iceberg/Hive ratio of the raw times, tested against 1.
raw_time_ratio_incremental = metrics_iceberg_incremental["time"] / metrics_hive_incremental["time"]
print(raw_time_ratio_incremental.mean())
pg.ttest(list(raw_time_ratio_incremental), 1, correction=True)

In [None]:
# Incremental load: row-by-row Iceberg/Hive ratio of the raw data scanned, tested against 1.
raw_scanned_ratio_incremental = metrics_iceberg_incremental["data_scanned"] / metrics_hive_incremental["data_scanned"]
print(raw_scanned_ratio_incremental.mean())
pg.ttest(list(raw_scanned_ratio_incremental), 1, correction=True)

In [None]:
# Hive: per-query incremental/full ratio of average data scanned, tested against 1.
hive_scanned_ratio = per_query_hive_incremental["avg_data_scanned"] / per_query_hive_full["avg_data_scanned"]
print(hive_scanned_ratio.mean())
pg.ttest(list(hive_scanned_ratio), 1, correction=True)

In [None]:
# Iceberg: per-query incremental/full ratio of average data scanned, tested against 1.
iceberg_scanned_ratio = per_query_iceberg_incremental["avg_data_scanned"] / per_query_iceberg_full["avg_data_scanned"]
print(iceberg_scanned_ratio.mean())
pg.ttest(list(iceberg_scanned_ratio), 1, correction=True)

## Full X Incremental X Rewrite

In [None]:
# Average run time per query for four scenarios:
# Hive incremental, Hive full, Iceberg incremental, Iceberg full.
queries = per_query_hive_incremental["query_type"]  # tick labels, one per query

series1_mean = per_query_hive_incremental["avg_time"]
series2_mean = per_query_hive_full["avg_time"]
series3_mean = per_query_iceberg_incremental["avg_time"]
series4_mean = per_query_iceberg_full["avg_time"]

series1_stdev = per_query_hive_incremental["stddev_time"]
series2_stdev = per_query_hive_full["stddev_time"]
series3_stdev = per_query_iceberg_incremental["stddev_time"]
series4_stdev = per_query_iceberg_full["stddev_time"]

# 1.96 standard deviations per series.
# NOTE(review): these values are computed but never passed to ax.bar as yerr,
# so this chart has no error bars -- confirm whether that is intentional.
confidence_95 = 1.96
series1_conf = confidence_95 * series1_stdev
series2_conf = confidence_95 * series2_stdev
series3_conf = confidence_95 * series3_stdev
series4_conf = confidence_95 * series4_stdev

x = np.arange(len(queries))  # one slot per query
width = 0.2                  # bar width; four bars share one slot

fig, ax = plt.subplots(figsize=(12, 7))

# Four bar groups per slot, centred at -1.5, -0.5, +0.5 and +1.5 bar widths.
bars1, bars2, bars3, bars4 = [
    ax.bar(x + factor * width, mean, width, label=label, capsize=5, alpha=0.8, error_kw={'alpha': 0.5})
    for factor, mean, label in (
        (-1.5, series1_mean, 'Hive_incremental'),
        (-0.5, series2_mean, 'Hive_padrao'),
        (0.5, series3_mean, 'Iceberg_incremental'),
        (1.5, series4_mean, 'Iceberg_padrao'),
    )
]

# Write each bar's height (one decimal) directly on top of the bar.
for group in (bars1, bars2, bars3, bars4):
    for bar in group:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height, f"{height:.1f}", ha='center', va='bottom', fontsize=10)

# Axis labels, ticks and legend.
ax.set_xlabel('Query', fontsize=12)
ax.set_ylabel('Time (s)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(queries, rotation=45, ha='right', fontsize=10)
ax.legend(fontsize=12)

plt.tight_layout()
plt.show()


In [None]:
# Iceberg only: average run time per query for the incremental, rewrite and
# full scenarios.
queries = per_query_hive_incremental["query_type"]  # tick labels, one per query

series1_mean = per_query_iceberg_incremental["avg_time"]
series2_mean = per_query_iceberg_rewrite["avg_time"]
series3_mean = per_query_iceberg_full["avg_time"]

series1_stdev = per_query_iceberg_incremental["stddev_time"]
series2_stdev = per_query_iceberg_rewrite["stddev_time"]
series3_stdev = per_query_iceberg_full["stddev_time"]

# 1.96 standard deviations per series.
# NOTE(review): these values are computed but never passed to ax.bar as yerr,
# so this chart has no error bars -- confirm whether that is intentional.
confidence_95 = 1.96
series1_conf = confidence_95 * series1_stdev
series2_conf = confidence_95 * series2_stdev
series3_conf = confidence_95 * series3_stdev

x = np.arange(len(queries))  # one slot per query
width = 0.25                 # bar width; three bars share one slot

fig, ax = plt.subplots(figsize=(12, 7))

# Three bar groups per slot: one bar width to the left, centred, and to the right.
bars1, bars2, bars3 = [
    ax.bar(position, mean, width, label=label, capsize=5, alpha=0.8, error_kw={'alpha': 0.5})
    for position, mean, label in (
        (x - width, series1_mean, 'Iceberg_incremental'),
        (x, series2_mean, 'Iceberg_optimize'),
        (x + width, series3_mean, 'Iceberg_padrao'),
    )
]

# Write each bar's height (one decimal) directly on top of the bar.
for group in (bars1, bars2, bars3):
    for bar in group:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height, f"{height:.1f}", ha='center', va='bottom', fontsize=10)

# Axis labels, ticks and legend.
ax.set_xlabel('Query', fontsize=12)
ax.set_ylabel('Time (s)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(queries, rotation=45, ha='right', fontsize=10)
ax.legend(fontsize=12)

plt.tight_layout()
plt.show()


In [None]:
# Iceberg: row-by-row incremental/rewrite ratio of the raw times, tested against 1.
rewrite_time_ratio = metrics_iceberg_incremental["time"] / metrics_iceberg_rewrite["time"]
print(rewrite_time_ratio.mean())
pg.ttest(list(rewrite_time_ratio), 1, correction=True)

In [None]:
# Iceberg: per-query incremental/rewrite ratio of average data scanned, tested against 1.
rewrite_avg_scanned_ratio = (
    per_query_iceberg_incremental["avg_data_scanned"] / per_query_iceberg_rewrite["avg_data_scanned"]
)
print(rewrite_avg_scanned_ratio.mean())
pg.ttest(list(rewrite_avg_scanned_ratio), 1, correction=True)

In [None]:
# Iceberg: row-by-row incremental/rewrite ratio of the raw data scanned, tested against 1.
rewrite_raw_scanned_ratio = metrics_iceberg_incremental["data_scanned"] / metrics_iceberg_rewrite["data_scanned"]
print(rewrite_raw_scanned_ratio.mean())
pg.ttest(list(rewrite_raw_scanned_ratio), 1, correction=True)