# 0. Configuration

In [1]:
# Hops
import hsfs
from hops import hdfs
# Spark
from pyspark.sql import DataFrame, SQLContext
from pyspark.sql.functions import col, lit
# Implementations
from ackuq.pit import PitContext
# SparkMeasure
from sparkmeasure import StageMetrics

Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
67,application_1646303173729_0026,pyspark,idle,Link,Link


SparkSession available as 'spark'.


In [2]:
# Open a connection through hsfs and fetch the feature store handle from it.
connection = hsfs.connection()
fs = connection.get_feature_store()

# Storage connector to s3
# NOTE(review): `sc` is the storage connector here, not a SparkContext;
# later cells read `sc.bucket` and call `sc.prepare_spark()` on it.
sc = fs.get_storage_connector("experiment-s3")

Connected. Call `.close()` to terminate connection gracefully.

In [3]:
# Issue a SQL `use` statement with the name returned by hdfs.project_name(),
# so unqualified table names in later cells resolve against it.
spark.sql("use {}".format(hdfs.project_name()))

DataFrame[]

In [4]:
# Let the storage connector prepare the Spark session.
# NOTE(review): presumably this configures S3 access for the `s3a://` reads
# below - confirm against the hsfs storage connector documentation.
sc.prepare_spark()

In [5]:
# PitContext is constructed from an SQLContext wrapping the session's SparkContext.
# `pit_context` is the entry point for all PIT join variants used below.
sql_context = SQLContext(spark.sparkContext)
pit_context = PitContext(sql_context)

In [6]:
# Single shared sparkMeasure collector; every experiment function brackets its
# Spark action with stage_metrics.begin() / stage_metrics.end().
stage_metrics = StageMetrics(spark)

# 1. Experiment declarations

In [7]:
def exploding_experiment(left: DataFrame, right: DataFrame):
    """Run one metrics-instrumented exploding PIT join.

    Calls ``pit_context.exploding`` with the ``ts`` column of each side as the
    timestamp column and the ``(left["id"], right["id"])`` pair as the only
    partition columns, then forces evaluation with ``count()``. Collection on
    the shared ``stage_metrics`` is started before and stopped after that work.

    Args:
        left: Left-hand DataFrame; must expose ``id`` and ``ts`` columns.
        right: Right-hand DataFrame; must expose ``id`` and ``ts`` columns.

    Returns:
        None. The row count is discarded; only the collected metrics matter.
    """
    stage_metrics.begin()
    exploded = pit_context.exploding(
        left=left,
        right=right,
        left_ts_column=left["ts"],
        right_ts_column=right["ts"],
        partition_cols=[(left["id"], right["id"])],
    )
    # count() is the action that actually executes the join.
    exploded.count()
    stage_metrics.end()

In [8]:
def union_experiment(left: DataFrame, right: DataFrame):
    """Run one metrics-instrumented union PIT join.

    Calls ``pit_context.union`` with ``"ts"`` as the timestamp column name on
    both sides, ``"id"`` as the only partition column and ``"right_"`` as the
    prefix for the right side, then forces evaluation with ``count()``.
    Collection on the shared ``stage_metrics`` is started before and stopped
    after that work.

    Args:
        left: Left-hand DataFrame; must expose ``id`` and ``ts`` columns.
        right: Right-hand DataFrame; must expose ``id`` and ``ts`` columns.

    Returns:
        None. The row count is discarded; only the collected metrics matter.
    """
    stage_metrics.begin()
    unioned = pit_context.union(
        left=left,
        right=right,
        right_prefix="right_",
        left_ts_column="ts",
        right_ts_column="ts",
        partition_cols=["id"],
    )
    # count() is the action that actually executes the join.
    unioned.count()
    stage_metrics.end()

In [9]:
def early_stop_sort_merge(left: DataFrame, right: DataFrame):
    """Run one metrics-instrumented join using the ``pit_udf`` join condition.

    Joins ``left`` with ``right`` on ``pit_context.pit_udf(left.ts, right.ts)``
    combined (logical AND) with equality of the ``id`` columns, then forces
    evaluation with ``count()``. Collection on the shared ``stage_metrics`` is
    started before and stopped after that work.

    Args:
        left: Left-hand DataFrame; must expose ``id`` and ``ts`` columns.
        right: Right-hand DataFrame; must expose ``id`` and ``ts`` columns.

    Returns:
        None. The row count is discarded; only the collected metrics matter.
    """
    stage_metrics.begin()
    pit_match = pit_context.pit_udf(left["ts"], right["ts"])
    same_id = left["id"] == right["id"]
    joined = left.join(right, pit_match & same_id)
    # count() is the action that actually executes the join.
    joined.count()
    stage_metrics.end()

# 2. Run experiments

In [10]:
# Experiment functions to benchmark; each one takes (left, right).
FUNCTIONS = [
    exploding_experiment,
    union_experiment,
    early_stop_sort_merge,
]

# Dataset directory names, one per size, e.g. "/10000-1_year".
DATASETS = [
    "/{}-1_year".format(row_count)
    for row_count in (10_000, 100_000, 1_000_000, 10_000_000)
]

# How many times do_run() repeats each experiment function.
NO_RUNS = 20

# Sub-directories of the base directory, one per physical layout of the data.
DATASET_STRUCTURE = ["/raw", "/sorted-asc", "/sorted-desc"]

# Input root (bucket of the storage connector) and output root (project HDFS).
S3_BASE_DIRECTORY = "s3a://" + sc.bucket + "/axel_experiments"
OUTPUT_PATH = "hdfs:///Projects/" + hdfs.project_name() + "/Jupyter/PIT-joins/experiment/output"

In [11]:
def do_run(left, right, function, output_path):
    """Repeat one experiment ``NO_RUNS`` times and write its metrics as CSV.

    After every call of ``function(left, right)`` the aggregated stage metrics
    are fetched from the shared ``stage_metrics`` collector and tagged with a
    1-based ``runNumber`` column. The per-run frames are unioned and written
    as a single-partition CSV with a header row, overwriting ``output_path``.

    Args:
        left: Left-hand DataFrame handed to ``function``.
        right: Right-hand DataFrame handed to ``function``.
        function: Experiment callable taking ``(left, right)``.
        output_path: Destination path of the CSV output.
    """
    combined_df = None
    for run_number in range(1, NO_RUNS + 1):
        function(left, right)
        # The returned frame is not used here, but the call is kept.
        # NOTE(review): presumably it registers the data that
        # aggregate_stagemetrics_DF() reads - confirm in the sparkMeasure docs.
        stage_metrics.create_stagemetrics_DF()
        run_metrics = stage_metrics.aggregate_stagemetrics_DF().withColumn(
            "runNumber", lit(run_number)
        )
        combined_df = run_metrics if combined_df is None else combined_df.union(run_metrics)
    # One partition, so the result lands in a single CSV part file.
    single_partition = combined_df.repartition(1)
    single_partition.write.mode("overwrite").option("header", True).csv(output_path)

In [34]:
# Run every experiment function on every (layout, size) combination.
for structure in DATASET_STRUCTURE:
    for dataset in DATASETS:
        dataset_path = S3_BASE_DIRECTORY + structure + dataset
        left = spark.read.format("parquet").load(dataset_path + "/left.parquet").persist()
        right = spark.read.format("parquet").load(dataset_path + "/right.parquet").persist()
        try:
            # Materialize both caches up front so that reading from S3 is not
            # part of the measured experiment runs.
            left.count()
            right.count()
            for function in FUNCTIONS:
                output_path = OUTPUT_PATH + "/{}/{}/{}.csv".format(structure, dataset, function.__name__)
                do_run(left, right, function, output_path)
        finally:
            # Release the cached inputs before loading the next dataset (also
            # on interrupt or failure), so cached data from earlier datasets
            # does not pile up and compete for memory with later measurements.
            left.unpersist()
            right.unpersist()

KeyboardInterrupt: 

# 3. Run bucketing experiments

In [13]:
# Three different bucket sizes
BUCKETS = [20, 40, 80]

# Only do RAW dataset
structure = DATASET_STRUCTURE[0]

for bucket in BUCKETS:
    for dataset in DATASETS:
        dataset_path = S3_BASE_DIRECTORY + structure + dataset

        # Rewrite each side as a table bucketed on "id" with `bucket` buckets.
        # The raw frames are only written once, so they are not cached.
        left_raw = spark.read.format("parquet").load(dataset_path + "/left.parquet")
        left_raw.write.bucketBy(bucket, "id").mode("overwrite").saveAsTable("left")

        right_raw = spark.read.format("parquet").load(dataset_path + "/right.parquet")
        right_raw.write.bucketBy(bucket, "id").mode("overwrite").saveAsTable("right")

        left = spark.table("left").persist()
        right = spark.table("right").persist()
        try:
            # Materialize both caches before any measured run.
            left.count()
            right.count()

            for function in FUNCTIONS:
                output_path = OUTPUT_PATH + "/{}_bucketed_{}/{}/{}.csv".format(structure, bucket, dataset, function.__name__)
                do_run(left, right, function, output_path)
        finally:
            # Drop the cached tables before the next iteration overwrites the
            # "left"/"right" tables they were read from.
            left.unpersist()
            right.unpersist()

10000
495825

In [23]:
import copy
# Build `left_big` from the rows of `right` by going through its RDD and
# re-applying a deep copy of its schema.
# NOTE(review): presumably done so that `left_big` gets column references
# distinct from `right` for the join of `right` with itself below - confirm.
_schema = copy.deepcopy(right.schema)
left_big = right.rdd.toDF(_schema)

In [29]:
# Row count of whatever `left` currently refers to in the kernel (set by the
# most recently executed loop cell above).
left.count()

10000

In [None]:
# Ad-hoc check: result size of the union PIT join on the current left/right.
union_joined = pit_context.union(
    left=left,
    right=right,
    right_prefix="right_",
    left_ts_column="ts",
    right_ts_column="ts",
    partition_cols=["id"],
)
union_joined.count()

In [34]:
# Ad-hoc check: exploding PIT join of `left_big` (with a "MERGE" join hint)
# against `right`, showing the resulting row count.
hinted_left = left_big.hint("MERGE")
exploded_big = pit_context.exploding(
    left=hinted_left,
    right=right,
    left_ts_column=left_big["ts"],
    right_ts_column=right["ts"],
    partition_cols=[(left_big["id"], right["id"])],
)
exploded_big.count()

490573

In [35]:
# Ad-hoc check: result size of the pit_udf-based join on the current left/right.
pit_match = pit_context.pit_udf(left["ts"], right["ts"])
same_id = left["id"] == right["id"]
left.join(right, pit_match & same_id).count()

7425