# Optimizations for PIT-joins

This notebook benchmarks the existing point-in-time (PIT) join and several optimizations of it. It first measures the unoptimized PIT-join as a baseline and then evaluates pre-sorting, pre-partitioning (bucketing), and the combination of both.

## 0. Data preparations

In [1]:
# Imports
import datetime
from pyspark.sql import DataFrame, Row, SparkSession, Window, SQLContext
from pyspark.sql.types import StructField, IntegerType, StringType, StructType
from pyspark.sql.functions import unix_timestamp, from_unixtime, col
from pyspark.sql import functions as F
from hops import hdfs as hdfs
from sparkmeasure import StageMetrics


Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
21,application_1642582607798_0020,pyspark,idle,Link,Link


SparkSession available as 'spark'.


In [2]:
# Small hand-written sample rows, three values per row: two integers followed
# by a string (presumably id, ts and a value, matching the fgN schemas - see
# the commented-out createDataFrame cell).
data1 = [
    [1, 5, "1x"],
    [1, 7, "1y"],
    [1, 4, "1z"],
    [2, 6, "2x"],
    [2, 8, "2y"],
]

# NOTE(review): data2 has exactly the same rows as data1 - confirm this is intended.
data2 = [
    [1, 5, "1x"],
    [1, 7, "1y"],
    [1, 4, "1z"],
    [2, 6, "2x"],
    [2, 8, "2y"],
]

data3 = [
    [1, 10, "f3-1-10"],
    [1, 1, "f3-1-1"],
    [1, 6, "f3-1-6"],
    [2, 2, "f3-2-2"],
    [2, 8, "f3-2-8"],
]


In [3]:
# Obtain a SparkSession with Hive support enabled. getOrCreate() returns an
# already-running session if there is one; otherwise a new session is built
# with the master / app name configured here.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Three Way PIT")
    .enableHiveSupport()
    .getOrCreate()
)

# SQLContext's first parameter is a SparkContext, not a SparkSession; pass the
# session explicitly so the context is bound to the session created above.
sql_context = SQLContext(spark.sparkContext, sparkSession=spark)

In [27]:
# sparkmeasure stage-metrics collector bound to the Spark session. This cell has
# to be executed before any experiment() call, which uses the global
# `stagemetrics` name (begin/end/create_stagemetrics_DF).
stagemetrics = StageMetrics(spark)

In [5]:
# Used for storing of tables: make the database named after the current
# project the session's current database.
PROJECT_NAME = hdfs.project_name()
spark.sql("use " + PROJECT_NAME)

DataFrame[]

In [6]:
# Schema of the label table: integer id, integer ts and a string label,
# all declared non-nullable.
fg1_schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("ts", IntegerType(), False)
    .add("label", StringType(), False)
)

# Schema of the first feature group; same layout with "_2"-suffixed key columns.
fg2_schema = (
    StructType()
    .add("id_2", IntegerType(), False)
    .add("ts_2", IntegerType(), False)
    .add("f2", StringType(), False)
)

# Schema of the second feature group; same layout with "_3"-suffixed key columns.
fg3_schema = (
    StructType()
    .add("id_3", IntegerType(), False)
    .add("ts_3", IntegerType(), False)
    .add("f3", StringType(), False)
)

In [7]:
# fg1 = spark.createDataFrame(data1, schema=fg1_schema) 
# fg2 = spark.createDataFrame(data2, schema=fg2_schema) 
# fg3 = spark.createDataFrame(data3, schema=fg3_schema) 

In [8]:
# Directory (inside the project's HDFS space) that holds the example CSV files
DATA_PATH = "hdfs:///Projects/" + hdfs.project_name() + "/Jupyter/PIT-joins/example-data"

# Every frame is read with its explicit schema, ordered by its ts column
# (largest ts first) and marked for caching; the count() that follows is an
# action, which forces the cache to actually be populated.
fg1 = (
    spark.read
    .csv(DATA_PATH + "/100000-20-1-out.csv", schema=fg1_schema, header=True)
    .orderBy(F.desc("ts"))
    .persist()
)
fg1.count()

fg2 = (
    spark.read
    .csv(DATA_PATH + "/100000-20-2-out.csv", schema=fg2_schema, header=True)
    .orderBy(F.desc("ts_2"))
    .persist()
)
fg2.count()

# NOTE(review): fg3 is read from the same file as fg2 ("100000-20-2-out.csv"),
# only with different column names - confirm this is intended.
fg3 = (
    spark.read
    .csv(DATA_PATH + "/100000-20-2-out.csv", schema=fg3_schema, header=True)
    .orderBy(F.desc("ts_3"))
    .persist()
)
fg3.count()


36779

In [9]:
fg1.show()

+-----+----+-----+
|   id|  ts|label|
+-----+----+-----+
|98162|1400|   f1|
|98163|1400|   f1|
|98164|1400|   f1|
|98165|1400|   f1|
|98166|1400|   f1|
|98167|1400|   f1|
|98168|1400|   f1|
|98169|1400|   f1|
|98170|1400|   f1|
|98171|1400|   f1|
|98172|1400|   f1|
|98173|1400|   f1|
|98174|1400|   f1|
|98175|1400|   f1|
|98176|1400|   f1|
|98177|1400|   f1|
|98178|1400|   f1|
|98179|1400|   f1|
|98180|1400|   f1|
|98181|1400|   f1|
+-----+----+-----+
only showing top 20 rows

# 1. Regular PIT-join

In [10]:
def pit_join(join_hint, label_data, *fgs):
    """Point-in-time join of the label data against one or more feature groups.

    Args:
        join_hint: Join strategy hint (e.g. "BROADCAST" or "MERGE"). It is
            applied to the left-hand side of every join.
        label_data: DataFrame with the columns ``id`` and ``ts``.
        *fgs: Feature-group DataFrames. The i-th one (0-based) must contain
            the columns ``id_{i+2}`` and ``ts_{i+2}``.

    Returns:
        The number of rows left after keeping only the rank-1 row(s) of every
        (id, ts) partition.
    """
    # 1. Join the data: same id, and feature timestamp not after the label timestamp
    joined_data = label_data
    for i, fg in enumerate(fgs):
        fg_id = "id_{}".format(i + 2)
        fg_ts = "ts_{}".format(i + 2)
        # Index the columns by name instead of getattr(): attribute access
        # breaks for column names that collide with DataFrame attributes.
        join_condition = (label_data["id"] == fg[fg_id]) & (label_data["ts"] >= fg[fg_ts])
        joined_data = joined_data.hint(join_hint).join(fg, join_condition)

    # 2. Create window for partitioning and ordering the data
    #    (latest feature timestamps first within every label row)
    order_by_param = [F.desc("ts_{}".format(i + 2)) for i in range(len(fgs))]
    win = Window.partitionBy("id", "ts").orderBy(*order_by_param)

    # 3. Rank the rows of each partition. rank() gives tied rows the same
    #    rank, so more than one row per partition can end up with rank 1.
    ranked_data = joined_data.withColumn("rank", F.rank().over(win))

    # 4. Take only the rows with rank == 1, for each partition
    filtered_data = ranked_data.filter(F.col("rank") == 1)

    # The intermediate frames are never persisted, so there is nothing to
    # unpersist; count() is the action that triggers the whole computation.
    return filtered_data.count()

In [32]:
# Number of repetitions per join hint
NO_RUNS = 1
# Join strategy hints that are compared against each other
HINTS = ["BROADCAST", "MERGE"]

def experiment(label_data, *fgs):
    """Run pit_join once per hint in HINTS (NO_RUNS times each) and show stage metrics.

    Args:
        label_data: Label DataFrame handed through to pit_join.
        *fgs: Feature-group DataFrames handed through to pit_join.

    Every pit_join call is bracketed by stagemetrics.begin() / end(). After
    the runs of one hint, the first NO_RUNS rows of the stage-metrics
    DataFrame are printed. Nothing is returned.
    """
    for join_hint in HINTS:
        print("Running with {}".format(join_hint))
        for _ in range(NO_RUNS):
            stagemetrics.begin()
            pit_join(join_hint, label_data, *fgs)
            stagemetrics.end()
            # (stagemetrics.print_report() was left disabled here)
        # NOTE(review): the metrics frame is built once per hint, after the
        # run loop, and is cut to NO_RUNS rows - confirm that the rows really
        # correspond to runs (and not to individual stages).
        stagemetrics.create_stagemetrics_DF("PerfStageMetrics").limit(NO_RUNS).show()

In [33]:
# One feature group

experiment(fg1, fg2)

Running with BROADCAST
+-----+--------------------+-------+--------------------+--------------+--------------+-------------+--------+---------------+---------------+-----------------------+--------------------------+-----------------------+---------+----------+----------------+------------------+-------------------+-----------+---------+--------------+------------+--------------------+---------------------+-------------------------+-------------------------+--------------------------+---------------------+----------------------+----------------------------+------------------+----------------+-------------------+---------------------+
|jobId|            jobGroup|stageId|                name|submissionTime|completionTime|stageDuration|numTasks|executorRunTime|executorCpuTime|executorDeserializeTime|executorDeserializeCpuTime|resultSerializationTime|jvmGCTime|resultSize|diskBytesSpilled|memoryBytesSpilled|peakExecutionMemory|recordsRead|bytesRead|recordsWritten|bytesWritten|shuffleFetchWa

In [13]:
# Two feature groups

experiment(fg1, fg2, fg3)

An error was encountered:
name 'stagemetrics' is not defined
Traceback (most recent call last):
  File "<stdin>", line 8, in experiment
NameError: name 'stagemetrics' is not defined



In [14]:
df = stagemetrics.create_stagemetrics_DF("PerfStageMetrics")
df.show()

An error was encountered:
name 'stagemetrics' is not defined
Traceback (most recent call last):
NameError: name 'stagemetrics' is not defined



# 2. Pre-sorted

In [15]:
# Simulate pre-sorting of data
# Data is already stored pre-sorted, so the files are read without any extra sort
DATA_PATH = "hdfs:///Projects/" + hdfs.project_name() + "/Jupyter/PIT-joins/example-data"

# persist() is lazy: count() must be called on the *sorted_* frame itself so
# that its cache is populated before the experiments run. (Counting fg1/fg2/fg3
# here would leave the sorted_* frames uncached.)
sorted_fg1 = spark.read.csv(
    DATA_PATH + "/100000-20-1-out.csv", header=True, schema=fg1_schema
).persist()
sorted_fg1.count()

sorted_fg2 = spark.read.csv(
    DATA_PATH + "/100000-20-2-out.csv", header=True, schema=fg2_schema
).persist()
sorted_fg2.count()


# NOTE(review): sorted_fg3 is read from the same file as sorted_fg2
# ("100000-20-2-out.csv") - confirm this is intended.
sorted_fg3 = spark.read.csv(
    DATA_PATH + "/100000-20-2-out.csv", header=True, schema=fg3_schema
).persist()
sorted_fg3.count()

36779

In [16]:
sorted_fg1.show()

+-----+----+-----+
|   id|  ts|label|
+-----+----+-----+
|98162|1040|   f1|
|98162|1060|   f1|
|98162|1080|   f1|
|98162|1100|   f1|
|98162|1120|   f1|
|98162|1140|   f1|
|98162|1160|   f1|
|98162|1180|   f1|
|98162|1200|   f1|
|98162|1220|   f1|
|98162|1240|   f1|
|98162|1260|   f1|
|98162|1280|   f1|
|98162|1300|   f1|
|98162|1320|   f1|
|98162|1340|   f1|
|98162|1360|   f1|
|98162|1380|   f1|
|98162|1400|   f1|
|98163|1020|   f1|
+-----+----+-----+
only showing top 20 rows

In [17]:
# One feature group

experiment(sorted_fg1, sorted_fg2)

An error was encountered:
name 'stagemetrics' is not defined
Traceback (most recent call last):
  File "<stdin>", line 8, in experiment
NameError: name 'stagemetrics' is not defined



In [18]:
# Two feature groups

experiment(sorted_fg1, sorted_fg2, sorted_fg3)

An error was encountered:
name 'stagemetrics' is not defined
Traceback (most recent call last):
  File "<stdin>", line 8, in experiment
NameError: name 'stagemetrics' is not defined



# 3. Pre-partitioning

In [19]:
# Partition the data based on id: every frame is saved as a table that is
# bucketed into 4 buckets on its id column; an existing table of the same
# name is overwritten.
fg1.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed")
fg2.write.mode("overwrite").bucketBy(4, "id_2").saveAsTable("fg2_bucketed")
fg3.write.mode("overwrite").bucketBy(4, "id_3").saveAsTable("fg3_bucketed")

In [20]:
# Read the bucketed tables back and mark them for caching.
# NOTE(review): unlike the CSV-loading cells there is no count() here, so the
# cache is presumably only filled by the first experiment that touches these
# frames - confirm this does not skew the first measurement.
bucketed_fg1 = spark.table("fg1_bucketed").persist()
bucketed_fg2 = spark.table("fg2_bucketed").persist()
bucketed_fg3 = spark.table("fg3_bucketed").persist()

In [21]:
# One feature group

experiment(bucketed_fg1, bucketed_fg2)

An error was encountered:
name 'stagemetrics' is not defined
Traceback (most recent call last):
  File "<stdin>", line 8, in experiment
NameError: name 'stagemetrics' is not defined



In [22]:
# Two feature groups

experiment(bucketed_fg1, bucketed_fg2, bucketed_fg3)

An error was encountered:
name 'stagemetrics' is not defined
Traceback (most recent call last):
  File "<stdin>", line 8, in experiment
NameError: name 'stagemetrics' is not defined



# 4. Pre-partitioning and pre-sorting

In [23]:
# Partition the pre-sorted data based on id: every frame is saved as a table
# bucketed into 4 buckets on its id column, overwriting an existing table.
# NOTE(review): no sortBy(...) is given to the writer; whether the row order
# of the pre-sorted input survives the bucketed write is not visible here -
# confirm that the stored tables are really sorted.
sorted_fg1.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed_sorted")
sorted_fg2.write.mode("overwrite").bucketBy(4, "id_2").saveAsTable("fg2_bucketed_sorted")
sorted_fg3.write.mode("overwrite").bucketBy(4, "id_3").saveAsTable("fg3_bucketed_sorted")

In [24]:
# Read the bucketed tables written from the pre-sorted frames and mark them
# for caching.
# NOTE(review): no action is run here, so the cache is presumably only filled
# by the first experiment that uses these frames - confirm.
sorted_bucketed_fg1 = spark.table("fg1_bucketed_sorted").persist()
sorted_bucketed_fg2 = spark.table("fg2_bucketed_sorted").persist()
sorted_bucketed_fg3 = spark.table("fg3_bucketed_sorted").persist()

In [25]:
# One feature group

experiment(sorted_bucketed_fg1, sorted_bucketed_fg2)

An error was encountered:
name 'stagemetrics' is not defined
Traceback (most recent call last):
  File "<stdin>", line 8, in experiment
NameError: name 'stagemetrics' is not defined



In [26]:
# Two feature groups

experiment(sorted_bucketed_fg1, sorted_bucketed_fg2, sorted_bucketed_fg3)

An error was encountered:
name 'stagemetrics' is not defined
Traceback (most recent call last):
  File "<stdin>", line 8, in experiment
NameError: name 'stagemetrics' is not defined

