# Optimizations for PIT-joins

This notebook will consist of several optimizations for the existing join method. Stuff that will be looked into is the unoptimized PIT-join as well as optimizations.

# 0. Data preparations

In [1]:
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import io.hops.util.Hops


Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
36,application_1642582607798_0035,spark,idle,Link,Link


SparkSession available as 'spark'.
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import io.hops.util.Hops


In [2]:
import io.github.ackuq._

import io.github.ackuq._


In [3]:
val fg1_schema = StructType(Array(
  StructField("id", IntegerType, false),
  StructField("ts", IntegerType, false),
  StructField("label", StringType, false)    
))

val fg2_schema = StructType(Array(
  StructField("id_2", IntegerType, false),
  StructField("ts_2", IntegerType, false),
  StructField("f2", StringType, false)    
))

val fg3_schema = StructType(Array(
  StructField("id_3", IntegerType, false),
  StructField("ts_3", IntegerType, false),
  StructField("f3", StringType, false)    
))

fg1_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,false), StructField(ts,IntegerType,false), StructField(label,StringType,false))
fg2_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id_2,IntegerType,false), StructField(ts_2,IntegerType,false), StructField(f2,StringType,false))
fg3_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id_3,IntegerType,false), StructField(ts_3,IntegerType,false), StructField(f3,StringType,false))


In [4]:
val spark = SparkSession.builder().master("local").appName("PIT Optimizations Scala").enableHiveSupport().getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@738e1931


In [5]:
spark.sql("use " + Hops.getProjectName)

res3: org.apache.spark.sql.DataFrame = []


In [6]:
val data1 = Seq(
    Row(1, 4, "1z"),
    Row(1, 5, "1x"),
    Row(2, 6, "2x"),
    Row(1, 7, "1y"),
    Row(2, 8, "2y")
)

val data2 = Seq(
    Row(1, 4, "1z"),
    Row(1, 5, "1x"),
    Row(2, 6, "2x"),
    Row(1, 7, "1y"),
    Row(2, 8, "2y")
)

val data3 = Seq(
    Row(1, 1, "f3-1-1"),
    Row(2, 2, "f3-2-2"),
    Row(1, 6, "f3-1-6"),
    Row(2, 8, "f3-2-8"),
    Row(1, 10, "f3-1-10")
)

data1: Seq[org.apache.spark.sql.Row] = List([1,4,1z], [1,5,1x], [2,6,2x], [1,7,1y], [2,8,2y])
data2: Seq[org.apache.spark.sql.Row] = List([1,4,1z], [1,5,1x], [2,6,2x], [1,7,1y], [2,8,2y])
data3: Seq[org.apache.spark.sql.Row] = List([1,1,f3-1-1], [2,2,f3-2-2], [1,6,f3-1-6], [2,8,f3-2-8], [1,10,f3-1-10])


In [7]:
/*
val fg1 = spark.createDataFrame(spark.sparkContext.parallelize(data1), schema=fg1_schema) 
val fg2 = spark.createDataFrame(spark.sparkContext.parallelize(data2), schema=fg2_schema) 
val fg3 = spark.createDataFrame(spark.sparkContext.parallelize(data3), schema=fg3_schema) 
*/

In [None]:
val DATA_PATH = "hdfs:///Projects/" + Hops.getProjectName + "/Jupyter/PIT-joins/example-data"


val fg1 = spark.read.option("header", true).schema(fg1_schema).csv(
    DATA_PATH + "/100000-20-1-out.csv"
).sort(desc("ts")).persist()
fg1.count()

val fg2 = spark.read.option("header", true).schema(fg2_schema).csv(
    DATA_PATH + "/100000-20-2-out.csv"
).sort(desc("ts_2")).persist()
fg2.count()

val fg3 = spark.read.option("header", true).schema(fg3_schema).csv(
    DATA_PATH + "/100000-20-2-out.csv"
).sort(desc("ts_3")).persist()
fg3.count()


In [None]:
fg1.show

# 1. Regular PIT-join

In [None]:
def pitJoin(joinHint: String, labelData: DataFrame, fgs: DataFrame*) : DataFrame = {
    var joinedData = labelData
    for ((fg, i) <- fgs.zipWithIndex) {
        val id = i + 2
        val fg_id = s"id_${id}"
        val fg_ts = s"ts_${id}"
        joinedData = joinedData.hint(joinHint).join(
            fg, (labelData("id") === fg(fg_id)) && (labelData("ts") >= fg(fg_ts))
        ).unpersist()
    }
    // 2. Create window for partitioning and ordering the data
    val orderByParams = 
        for (i <- 0 to fgs.length - 1)
        yield desc(s"ts_${i + 2}")

    val win = Window.partitionBy("id", "ts").orderBy(orderByParams :_*)

    // 3. Rank the rows of each partition
    val rankedData = joinedData.withColumn("rank", rank().over(win))

    // 4. Take only the columns with rank == 1, for each partition
    val filteredData = rankedData.filter(col("rank") === 1)
    
    filteredData
}

In [None]:
pitJoin("BROADCAST", fg1, fg2).show

In [None]:
val NO_RUNS = 10
val HINTS = List("BROADCAST", "MERGE")

def experiment(labelData: DataFrame, fgs: DataFrame*) : Unit = {
    for (hint <- HINTS) {
        println(s"Running with ${hint}")
        for (run <- 0 to NO_RUNS) {
            spark.time(pitJoin(hint, labelData, fgs :_*).count)
        }
    }
}

In [21]:
pitJoin("BROADCAST", fg1, fg2).explain(extended=true)
pitJoin("BROADCAST", fg1, fg2).count

== Parsed Logical Plan ==
'Filter ('rank = 1)
+- Project [id#0, ts#1, label#2, id_2#92, ts_2#93, f2#94, rank#10861]
   +- Project [id#0, ts#1, label#2, id_2#92, ts_2#93, f2#94, rank#10861, rank#10861]
      +- Window [rank(ts_2#93) windowspecdefinition(id#0, ts#1, ts_2#93 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank#10861], [id#0, ts#1], [ts_2#93 DESC NULLS LAST]
         +- Project [id#0, ts#1, label#2, id_2#92, ts_2#93, f2#94]
            +- Join Inner, ((id#0 = id_2#92) AND (ts#1 >= ts_2#93))
               :- ResolvedHint (strategy=broadcast)
               :  +- Sort [ts#1 DESC NULLS LAST], true
               :     +- Relation[id#0,ts#1,label#2] csv
               +- Sort [ts_2#93 DESC NULLS LAST], true
                  +- Relation[id_2#92,ts_2#93,f2#94] csv

== Analyzed Logical Plan ==
id: int, ts: int, label: string, id_2: int, ts_2: int, f2: string, rank: int
Filter (rank#10861 = 1)
+- Project [id#0, ts#1, label#2, id_2#92, ts

In [12]:
// One Feature group

experiment(fg1, fg2)

Running with BROADCAST
Time taken: 12893 ms
Time taken: 8960 ms
Time taken: 7949 ms
Time taken: 8183 ms
Time taken: 7470 ms
Time taken: 6957 ms
Time taken: 7147 ms
Time taken: 6705 ms
Time taken: 6560 ms
Time taken: 7296 ms
Time taken: 6240 ms
Running with MERGE
Time taken: 9180 ms
Time taken: 7893 ms
Time taken: 8463 ms
Time taken: 7567 ms
Time taken: 7843 ms
Time taken: 7935 ms
Time taken: 8201 ms
Time taken: 7659 ms
Time taken: 7557 ms
Time taken: 7913 ms
Time taken: 7766 ms


In [13]:
// Two feature groups

experiment(fg1, fg2, fg3)

Running with BROADCAST
Time taken: 18200 ms
Time taken: 17016 ms
Time taken: 17122 ms
Time taken: 17084 ms
Time taken: 17591 ms
Time taken: 17793 ms
Time taken: 17440 ms
Time taken: 17626 ms
Time taken: 17071 ms
Time taken: 17242 ms
Time taken: 17171 ms
Running with MERGE
Time taken: 15068 ms
Time taken: 13814 ms
Time taken: 13820 ms
Time taken: 14225 ms
Time taken: 13754 ms
Time taken: 14151 ms
Time taken: 13792 ms
Time taken: 13953 ms
Time taken: 13622 ms
Time taken: 14056 ms
Time taken: 13980 ms


# 2. Pre-sorted

In [13]:
// Simulate pre-sorting of data
// Data is already stored pre-sorted

val sortedFg1 = spark.read.option("header", true).schema(fg1_schema).csv(
    DATA_PATH + "/100000-20-1-out.csv"
).persist()
sortedFg1.count()

val sortedFg2 = spark.read.option("header", true).schema(fg2_schema).csv(
    DATA_PATH + "/100000-20-2-out.csv"
).persist()
sortedFg2.count()

val sortedFg3 = spark.read.option("header", true).schema(fg3_schema).csv(
    DATA_PATH + "/100000-20-2-out.csv"
).persist()
sortedFg3.count()

sortedFg1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res26: Long = 36779
sortedFg2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id_2: int, ts_2: int ... 1 more field]
res28: Long = 36779
sortedFg3: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id_3: int, ts_3: int ... 1 more field]
res30: Long = 36779


In [14]:
// One Feature group

experiment(sortedFg1, sortedFg2)

Running with BORADCAST
Time taken: 4174 ms
Time taken: 3861 ms
Time taken: 3915 ms
Time taken: 3854 ms
Time taken: 3903 ms
Time taken: 3908 ms
Time taken: 3910 ms
Time taken: 3864 ms
Time taken: 3885 ms
Time taken: 3845 ms
Time taken: 3918 ms
Running with MERGE
Time taken: 4291 ms
Time taken: 4187 ms
Time taken: 4198 ms
Time taken: 4179 ms
Time taken: 4328 ms
Time taken: 4394 ms
Time taken: 4076 ms
Time taken: 4235 ms
Time taken: 4123 ms
Time taken: 4300 ms
Time taken: 4251 ms


In [15]:
// Two feature groups

experiment(sortedFg1, sortedFg2, sortedFg3)

Running with BORADCAST
Time taken: 10444 ms
Time taken: 10223 ms
Time taken: 10047 ms
Time taken: 10362 ms
Time taken: 10170 ms
Time taken: 10256 ms
Time taken: 10098 ms
Time taken: 10835 ms
Time taken: 9975 ms
Time taken: 10309 ms
Time taken: 10117 ms
Running with MERGE
Time taken: 8209 ms
Time taken: 8336 ms
Time taken: 8580 ms
Time taken: 8636 ms
Time taken: 8490 ms
Time taken: 8252 ms
Time taken: 8108 ms
Time taken: 8328 ms
Time taken: 8402 ms
Time taken: 8326 ms
Time taken: 8290 ms


# 3. Pre-partitioning

In [16]:
// Parition the data based on id
fg1.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed")
fg2.write.mode("overwrite").bucketBy(4, "id_2").saveAsTable("fg2_bucketed")
fg3.write.mode("overwrite").bucketBy(4, "id_3").saveAsTable("fg3_bucketed")

In [17]:
val bucketedFg1 = spark.table("fg1_bucketed").persist()
val bucketedFg2 = spark.table("fg2_bucketed").persist()
val bucketedFg3 = spark.table("fg3_bucketed").persist()

bucketedFg1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
bucketedFg2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id_2: int, ts_2: int ... 1 more field]
bucketedFg3: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id_3: int, ts_3: int ... 1 more field]


In [18]:
// One Feature group

experiment(bucketedFg1, bucketedFg2)

Running with BORADCAST
Time taken: 9280 ms
Time taken: 651 ms
Time taken: 721 ms
Time taken: 630 ms
Time taken: 681 ms
Time taken: 666 ms
Time taken: 623 ms
Time taken: 634 ms
Time taken: 695 ms
Time taken: 753 ms
Time taken: 667 ms
Running with MERGE
Time taken: 1030 ms
Time taken: 596 ms
Time taken: 529 ms
Time taken: 526 ms
Time taken: 533 ms
Time taken: 561 ms
Time taken: 563 ms
Time taken: 544 ms
Time taken: 594 ms
Time taken: 626 ms
Time taken: 549 ms


In [19]:
// Two feature groups

experiment(bucketedFg1, bucketedFg2, bucketedFg3)

Running with BORADCAST
Time taken: 8131 ms
Time taken: 4114 ms
Time taken: 4154 ms
Time taken: 4131 ms
Time taken: 4090 ms
Time taken: 4117 ms
Time taken: 4428 ms
Time taken: 4269 ms
Time taken: 4274 ms
Time taken: 4157 ms
Time taken: 4149 ms
Running with MERGE
Time taken: 4566 ms
Time taken: 4024 ms
Time taken: 3929 ms
Time taken: 3912 ms
Time taken: 3828 ms
Time taken: 3907 ms
Time taken: 3894 ms
Time taken: 3760 ms
Time taken: 3894 ms
Time taken: 3731 ms
Time taken: 3920 ms


# 4.Pre-partitioning and pre-sorting

In [20]:
// Parition the data based on id
sortedFg1.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed_sorted")
sortedFg2.write.mode("overwrite").bucketBy(4, "id_2").saveAsTable("fg2_bucketed_sorted")
sortedFg3.write.mode("overwrite").bucketBy(4, "id_3").saveAsTable("fg3_bucketed_sorted")

In [21]:
val sortedBucketedFg1 = spark.table("fg1_bucketed_sorted").persist()
val sortedBucketedFg2 = spark.table("fg2_bucketed_sorted").persist()
val sortedBucketedFg3 = spark.table("fg3_bucketed_sorted").persist()

sortedBucketedFg1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
sortedBucketedFg2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id_2: int, ts_2: int ... 1 more field]
sortedBucketedFg3: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id_3: int, ts_3: int ... 1 more field]


In [22]:
// One Feature group

experiment(sortedBucketedFg1, sortedBucketedFg2)

Running with BORADCAST
Time taken: 1314 ms
Time taken: 512 ms
Time taken: 554 ms
Time taken: 541 ms
Time taken: 521 ms
Time taken: 539 ms
Time taken: 545 ms
Time taken: 618 ms
Time taken: 606 ms
Time taken: 504 ms
Time taken: 533 ms
Running with MERGE
Time taken: 492 ms
Time taken: 541 ms
Time taken: 540 ms
Time taken: 548 ms
Time taken: 534 ms
Time taken: 607 ms
Time taken: 517 ms
Time taken: 505 ms
Time taken: 568 ms
Time taken: 528 ms
Time taken: 569 ms


In [23]:
// Two feature groups

experiment(sortedBucketedFg1, sortedBucketedFg2, sortedBucketedFg3)

Running with BORADCAST
Time taken: 4231 ms
Time taken: 3863 ms
Time taken: 3765 ms
Time taken: 3822 ms
Time taken: 3759 ms
Time taken: 3879 ms
Time taken: 3837 ms
Time taken: 3841 ms
Time taken: 3872 ms
Time taken: 3886 ms
Time taken: 3831 ms
Running with MERGE
Time taken: 3930 ms
Time taken: 3647 ms
Time taken: 3841 ms
Time taken: 3727 ms
Time taken: 3859 ms
Time taken: 3931 ms
Time taken: 3844 ms
Time taken: 3825 ms
Time taken: 3976 ms
Time taken: 3764 ms
Time taken: 3851 ms
