# Optimizations for PIT-joins

This notebook will consist of several optimizations for the existing join method. Stuff that will be looked into is the unoptimized PIT-join as well as optimizations.

# 0. Data preparations

In [1]:
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import io.hops.util.Hops


Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
35,application_1642582607798_0034,spark,idle,Link,Link


SparkSession available as 'spark'.
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import io.hops.util.Hops


In [2]:
import io.github.ackuq.pit.{EarlyStopSortMerge, UnionAsOf}

import io.github.ackuq.pit.{EarlyStopSortMerge, UnionAsOf}


In [3]:
val fg1_schema = StructType(Array(
  StructField("id", IntegerType, true),
  StructField("ts", IntegerType, true),
  StructField("label", StringType, true)    
))

val fg2_schema = StructType(Array(
  StructField("id", IntegerType, true),
  StructField("ts", IntegerType, true),
  StructField("f2", StringType, true)    
))

val fg3_schema = StructType(Array(
  StructField("id", IntegerType, true),
  StructField("ts", IntegerType, true),
  StructField("f3", StringType, true)    
))

fg1_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(ts,IntegerType,true), StructField(label,StringType,true))
fg2_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(ts,IntegerType,true), StructField(f2,StringType,true))
fg3_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(ts,IntegerType,true), StructField(f3,StringType,true))


In [4]:
val spark: SparkSession = SparkSession.builder().master("local").appName("PIT Optimizations Scala").config("spark.sql.adaptive.enabled", true).enableHiveSupport().getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@87d8deb


In [5]:
spark.sql("use " + Hops.getProjectName)

res3: org.apache.spark.sql.DataFrame = []


In [6]:
EarlyStopSortMerge.init(spark)

In [7]:
val data1 = Seq(
    Row(1, 4, "1z"),
    Row(1, 5, "1x"),
    Row(2, 6, "2x"),
    Row(1, 7, "1y"),
    Row(2, 8, "2y")
)

val data2 = Seq(
    Row(1, 4, "1z"),
    Row(1, 5, "1x"),
    Row(2, 6, "2x"),
    Row(1, 7, "1y"),
    Row(2, 8, "2y")
)

val data3 = Seq(
    Row(1, 1, "f3-1-1"),
    Row(2, 2, "f3-2-2"),
    Row(1, 6, "f3-1-6"),
    Row(2, 8, "f3-2-8"),
    Row(1, 10, "f3-1-10")
)

data1: Seq[org.apache.spark.sql.Row] = List([1,4,1z], [1,5,1x], [2,6,2x], [1,7,1y], [2,8,2y])
data2: Seq[org.apache.spark.sql.Row] = List([1,4,1z], [1,5,1x], [2,6,2x], [1,7,1y], [2,8,2y])
data3: Seq[org.apache.spark.sql.Row] = List([1,1,f3-1-1], [2,2,f3-2-2], [1,6,f3-1-6], [2,8,f3-2-8], [1,10,f3-1-10])


In [8]:
/*
val fg1 = spark.createDataFrame(spark.sparkContext.parallelize(data1), schema=fg1_schema) 
val fg2 = spark.createDataFrame(spark.sparkContext.parallelize(data2), schema=fg2_schema) 
val fg3 = spark.createDataFrame(spark.sparkContext.parallelize(data3), schema=fg3_schema) 
*/

In [9]:
val DATA_PATH = "hdfs:///Projects/" + Hops.getProjectName + "/Jupyter/PIT-joins/example-data"


val fg1 = spark.read.option("header", true).schema(fg1_schema).csv(
    DATA_PATH + "/100000-20-1-out.csv"
).sort(desc("ts")).persist()
fg1.count()

val fg2 = spark.read.option("header", true).schema(fg2_schema).csv(
    DATA_PATH + "/100000-20-2-out.csv"
).sort(desc("ts")).persist()
fg2.count()

val fg3 = spark.read.option("header", true).schema(fg3_schema).csv(
    DATA_PATH + "/100000-20-2-out.csv"
).sort(desc("ts")).persist()
fg3.count()


DATA_PATH: String = hdfs:///Projects/demo_fs_meb10000/Jupyter/PIT-joins/example-data
fg1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res11: Long = 36779
fg2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res13: Long = 36779
fg3: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res15: Long = 36779


In [10]:
fg1.show

+-----+----+-----+
|   id|  ts|label|
+-----+----+-----+
|98162|1400|   f1|
|98163|1400|   f1|
|98164|1400|   f1|
|98165|1400|   f1|
|98166|1400|   f1|
|98167|1400|   f1|
|98168|1400|   f1|
|98169|1400|   f1|
|98170|1400|   f1|
|98171|1400|   f1|
|98172|1400|   f1|
|98173|1400|   f1|
|98174|1400|   f1|
|98175|1400|   f1|
|98176|1400|   f1|
|98177|1400|   f1|
|98178|1400|   f1|
|98179|1400|   f1|
|98180|1400|   f1|
|98181|1400|   f1|
+-----+----+-----+
only showing top 20 rows



In [14]:
val q = fg1.alias("left")\
.select("left.*", "__right__.*").

q.show

q: org.apache.spark.sql.DataFrame = [id: int, ts: int ... 1 more field]
+-----+----+-----+
|   id|  ts|label|
+-----+----+-----+
|98162|1400|   f1|
|98163|1400|   f1|
|98164|1400|   f1|
|98165|1400|   f1|
|98166|1400|   f1|
|98167|1400|   f1|
|98168|1400|   f1|
|98169|1400|   f1|
|98170|1400|   f1|
|98171|1400|   f1|
|98172|1400|   f1|
|98173|1400|   f1|
|98174|1400|   f1|
|98175|1400|   f1|
|98176|1400|   f1|
|98177|1400|   f1|
|98178|1400|   f1|
|98179|1400|   f1|
|98180|1400|   f1|
|98181|1400|   f1|
+-----+----+-----+
only showing top 20 rows



# 1. Union PIT Join

## 1.1. Sorted on timestamps (descending)

In [11]:
def unionPitJoin(labelData: DataFrame, fgs: DataFrame*) : DataFrame = {
    var joinedData = labelData
    for ((fg, i) <- fgs.zipWithIndex) {
        val num = i + 2
        joinedData = UnionAsOf.join(
            joinedData,
            fg,
            rightPrefix = s"fg${num}_",
            partitionCols = Seq("id")
        )
    }
    
    joinedData
}

unionPitJoin(fg1, fg2, fg3).show

unionPitJoin: (labelData: org.apache.spark.sql.DataFrame, fgs: org.apache.spark.sql.DataFrame*)org.apache.spark.sql.DataFrame
+-----+----+-----+------+------+------+------+
|   id|  ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+-----+----+-----+------+------+------+------+
|98164|1020|   f1|  1020|    f2|  1020|    f2|
|98164|1040|   f1|  1040|    f2|  1040|    f2|
|98164|1060|   f1|  1060|    f2|  1060|    f2|
|98164|1080|   f1|  1080|    f2|  1080|    f2|
|98164|1100|   f1|  1100|    f2|  1100|    f2|
|98164|1120|   f1|  1120|    f2|  1120|    f2|
|98164|1140|   f1|  1140|    f2|  1140|    f2|
|98164|1160|   f1|  1160|    f2|  1160|    f2|
|98164|1180|   f1|  1180|    f2|  1180|    f2|
|98164|1200|   f1|  1200|    f2|  1200|    f2|
|98164|1220|   f1|  1220|    f2|  1220|    f2|
|98164|1240|   f1|  1240|    f2|  1240|    f2|
|98164|1260|   f1|  1260|    f2|  1260|    f2|
|98164|1280|   f1|  1280|    f2|  1280|    f2|
|98164|1300|   f1|  1300|    f2|  1300|    f2|
|98164|1320|   f1|  1320|   

In [12]:
unionPitJoin(fg1, fg2).explain(extended=true)

== Parsed Logical Plan ==
Project [id#944, ts#2, label#3, fg2_ts#953, fg2_f2#962]
+- Project [id#944, ts#2, label#3, fg2_ts#953, fg2_f2#962, df_combined_ts#935]
   +- Filter isnotnull(ts#2)
      +- Project [id#944, ts#2, label#3, df_index#889, fg2_ts#953, fg2_f2#962, df_combined_ts#935]
         +- Project [id#944, ts#2, label#3, df_index#889, fg2_ts#953, df_combined_ts#935, fg2_f2#926, fg2_f2#962, fg2_f2#962]
            +- Window [last(fg2_f2#926, true) windowspecdefinition(id#944, df_combined_ts#935 ASC NULLS FIRST, df_index#889 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS fg2_f2#962], [id#944], [df_combined_ts#935 ASC NULLS FIRST, df_index#889 ASC NULLS FIRST]
               +- Project [id#944, ts#2, label#3, df_index#889, fg2_ts#953, df_combined_ts#935, fg2_f2#926]
                  +- Project [id#944, ts#2, label#3, df_index#889, fg2_ts#953, fg2_f2#926, df_combined_ts#935]
                     +- Project [id#944, ts#2, label#3, df_ind

In [31]:
unionPitJoin(fg1, fg2).show

+-----+----+-----+------+------+
|   id|  ts|label|fg2_ts|fg2_f2|
+-----+----+-----+------+------+
|98164|1020|   f1|  1020|    f2|
|98164|1040|   f1|  1040|    f2|
|98164|1060|   f1|  1060|    f2|
|98164|1080|   f1|  1080|    f2|
|98164|1100|   f1|  1100|    f2|
|98164|1120|   f1|  1120|    f2|
|98164|1140|   f1|  1140|    f2|
|98164|1160|   f1|  1160|    f2|
|98164|1180|   f1|  1180|    f2|
|98164|1200|   f1|  1200|    f2|
|98164|1220|   f1|  1220|    f2|
|98164|1240|   f1|  1240|    f2|
|98164|1260|   f1|  1260|    f2|
|98164|1280|   f1|  1280|    f2|
|98164|1300|   f1|  1300|    f2|
|98164|1320|   f1|  1320|    f2|
|98164|1340|   f1|  1340|    f2|
|98164|1360|   f1|  1360|    f2|
|98164|1380|   f1|  1380|    f2|
|98164|1400|   f1|  1400|    f2|
+-----+----+-----+------+------+
only showing top 20 rows



In [95]:
val query = unionPitJoin(fg1, fg2)
query.count
query.explain

query: org.apache.spark.sql.DataFrame = [id: int, ts: int ... 3 more fields]
res187: Long = 36779
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [id#74683, ts#47522, label#47523, fg2_ts#74692, fg2_f2#74701]
   +- Filter isnotnull(ts#47522)
      +- Window [last(fg2_f2#74665, true) windowspecdefinition(id#74683, df_combined_ts#74674 ASC NULLS FIRST, df_index#74628 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS fg2_f2#74701], [id#74683], [df_combined_ts#74674 ASC NULLS FIRST, df_index#74628 ASC NULLS FIRST]
         +- Project [id#74683, ts#47522, label#47523, df_index#74628, fg2_ts#74692, df_combined_ts#74674, fg2_f2#74665]
            +- Window [last(fg2_ts#74664, true) windowspecdefinition(id#74683, df_combined_ts#74674 ASC NULLS FIRST, df_index#74628 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS fg2_ts#74692], [id#74683], [df_combined_ts#74674 ASC NULLS FIRST, df_index#74628 ASC

In [129]:
val NO_RUNS = 10

def experimentUnion(labelData: DataFrame, fgs: DataFrame*) : Unit = {
    for (run <- 1 to NO_RUNS) {
        spark.time(unionPitJoin(labelData, fgs :_*).show(0))
    }
}


NO_RUNS: Int = 10
experimentUnion: (labelData: org.apache.spark.sql.DataFrame, fgs: org.apache.spark.sql.DataFrame*)Unit


In [137]:
unionPitJoin(fg1, fg2).show(0)

+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows



In [138]:
unionPitJoin(fg1, fg2, fg3).show(0)

+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows



In [136]:
// One Feature group

experimentUnion(fg1, fg2)

+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2014 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2194 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1953 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2086 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1998 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1897 ms
+---+---+-

In [131]:
// Two feature groups

experimentUnion(fg1, fg2, fg3)

+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 13460 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 13476 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 12831 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 13218 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+--

## 1.2. Sorted on id and timestamp

In [142]:
val fg1Sorted = fg1.orderBy(desc("id"), desc("ts")).persist
fg1Sorted.count
val fg2Sorted = fg2.orderBy(desc("id"), desc("ts")).persist
fg2Sorted.count
val fg3Sorted = fg3.orderBy(desc("id"), desc("ts")).persist
fg3Sorted.count

fg1Sorted: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res225: Long = 36779
fg2Sorted: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res226: Long = 36779
fg3Sorted: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res227: Long = 36779


In [143]:
// One feature group

experimentUnion(fg1Sorted, fg2Sorted)

+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 13640 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 13250 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12769 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12907 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12619 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12734 ms
+---

In [144]:
// Two feature groups

experimentUnion(fg1Sorted, fg2Sorted, fg3Sorted)

+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 30103 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 30948 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 29802 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 30798 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+--

## 1.3 Pre-partitioned sorted on timestamp

In [132]:
// Parition the data based on id
fg1.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed")
fg2.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg2_bucketed")
fg3.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg3_bucketed")

In [139]:
val fg1Bucketed = spark.table("fg1_bucketed").persist()
fg1Bucketed.count
val fg2Bucketed = spark.table("fg2_bucketed").persist()
fg2Bucketed.count
val fg3Bucketed = spark.table("fg3_bucketed").persist()
fg3Bucketed.count

fg1Bucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res216: Long = 36779
fg2Bucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res217: Long = 36779
fg3Bucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res218: Long = 36779


In [145]:
// One Feature group

experimentUnion(fg1Bucketed, fg2Bucketed)

+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1263 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1356 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1318 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1251 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1285 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 1530 ms
+---+---+-

In [146]:
// Two feature groups

experimentUnion(fg1Bucketed, fg2Bucketed, fg3Bucketed)

+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 11889 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 11878 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 11982 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 12197 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+--

## 2.4. Pre-partitioned, sorted on id and timestamp

In [83]:
// Parition the data based on id
fg1Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed_sorted")
fg2Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg2_bucketed_sorted")
fg3Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg3_bucketed_sorted")

In [84]:
val fg1SortedBucketed = spark.table("fg1_bucketed_sorted").persist()
fg1SortedBucketed.count
val fg2SortedBucketed = spark.table("fg2_bucketed_sorted").persist()
fg2SortedBucketed.count
val fg3SortedBucketed = spark.table("fg3_bucketed_sorted").persist()
fg3SortedBucketed.count

fg1SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res168: Long = 36779
fg2SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res169: Long = 36779
fg3SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res170: Long = 36779


In [147]:
// One Feature group

experimentUnion(fg1SortedBucketed, fg2SortedBucketed)

An error was encountered:
<console>:59: error: not found: value fg1SortedBucketed
       experimentUnion(fg1SortedBucketed, fg2SortedBucketed)
                       ^
<console>:59: error: not found: value fg2SortedBucketed
       experimentUnion(fg1SortedBucketed, fg2SortedBucketed)
                                          ^



In [148]:
// Two feature groups

experimentUnion(fg1SortedBucketed, fg2SortedBucketed, fg3SortedBucketed)

An error was encountered:
<console>:59: error: not found: value fg1SortedBucketed
       experimentUnion(fg1SortedBucketed, fg2SortedBucketed, fg3SortedBucketed)
                       ^
<console>:59: error: not found: value fg2SortedBucketed
       experimentUnion(fg1SortedBucketed, fg2SortedBucketed, fg3SortedBucketed)
                                          ^
<console>:59: error: not found: value fg3SortedBucketed
       experimentUnion(fg1SortedBucketed, fg2SortedBucketed, fg3SortedBucketed)
                                                             ^



# 2. Early stop sort merge

## 2.1 Sorted on timestamps

In [107]:
def earlyStopSortMerge(labelData: DataFrame, fgs: DataFrame*): DataFrame = {
    var joinedData = labelData
    for (fg <- fgs) {
        joinedData = joinedData.join(
            fg,
            EarlyStopSortMerge.pit(labelData("ts"), fg("ts")) && labelData("id") === fg("id")
        )
    }
    joinedData
}

earlyStopSortMerge(fg1, fg2).explain()

earlyStopSortMerge: (labelData: org.apache.spark.sql.DataFrame, fgs: org.apache.spark.sql.DataFrame*)org.apache.spark.sql.DataFrame
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- PITJoin [ts#2], [ts#109], [id#1], [id#108]
   :- Sort [id#1 DESC NULLS LAST, ts#2 DESC NULLS LAST], false, 0
   :  +- Exchange hashpartitioning(id#1, 200), ENSURE_REQUIREMENTS, [id=#22400]
   :     +- Filter isnotnull(id#1)
   :        +- InMemoryTableScan [id#1, ts#2, label#3], [isnotnull(id#1)]
   :              +- InMemoryRelation [id#1, ts#2, label#3], StorageLevel(disk, memory, deserialized, 1 replicas)
   :                    +- *(1) Sort [ts#2 DESC NULLS LAST], true, 0
   :                       +- Exchange rangepartitioning(ts#2 DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#10]
   :                          +- FileScan csv [id#1,ts#2,label#3] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[hdfs://rpc.namenode.service.consul:8020/Projects/demo_fs_meb10000/Jupyter

In [108]:
earlyStopSortMerge(fg1, fg2).show

+------+----+-----+------+----+---+
|    id|  ts|label|    id|  ts| f2|
+------+----+-----+------+----+---+
|100000|1400|   f1|100000|1400| f2|
|100000|1380|   f1|100000|1380| f2|
|100000|1360|   f1|100000|1360| f2|
|100000|1340|   f1|100000|1340| f2|
|100000|1320|   f1|100000|1320| f2|
|100000|1300|   f1|100000|1300| f2|
|100000|1280|   f1|100000|1280| f2|
|100000|1260|   f1|100000|1260| f2|
|100000|1240|   f1|100000|1240| f2|
|100000|1220|   f1|100000|1220| f2|
|100000|1200|   f1|100000|1200| f2|
|100000|1180|   f1|100000|1180| f2|
|100000|1160|   f1|100000|1160| f2|
|100000|1140|   f1|100000|1140| f2|
|100000|1120|   f1|100000|1120| f2|
|100000|1100|   f1|100000|1100| f2|
|100000|1080|   f1|100000|1080| f2|
|100000|1060|   f1|100000|1060| f2|
|100000|1040|   f1|100000|1040| f2|
|100000|1020|   f1|100000|1020| f2|
+------+----+-----+------+----+---+
only showing top 20 rows



In [109]:
spark.time(earlyStopSortMerge(fg1, fg2).count)

Time taken: 3471 ms
res155: Long = 36779


In [112]:
val NO_RUNS = 10

def experimentEarlyStop(labelData: DataFrame, fgs: DataFrame*) : Unit = {
    for (run <- 0 to NO_RUNS) {
        spark.time(earlyStopSortMerge(labelData, fgs :_*).count)
    }
}

NO_RUNS: Int = 10
experimentEarlyStop: (labelData: org.apache.spark.sql.DataFrame, fgs: org.apache.spark.sql.DataFrame*)Unit


In [78]:
// One feature group
experimentEarlyStop(fg1, fg2)

Time taken: 2800 ms
Time taken: 3001 ms
Time taken: 3159 ms
Time taken: 2842 ms
Time taken: 3011 ms
Time taken: 3134 ms
Time taken: 2876 ms
Time taken: 3055 ms
Time taken: 3021 ms
Time taken: 3202 ms
Time taken: 2957 ms


In [79]:
// Two feature groups

experimentEarlyStop(fg1, fg2, fg3)

Time taken: 4279 ms
Time taken: 4687 ms
Time taken: 4280 ms
Time taken: 4377 ms
Time taken: 4710 ms
Time taken: 4444 ms
Time taken: 4444 ms
Time taken: 4740 ms
Time taken: 4359 ms
Time taken: 4361 ms
Time taken: 4691 ms


## 2.2. Sorted on id and timestamp

In [114]:
val fg1Sorted = fg1.orderBy(desc("id"), desc("ts")).persist
fg1Sorted.count
val fg2Sorted = fg2.orderBy(desc("id"), desc("ts")).persist
fg2Sorted.count
val fg3Sorted = fg3.orderBy(desc("id"), desc("ts")).persist
fg3Sorted.count

An error was encountered:
incomplete statement



In [113]:
// One feature group

experimentEarlyStop(fg1Sorted, fg2Sorted)

Time taken: 8659 ms
Time taken: 8757 ms
Time taken: 8641 ms
Time taken: 8462 ms
Time taken: 8406 ms
Time taken: 8406 ms
Time taken: 8489 ms
Time taken: 8444 ms
Time taken: 8475 ms
Time taken: 8302 ms
Time taken: 8752 ms


In [82]:
// Two feature groups

experimentEarlyStop(fg1Sorted, fg2Sorted, fg3Sorted)

Time taken: 13336 ms
Time taken: 14551 ms
Time taken: 14232 ms
Time taken: 13419 ms
Time taken: 12934 ms
Time taken: 13404 ms
Time taken: 13093 ms
Time taken: 13280 ms
Time taken: 13653 ms
Time taken: 13040 ms
Time taken: 13113 ms


## 2.3. Pre-partitioned, sorted on timestamp

In [132]:
// Parition the data based on id
fg1.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed")
fg2.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg2_bucketed")
fg3.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg3_bucketed")

In [139]:
val fg1Bucketed = spark.table("fg1_bucketed").persist()
fg1Bucketed.count
val fg2Bucketed = spark.table("fg2_bucketed").persist()
fg2Bucketed.count
val fg3Bucketed = spark.table("fg3_bucketed").persist()
fg3Bucketed.count

fg1Bucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res216: Long = 36779
fg2Bucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res217: Long = 36779
fg3Bucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res218: Long = 36779


In [140]:
// One Feature group

experimentEarlyStop(fg1Bucketed, fg2Bucketed)

Time taken: 499 ms
Time taken: 270 ms
Time taken: 267 ms
Time taken: 258 ms
Time taken: 240 ms
Time taken: 236 ms
Time taken: 224 ms
Time taken: 213 ms
Time taken: 328 ms
Time taken: 358 ms
Time taken: 251 ms


In [141]:
// Two feature groups

experimentEarlyStop(fg1Bucketed, fg2Bucketed, fg3Bucketed)

Time taken: 552 ms
Time taken: 356 ms
Time taken: 345 ms
Time taken: 345 ms
Time taken: 336 ms
Time taken: 396 ms
Time taken: 392 ms
Time taken: 310 ms
Time taken: 339 ms
Time taken: 310 ms
Time taken: 334 ms


## 2.4. Pre-partitioned, sorted on id and timestamp

In [83]:
// Parition the data based on id
fg1Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed_sorted")
fg2Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg2_bucketed_sorted")
fg3Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg3_bucketed_sorted")

In [84]:
val fg1SortedBucketed = spark.table("fg1_bucketed_sorted").persist()
fg1SortedBucketed.count
val fg2SortedBucketed = spark.table("fg2_bucketed_sorted").persist()
fg2SortedBucketed.count
val fg3SortedBucketed = spark.table("fg3_bucketed_sorted").persist()
fg3SortedBucketed.count

fg1SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res168: Long = 36779
fg2SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res169: Long = 36779
fg3SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res170: Long = 36779


In [85]:
// One Feature group

experimentEarlyStop(fg1SortedBucketed, fg2SortedBucketed)

Time taken: 262 ms
Time taken: 248 ms
Time taken: 238 ms
Time taken: 249 ms
Time taken: 280 ms
Time taken: 283 ms
Time taken: 257 ms
Time taken: 221 ms
Time taken: 249 ms
Time taken: 234 ms
Time taken: 269 ms


In [86]:
// Two feature groups

experimentEarlyStop(fg1SortedBucketed, fg2SortedBucketed, fg3SortedBucketed)

Time taken: 327 ms
Time taken: 281 ms
Time taken: 305 ms
Time taken: 299 ms
Time taken: 277 ms
Time taken: 287 ms
Time taken: 294 ms
Time taken: 312 ms
Time taken: 313 ms
Time taken: 370 ms
Time taken: 285 ms


In [87]:
earlyStopSortMerge(fg1SortedBucketed, fg2SortedBucketed, fg3SortedBucketed).show()

+------+----+-----+------+----+---+------+----+---+
|    id|  ts|label|    id|  ts| f2|    id|  ts| f3|
+------+----+-----+------+----+---+------+----+---+
|100000|1400|   f1|100000|1400| f2|100000|1400| f2|
|100000|1380|   f1|100000|1380| f2|100000|1380| f2|
|100000|1360|   f1|100000|1360| f2|100000|1360| f2|
|100000|1340|   f1|100000|1340| f2|100000|1340| f2|
|100000|1320|   f1|100000|1320| f2|100000|1320| f2|
|100000|1300|   f1|100000|1300| f2|100000|1300| f2|
|100000|1280|   f1|100000|1280| f2|100000|1280| f2|
|100000|1260|   f1|100000|1260| f2|100000|1260| f2|
|100000|1240|   f1|100000|1240| f2|100000|1240| f2|
|100000|1220|   f1|100000|1220| f2|100000|1220| f2|
|100000|1200|   f1|100000|1200| f2|100000|1200| f2|
|100000|1180|   f1|100000|1180| f2|100000|1180| f2|
|100000|1160|   f1|100000|1160| f2|100000|1160| f2|
|100000|1140|   f1|100000|1140| f2|100000|1140| f2|
|100000|1120|   f1|100000|1120| f2|100000|1120| f2|
|100000|1100|   f1|100000|1100| f2|100000|1100| f2|
|100000|1080