# Optimizations for PIT-joins

This notebook will consist of several optimizations for the existing join method. Stuff that will be looked into is the unoptimized PIT-join as well as optimizations.

# 0. Data preparations

In [1]:
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import io.hops.util.Hops


Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
37,application_1642582607798_0036,spark,idle,Link,Link


SparkSession available as 'spark'.
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import io.hops.util.Hops


In [2]:
import io.github.ackuq.pit.{EarlyStopSortMerge, UnionAsOf}

import io.github.ackuq.pit.{EarlyStopSortMerge, UnionAsOf}


In [3]:
val fg1_schema = StructType(Array(
  StructField("id", IntegerType, true),
  StructField("ts", IntegerType, true),
  StructField("label", StringType, true)    
))

val fg2_schema = StructType(Array(
  StructField("id", IntegerType, true),
  StructField("ts", IntegerType, true),
  StructField("f2", StringType, true)    
))

val fg3_schema = StructType(Array(
  StructField("id", IntegerType, true),
  StructField("ts", IntegerType, true),
  StructField("f3", StringType, true)    
))

fg1_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(ts,IntegerType,true), StructField(label,StringType,true))
fg2_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(ts,IntegerType,true), StructField(f2,StringType,true))
fg3_schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(ts,IntegerType,true), StructField(f3,StringType,true))


In [4]:
val spark: SparkSession = SparkSession.builder().master("local").appName("PIT Optimizations Scala").config("spark.sql.adaptive.enabled", true).enableHiveSupport().getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@55aea356


In [5]:
spark.sql("use " + Hops.getProjectName)

res3: org.apache.spark.sql.DataFrame = []


In [6]:
EarlyStopSortMerge.init(spark)

In [7]:
val data1 = Seq(
    Row(1, 4, "1z"),
    Row(1, 5, "1x"),
    Row(2, 6, "2x"),
    Row(1, 7, "1y"),
    Row(2, 8, "2y")
)

val data2 = Seq(
    Row(1, 4, "1z"),
    Row(1, 5, "1x"),
    Row(2, 6, "2x"),
    Row(1, 7, "1y"),
    Row(2, 8, "2y")
)

val data3 = Seq(
    Row(1, 1, "f3-1-1"),
    Row(2, 2, "f3-2-2"),
    Row(1, 6, "f3-1-6"),
    Row(2, 8, "f3-2-8"),
    Row(1, 10, "f3-1-10")
)

data1: Seq[org.apache.spark.sql.Row] = List([1,4,1z], [1,5,1x], [2,6,2x], [1,7,1y], [2,8,2y])
data2: Seq[org.apache.spark.sql.Row] = List([1,4,1z], [1,5,1x], [2,6,2x], [1,7,1y], [2,8,2y])
data3: Seq[org.apache.spark.sql.Row] = List([1,1,f3-1-1], [2,2,f3-2-2], [1,6,f3-1-6], [2,8,f3-2-8], [1,10,f3-1-10])


In [8]:
/*
val fg1 = spark.createDataFrame(spark.sparkContext.parallelize(data1), schema=fg1_schema) 
val fg2 = spark.createDataFrame(spark.sparkContext.parallelize(data2), schema=fg2_schema) 
val fg3 = spark.createDataFrame(spark.sparkContext.parallelize(data3), schema=fg3_schema) 
*/

In [9]:
val DATA_PATH = "hdfs:///Projects/" + Hops.getProjectName + "/Jupyter/PIT-joins/example-data"


val fg1 = spark.read.option("header", true).schema(fg1_schema).csv(
    DATA_PATH + "/100000-20-1-out.csv"
).sort(desc("ts")).persist()
fg1.count()

val fg2 = spark.read.option("header", true).schema(fg2_schema).csv(
    DATA_PATH + "/100000-20-2-out.csv"
).sort(desc("ts")).persist()
fg2.count()

val fg3 = spark.read.option("header", true).schema(fg3_schema).csv(
    DATA_PATH + "/100000-20-2-out.csv"
).sort(desc("ts")).persist()
fg3.count()


DATA_PATH: String = hdfs:///Projects/demo_fs_meb10000/Jupyter/PIT-joins/example-data
fg1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res11: Long = 36779
fg2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res13: Long = 36779
fg3: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res15: Long = 36779


res2: org.apache.spark.sql.DataFrame = [id: bigint, ts: bigint ... 1 more field]


res2: org.apache.spark.sql.DataFrame = [id: bigint, ts: bigint ... 1 more field]


res2: org.apache.spark.sql.DataFrame = [id: bigint, ts: bigint ... 1 more field]


res2: org.apache.spark.sql.DataFrame = [id: bigint, ts: bigint ... 1 more field]


In [10]:
fg1.show

+-----+----+-----+
|   id|  ts|label|
+-----+----+-----+
|98162|1400|   f1|
|98163|1400|   f1|
|98164|1400|   f1|
|98165|1400|   f1|
|98166|1400|   f1|
|98167|1400|   f1|
|98168|1400|   f1|
|98169|1400|   f1|
|98170|1400|   f1|
|98171|1400|   f1|
|98172|1400|   f1|
|98173|1400|   f1|
|98174|1400|   f1|
|98175|1400|   f1|
|98176|1400|   f1|
|98177|1400|   f1|
|98178|1400|   f1|
|98179|1400|   f1|
|98180|1400|   f1|
|98181|1400|   f1|
+-----+----+-----+
only showing top 20 rows



In [11]:
fg1.createOrReplaceTempView("left")
fg2.createOrReplaceTempView("right")

# 1. Union PIT Join

## 1.1. Sorted on timestamps (descending)

In [12]:
def unionPitJoin(labelData: DataFrame, fgs: DataFrame*) : DataFrame = {
    var joinedData = labelData
    for ((fg, i) <- fgs.zipWithIndex) {
        val num = i + 2
        joinedData = UnionAsOf.join(
            joinedData,
            fg,
            rightPrefix = s"fg${num}_",
            partitionCols = Seq("id")
        )
    }
    
    joinedData
}

unionPitJoin(fg1, fg2, fg3).show

unionPitJoin: (labelData: org.apache.spark.sql.DataFrame, fgs: org.apache.spark.sql.DataFrame*)org.apache.spark.sql.DataFrame
+-----+----+-----+------+------+------+------+
|   id|  ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+-----+----+-----+------+------+------+------+
|98164|1020|   f1|  1020|    f2|  1020|    f2|
|98164|1040|   f1|  1040|    f2|  1040|    f2|
|98164|1060|   f1|  1060|    f2|  1060|    f2|
|98164|1080|   f1|  1080|    f2|  1080|    f2|
|98164|1100|   f1|  1100|    f2|  1100|    f2|
|98164|1120|   f1|  1120|    f2|  1120|    f2|
|98164|1140|   f1|  1140|    f2|  1140|    f2|
|98164|1160|   f1|  1160|    f2|  1160|    f2|
|98164|1180|   f1|  1180|    f2|  1180|    f2|
|98164|1200|   f1|  1200|    f2|  1200|    f2|
|98164|1220|   f1|  1220|    f2|  1220|    f2|
|98164|1240|   f1|  1240|    f2|  1240|    f2|
|98164|1260|   f1|  1260|    f2|  1260|    f2|
|98164|1280|   f1|  1280|    f2|  1280|    f2|
|98164|1300|   f1|  1300|    f2|  1300|    f2|
|98164|1320|   f1|  1320|   

In [13]:
unionPitJoin(fg1, fg2).explain(extended=true)

== Parsed Logical Plan ==
Project [id#944, ts#2, label#3, fg2_ts#953, fg2_f2#962]
+- Project [id#944, ts#2, label#3, fg2_ts#953, fg2_f2#962, df_combined_ts#935]
   +- Filter isnotnull(ts#2)
      +- Project [id#944, ts#2, label#3, df_index#889, fg2_ts#953, fg2_f2#962, df_combined_ts#935]
         +- Project [id#944, ts#2, label#3, df_index#889, fg2_ts#953, df_combined_ts#935, fg2_f2#926, fg2_f2#962, fg2_f2#962]
            +- Window [last(fg2_f2#926, true) windowspecdefinition(id#944, df_combined_ts#935 ASC NULLS FIRST, df_index#889 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS fg2_f2#962], [id#944], [df_combined_ts#935 ASC NULLS FIRST, df_index#889 ASC NULLS FIRST]
               +- Project [id#944, ts#2, label#3, df_index#889, fg2_ts#953, df_combined_ts#935, fg2_f2#926]
                  +- Project [id#944, ts#2, label#3, df_index#889, fg2_ts#953, fg2_f2#926, df_combined_ts#935]
                     +- Project [id#944, ts#2, label#3, df_ind

In [14]:
unionPitJoin(fg1, fg2).show

+-----+----+-----+------+------+
|   id|  ts|label|fg2_ts|fg2_f2|
+-----+----+-----+------+------+
|98164|1020|   f1|  1020|    f2|
|98164|1040|   f1|  1040|    f2|
|98164|1060|   f1|  1060|    f2|
|98164|1080|   f1|  1080|    f2|
|98164|1100|   f1|  1100|    f2|
|98164|1120|   f1|  1120|    f2|
|98164|1140|   f1|  1140|    f2|
|98164|1160|   f1|  1160|    f2|
|98164|1180|   f1|  1180|    f2|
|98164|1200|   f1|  1200|    f2|
|98164|1220|   f1|  1220|    f2|
|98164|1240|   f1|  1240|    f2|
|98164|1260|   f1|  1260|    f2|
|98164|1280|   f1|  1280|    f2|
|98164|1300|   f1|  1300|    f2|
|98164|1320|   f1|  1320|    f2|
|98164|1340|   f1|  1340|    f2|
|98164|1360|   f1|  1360|    f2|
|98164|1380|   f1|  1380|    f2|
|98164|1400|   f1|  1400|    f2|
+-----+----+-----+------+------+
only showing top 20 rows



In [15]:
val query = unionPitJoin(fg1, fg2)
query.count
query.explain

query: org.apache.spark.sql.DataFrame = [id: int, ts: int ... 3 more fields]
res23: Long = 36779
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [id#1401, ts#2, label#3, fg2_ts#1410, fg2_f2#1419]
   +- Filter isnotnull(ts#2)
      +- Window [last(fg2_f2#1383, true) windowspecdefinition(id#1401, df_combined_ts#1392 ASC NULLS FIRST, df_index#1346 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS fg2_f2#1419], [id#1401], [df_combined_ts#1392 ASC NULLS FIRST, df_index#1346 ASC NULLS FIRST]
         +- Project [id#1401, ts#2, label#3, df_index#1346, fg2_ts#1410, df_combined_ts#1392, fg2_f2#1383]
            +- Window [last(fg2_ts#1382, true) windowspecdefinition(id#1401, df_combined_ts#1392 ASC NULLS FIRST, df_index#1346 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS fg2_ts#1410], [id#1401], [df_combined_ts#1392 ASC NULLS FIRST, df_index#1346 ASC NULLS FIRST]
               +- Sort [id#1401

In [16]:
val NO_RUNS = 10

def experimentUnion(labelData: DataFrame, fgs: DataFrame*) : Unit = {
    for (run <- 1 to NO_RUNS) {
        spark.time(unionPitJoin(labelData, fgs :_*).show(0))
    }
}


NO_RUNS: Int = 10
experimentUnion: (labelData: org.apache.spark.sql.DataFrame, fgs: org.apache.spark.sql.DataFrame*)Unit


In [17]:
unionPitJoin(fg1, fg2).show(0)

+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows



In [18]:
unionPitJoin(fg1, fg2, fg3).show(0)

+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows



In [19]:
// One Feature group

experimentUnion(fg1, fg2)

+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2424 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2318 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2162 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2305 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2137 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 2303 ms
+---+---+-

In [20]:
// Two feature groups

experimentUnion(fg1, fg2, fg3)

+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 14541 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 14609 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 14026 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+------+------+------+
+---+---+-----+------+------+------+------+
only showing top 0 rows

Time taken: 14315 ms
+---+---+-----+------+------+------+------+
| id| ts|label|fg2_ts|fg2_f2|fg3_ts|fg3_f3|
+---+---+-----+------+--

## 1.2. Sorted on id and timestamp

In [21]:
val fg1Sorted = fg1.orderBy(desc("id"), desc("ts")).persist
fg1Sorted.count
val fg2Sorted = fg2.orderBy(desc("id"), desc("ts")).persist
fg2Sorted.count
val fg3Sorted = fg3.orderBy(desc("id"), desc("ts")).persist
fg3Sorted.count

fg1Sorted: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res34: Long = 36779
fg2Sorted: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res35: Long = 36779
fg3Sorted: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res36: Long = 36779


In [22]:
// One feature group

experimentUnion(fg1Sorted, fg2Sorted)

+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12698 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12281 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12103 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12481 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 12048 ms
+---+---+-----+------+------+
| id| ts|label|fg2_ts|fg2_f2|
+---+---+-----+------+------+
+---+---+-----+------+------+
only showing top 0 rows

Time taken: 11798 ms
+---

In [23]:
// Two feature groups

experimentUnion(fg1Sorted, fg2Sorted, fg3Sorted)

KeyboardInterrupt: 

## 1.3 Bucketed sorted on timestamp

In [None]:
// Parition the data based on id
fg1.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed")
fg2.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg2_bucketed")
fg3.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg3_bucketed")

In [None]:
val fg1Bucketed = spark.table("fg1_bucketed").persist()
fg1Bucketed.count
val fg2Bucketed = spark.table("fg2_bucketed").persist()
fg2Bucketed.count
val fg3Bucketed = spark.table("fg3_bucketed").persist()
fg3Bucketed.count

In [None]:
// One Feature group

experimentUnion(fg1Bucketed, fg2Bucketed)

In [None]:
// Two feature groups

experimentUnion(fg1Bucketed, fg2Bucketed, fg3Bucketed)

## 1.4. Bucketed, sorted on id and timestamp

In [None]:
// Parition the data based on id
fg1Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed_sorted")
fg2Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg2_bucketed_sorted")
fg3Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg3_bucketed_sorted")

In [None]:
val fg1SortedBucketed = spark.table("fg1_bucketed_sorted").persist()
fg1SortedBucketed.count
val fg2SortedBucketed = spark.table("fg2_bucketed_sorted").persist()
fg2SortedBucketed.count
val fg3SortedBucketed = spark.table("fg3_bucketed_sorted").persist()
fg3SortedBucketed.count

In [None]:
// One Feature group

experimentUnion(fg1SortedBucketed, fg2SortedBucketed)

In [None]:
// Two feature groups

experimentUnion(fg1SortedBucketed, fg2SortedBucketed, fg3SortedBucketed)

# 2. Early stop sort merge

## 2.1 Sorted on timestamps

In [33]:
def earlyStopSortMerge(labelData: DataFrame, fgs: DataFrame*): DataFrame = {
    var joinedData = labelData
    for (fg <- fgs) {
        joinedData = joinedData.join(
            fg,
            EarlyStopSortMerge.pit(labelData("ts"), fg("ts")) && labelData("id") === fg("id")
        )
    }
    joinedData
}

earlyStopSortMerge(fg1, fg2).explain()

earlyStopSortMerge: (labelData: org.apache.spark.sql.DataFrame, fgs: org.apache.spark.sql.DataFrame*)org.apache.spark.sql.DataFrame
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- PITJoin [ts#2], [ts#109], [id#1], [id#108]
   :- Sort [id#1 DESC NULLS LAST, ts#2 DESC NULLS LAST], false, 0
   :  +- Exchange hashpartitioning(id#1, 200), ENSURE_REQUIREMENTS, [id=#31054]
   :     +- Filter isnotnull(id#1)
   :        +- InMemoryTableScan [id#1, ts#2, label#3], [isnotnull(id#1)]
   :              +- InMemoryRelation [id#1, ts#2, label#3], StorageLevel(disk, memory, deserialized, 1 replicas)
   :                    +- *(1) Sort [ts#2 DESC NULLS LAST], true, 0
   :                       +- Exchange rangepartitioning(ts#2 DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#10]
   :                          +- FileScan csv [id#1,ts#2,label#3] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[hdfs://rpc.namenode.service.consul:8020/Projects/demo_fs_meb10000/Jupyter

In [34]:
earlyStopSortMerge(fg1, fg2).show

+------+----+-----+------+----+---+
|    id|  ts|label|    id|  ts| f2|
+------+----+-----+------+----+---+
|100000|1400|   f1|100000|1400| f2|
|100000|1380|   f1|100000|1380| f2|
|100000|1360|   f1|100000|1360| f2|
|100000|1340|   f1|100000|1340| f2|
|100000|1320|   f1|100000|1320| f2|
|100000|1300|   f1|100000|1300| f2|
|100000|1280|   f1|100000|1280| f2|
|100000|1260|   f1|100000|1260| f2|
|100000|1240|   f1|100000|1240| f2|
|100000|1220|   f1|100000|1220| f2|
|100000|1200|   f1|100000|1200| f2|
|100000|1180|   f1|100000|1180| f2|
|100000|1160|   f1|100000|1160| f2|
|100000|1140|   f1|100000|1140| f2|
|100000|1120|   f1|100000|1120| f2|
|100000|1100|   f1|100000|1100| f2|
|100000|1080|   f1|100000|1080| f2|
|100000|1060|   f1|100000|1060| f2|
|100000|1040|   f1|100000|1040| f2|
|100000|1020|   f1|100000|1020| f2|
+------+----+-----+------+----+---+
only showing top 20 rows



In [35]:
spark.time(earlyStopSortMerge(fg1, fg2).count)

Time taken: 3690 ms
res61: Long = 36779


In [36]:
val NO_RUNS = 10

def experimentEarlyStop(labelData: DataFrame, fgs: DataFrame*) : Unit = {
    for (run <- 0 to NO_RUNS) {
        spark.time(earlyStopSortMerge(labelData, fgs :_*).count)
    }
}

NO_RUNS: Int = 10
experimentEarlyStop: (labelData: org.apache.spark.sql.DataFrame, fgs: org.apache.spark.sql.DataFrame*)Unit


In [None]:
// One feature group
experimentEarlyStop(fg1, fg2)

In [None]:
// Two feature groups

experimentEarlyStop(fg1, fg2, fg3)

## 2.2. Sorted on id and timestamp

In [None]:
val fg1Sorted = fg1.orderBy(desc("id"), desc("ts")).persist
fg1Sorted.count
val fg2Sorted = fg2.orderBy(desc("id"), desc("ts")).persist
fg2Sorted.count
val fg3Sorted = fg3.orderBy(desc("id"), desc("ts")).persist
fg3Sorted.count

In [None]:
// One feature group

experimentEarlyStop(fg1Sorted, fg2Sorted)

In [None]:
// Two feature groups

experimentEarlyStop(fg1Sorted, fg2Sorted, fg3Sorted)

## 2.3. Bucketed, sorted on timestamp

In [None]:
// Parition the data based on id
fg1.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed")
fg2.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg2_bucketed")
fg3.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg3_bucketed")

In [None]:
val fg1Bucketed = spark.table("fg1_bucketed").persist()
fg1Bucketed.count
val fg2Bucketed = spark.table("fg2_bucketed").persist()
fg2Bucketed.count
val fg3Bucketed = spark.table("fg3_bucketed").persist()
fg3Bucketed.count

In [None]:
// One Feature group

experimentEarlyStop(fg1Bucketed, fg2Bucketed)

In [None]:
// Two feature groups

experimentEarlyStop(fg1Bucketed, fg2Bucketed, fg3Bucketed)

## 2.4. Bucketed, sorted on id and timestamp

In [28]:
// Parition the data based on id
fg1Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg1_bucketed_sorted")
fg2Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg2_bucketed_sorted")
fg3Sorted.write.mode("overwrite").bucketBy(4, "id").saveAsTable("fg3_bucketed_sorted")

In [37]:
val fg1SortedBucketed = spark.table("fg1_bucketed_sorted").persist()
fg1SortedBucketed.count
val fg2SortedBucketed = spark.table("fg2_bucketed_sorted").persist()
fg2SortedBucketed.count
val fg3SortedBucketed = spark.table("fg3_bucketed_sorted").persist()
fg3SortedBucketed.count

fg1SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res63: Long = 36779
fg2SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res64: Long = 36779
fg3SortedBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, ts: int ... 1 more field]
res65: Long = 36779


In [38]:
// One Feature group

experimentEarlyStop(fg1SortedBucketed, fg2SortedBucketed)

Time taken: 386 ms
Time taken: 263 ms
Time taken: 254 ms
Time taken: 265 ms
Time taken: 242 ms
Time taken: 236 ms
Time taken: 254 ms
Time taken: 269 ms
Time taken: 237 ms
Time taken: 210 ms
Time taken: 229 ms


In [39]:
// Two feature groups

experimentEarlyStop(fg1SortedBucketed, fg2SortedBucketed, fg3SortedBucketed)

Time taken: 1241 ms
Time taken: 303 ms
Time taken: 370 ms
Time taken: 460 ms
Time taken: 444 ms
Time taken: 330 ms
Time taken: 296 ms
Time taken: 322 ms
Time taken: 312 ms
Time taken: 314 ms
Time taken: 334 ms


## 2.5 Pre-partitioned, sorted on id and timestamp

In [40]:
// Parition the data based on id
fg1Sorted.write.mode("overwrite").partitionBy("id").saveAsTable("fg1_partitioned_sorted")
fg2Sorted.write.mode("overwrite").partitionBy("id").saveAsTable("fg2_partitioned_sorted")
fg3Sorted.write.mode("overwrite").partitionBy("id").saveAsTable("fg3_partitioned_sorted")

In [41]:
val fg1SortedPartitioned = spark.table("fg1_partitioned_sorted").persist()
fg1SortedPartitioned.count
val fg2SortedPartitioned = spark.table("fg2_partitioned_sorted").persist()
fg2SortedPartitioned.count
val fg3SortedPartitioned = spark.table("fg3_partitioned_sorted").persist()
fg3SortedPartitioned.count

fg1SortedPartitioned: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [ts: int, label: string ... 1 more field]
res76: Long = 36779
fg2SortedPartitioned: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [ts: int, f2: string ... 1 more field]
res77: Long = 36779
fg3SortedPartitioned: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [ts: int, f3: string ... 1 more field]
res78: Long = 36779


In [44]:
// Two feature groups

experimentEarlyStop(fg1SortedPartitioned, fg2SortedPartitioned)

Time taken: 3883 ms
Time taken: 3696 ms
Time taken: 3538 ms
Time taken: 3482 ms
Time taken: 3958 ms
Time taken: 3608 ms
Time taken: 3436 ms
Time taken: 3628 ms
Time taken: 3549 ms
Time taken: 3425 ms
Time taken: 3592 ms


In [None]:
// Two feature groups

experimentEarlyStop(fg1SortedPartitioned, fg2SortedPartitioned, fg3SortedPartitioned)

Time taken: 5573 ms
Time taken: 5241 ms
Time taken: 5370 ms
Time taken: 5392 ms
Time taken: 5251 ms
Time taken: 5283 ms
Time taken: 5480 ms
Time taken: 5116 ms
Time taken: 5528 ms
Time taken: 5224 ms
Time taken: 5345 ms
