# Optimizations for PIT-joins

This notebook explores several optimizations of the existing point-in-time (PIT) join method. It first covers the unoptimized PIT-join as a baseline and then looks at the optimizations.

## 0. Data preparations

In [1]:
# Imports
import datetime
from pyspark.sql import DataFrame, Row, SparkSession, Window, SQLContext
from pyspark.sql.types import StructField, IntegerType, StringType, StructType
from pyspark.sql.functions import unix_timestamp, from_unixtime, col
from pyspark.sql import functions as F
from hops import hdfs as hdfs

Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
11,application_1642582607798_0010,pyspark,idle,Link,Link


SparkSession available as 'spark'.


In [2]:
# Small in-memory sample rows. Each row has three fields in the same order as
# the three-field schemas used by the commented-out createDataFrame calls
# later in the notebook: integer key, integer timestamp, string value.
_label_rows = (
    (1, 5, "1x"),
    (1, 7, "1y"),
    (1, 4, "1z"),
    (2, 6, "2x"),
    (2, 8, "2y"),
)

# data1 and data2 hold equal content but are independent list objects.
data1 = [list(row) for row in _label_rows]
data2 = [list(row) for row in _label_rows]

data3 = [
    [1, 10, "f3-1-10"],
    [1, 1, "f3-1-1"],
    [1, 6, "f3-1-6"],
    [2, 2, "f3-2-2"],
    [2, 8, "f3-2-8"],
]


In [34]:
# Reuse the already running session if there is one (getOrCreate); otherwise
# start a session with master "local[2]" and app name "Three Way PIT".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Three Way PIT")
    # NOTE(review): "spark.some.config.option" looks like a placeholder key - confirm it is needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# SQLContext expects the SparkContext as its first argument, not the
# SparkSession. Pass the context explicitly and hand the session over as the
# second argument so the SQLContext wraps exactly this session.
sql_context = SQLContext(spark.sparkContext, spark)

In [52]:
def _three_field_schema(id_name, ts_name, value_name):
    """Build a schema of two integer fields followed by one string field.

    All three fields are declared with nullable=False.

    Args:
        id_name: name of the first integer field.
        ts_name: name of the second integer field.
        value_name: name of the string field.

    Returns:
        A StructType with the three fields in the order given.
    """
    return StructType([
        StructField(id_name, IntegerType(), False),
        StructField(ts_name, IntegerType(), False),
        StructField(value_name, StringType(), False),
    ])


# The column names carry a per-frame suffix (_2, _3) so they stay distinct
# after the frames are joined.
fg1_schema = _three_field_schema("id", "ts", "label")
fg2_schema = _three_field_schema("id_2", "ts_2", "f2")
fg3_schema = _three_field_schema("id_3", "ts_3", "f3")

In [53]:
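# Alternative input (disabled): build the frames from the small in-memory
# samples data1..data3 instead of reading the CSV files.
# fg1 = spark.createDataFrame(data1, schema=fg1_schema)
# fg2 = spark.createDataFrame(data2, schema=fg2_schema)
# fg3 = spark.createDataFrame(data3, schema=fg3_schema)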

In [65]:
# Directory with the example CSV files, inside the current Hopsworks project.
DATA_PATH = "hdfs:///Projects/" + hdfs.project_name() + "/Jupyter/PIT-joins/example-data"

# Each file is read with a header row and an explicit schema, then persisted.
# The count() after each read is an action, so it forces the read to run.

fg1_file = DATA_PATH + "/100000-20-1-out.csv"
fg1 = spark.read.csv(fg1_file, header=True, schema=fg1_schema).persist()
fg1.count()

fg2_file = DATA_PATH + "/100000-20-2-out.csv"
fg2 = spark.read.csv(fg2_file, header=True, schema=fg2_schema).persist()
fg2.count()

# NOTE(review): fg3 reads the same file as fg2 ("100000-20-2-out.csv"), only
# with the fg3 column names - confirm whether a separate third file was intended.
fg3_file = DATA_PATH + "/100000-20-2-out.csv"
fg3 = spark.read.csv(fg3_file, header=True, schema=fg3_schema).persist()
fg3.count()


36779

## 1. Regular PIT-join

In [55]:
# One feature group

# 1. Join the data: pair every fg1 row with all fg2 rows that share its id and
#    whose timestamp is not later than the fg1 timestamp.
#    No unpersist() is needed on the intermediate frames below: none of them
#    is ever persisted.
join_condition = (fg1.id == fg2.id_2) & (fg1.ts >= fg2.ts_2)
joined_data = fg1.hint("BROADCASTJOIN").join(fg2, join_condition)

# 2. Create window for partitioning and ordering the data: one partition per
#    fg1 row (id, ts), with the most recent fg2 timestamp first
win = Window.partitionBy(["id", "ts"]).orderBy(F.desc("ts_2"))

# 3. Rank the rows of each partition
ranked_data = joined_data.withColumn("rank", F.rank().over(win))

# 4. Keep only the rows with rank == 1, for each partition
filtered_data = ranked_data.filter(F.col("rank") == 1)

# Action that runs the whole pipeline; the row count is the cell's output
filtered_data.count()


36779

In [71]:
# Two feature groups

# 1. Join the data: first fg1 with fg2, then that result with fg3. Both joins
#    match on the id and keep only feature rows whose timestamp is not later
#    than the fg1 timestamp.
#    No unpersist() is needed on the intermediate frames below: none of them
#    is ever persisted.
fg1_with_fg2 = fg1.hint("BROADCAST").join(
    fg2, (fg1.id == fg2.id_2) & (fg1.ts >= fg2.ts_2)
)
# NOTE(review): this second hint is attached to the fg1-fg2 join result (the
# left side of the second join), not to fg3 - confirm that this is intended.
joined_data = fg1_with_fg2.hint("BROADCAST").join(
    fg3, (fg1.id == fg3.id_3) & (fg1.ts >= fg3.ts_3)
)

# 2. Create window for partitioning and ordering the data: one partition per
#    fg1 row (id, ts), ordered by the most recent ts_2 and then the most recent ts_3
win = Window.partitionBy(["id", "ts"]).orderBy(F.desc("ts_2"), F.desc("ts_3"))

# 3. Rank the rows of each partition
ranked_data = joined_data.withColumn("rank", F.rank().over(win))

# 4. Keep only the rows with rank == 1, for each partition
filtered_data = ranked_data.filter(F.col("rank") == 1)

# Action that runs the whole pipeline; the row count is the cell's output
filtered_data.count()

36779

## 2. Pre-sorted

In [74]:
# Simulate feature groups that are already stored in sorted order: sort each
# frame ascending by its id column and persist the sorted result.
sorted_fg1 = fg1.sort(F.col("id").asc()).persist()
sorted_fg2 = fg2.sort(F.col("id_2").asc()).persist()
sorted_fg3 = fg3.sort(F.col("id_3").asc()).persist()

# count() is an action, so each call forces the corresponding sort to run.
# The last count is the value displayed by the cell.
sorted_fg1.count()
sorted_fg2.count()
sorted_fg3.count()

36779

In [75]:
# Print the first 20 rows of the sorted label frame (show()'s defaults, spelled out)
sorted_fg1.show(n=20, truncate=True)

+-----+----+-----+
|   id|  ts|label|
+-----+----+-----+
|98162|1040|   f1|
|98162|1060|   f1|
|98162|1080|   f1|
|98162|1100|   f1|
|98162|1120|   f1|
|98162|1140|   f1|
|98162|1160|   f1|
|98162|1180|   f1|
|98162|1200|   f1|
|98162|1220|   f1|
|98162|1240|   f1|
|98162|1260|   f1|
|98162|1280|   f1|
|98162|1300|   f1|
|98162|1320|   f1|
|98162|1340|   f1|
|98162|1360|   f1|
|98162|1380|   f1|
|98162|1400|   f1|
|98163|1020|   f1|
+-----+----+-----+
only showing top 20 rows

In [76]:
# One feature group, using the pre-sorted frames

# 1. Join the data. This cell uses sorted_fg1 / sorted_fg2 (the frames sorted
#    by id in the pre-sorting cell) rather than the unsorted fg1 / fg2, and
#    asks for a merge join via the "MERGE" hint.
#    No unpersist() is needed on the intermediate frames below: none of them
#    is ever persisted.
join_condition = (sorted_fg1.id == sorted_fg2.id_2) & (sorted_fg1.ts >= sorted_fg2.ts_2)
joined_data = sorted_fg1.hint("MERGE").join(sorted_fg2, join_condition)

# 2. Create window for partitioning and ordering the data: one partition per
#    label row (id, ts), with the most recent ts_2 first
win = Window.partitionBy(["id", "ts"]).orderBy(F.desc("ts_2"))

# 3. Rank the rows of each partition
ranked_data = joined_data.withColumn("rank", F.rank().over(win))

# 4. Keep only the rows with rank == 1, for each partition
filtered_data = ranked_data.filter(F.col("rank") == 1)

# Action that runs the whole pipeline; the row count is the cell's output
filtered_data.count()

An error was encountered:
invalid syntax (<stdin>, line 12)
  File "<stdin>", line 12
    ranked_data = joined_data.withColumn("rank", F.rank().over(win))unpersist()
                                                                    ^
SyntaxError: invalid syntax



Preliminary results: the runtime decreased from 9 s to 6 s.