In [0]:
# Connection details for the Azure Blob Storage container that holds the datasets.
storage_account_name = "joindatasets"
container_name = "datasets"
# Base wasbs:// URL of the container; note that it ends with a trailing "/".
blob_url = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/"

# Fetch the account access key from the Databricks secret scope instead of hard-coding it.
strorage_account_key = dbutils.secrets.get(
    scope="key-vault-secret",
    key="blob-datasets-accesskey",
)

# Register the key in the Spark session config under the account-specific property name.
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
    strorage_account_key,
)


In [0]:
# Disable AQE and automatic broadcast joins so that the join strategy used by
# each experiment below is the one under test, not one picked at runtime.

# Adaptive Query Execution and its sub-features.
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.adaptive.localShuffleReader.enabled", False)

# -1 turns off size-based automatic broadcasting of the smaller join side.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.sql.join.preferSortMergeJoin", True)

# Databricks IO cache off, presumably so repeated reads are not served from cache
# between benchmark runs -- TODO confirm intent.
spark.conf.set("spark.databricks.io.cache.enabled", False)

In [0]:

# Schema and location of the department CSV in Azure Blob Storage.
# Nothing is read in this cell; it only defines the schema string and the path.
_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

# Build the path from blob_url rather than repeating the wasbs:// URL by hand.
# blob_url ends with "/", so strip it before appending the file name.
csv_path = f"{blob_url.rstrip('/')}/department_data.csv"


#### Join Big and Small table - SortMerge vs BroadCast Join

In [0]:
# Read EMP CSV data

# Explicit DDL schema for the employee file, applied instead of schema inference.
_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

# blob_url ends with "/", so strip it before appending the file name to avoid
# a "//" in the resulting wasbs:// path.
emp = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", "true")
    .option("delimiter", ",")
    .load(f"{blob_url.rstrip('/')}/employee_records.csv")
)


In [0]:
# Read DEPT CSV data

# Explicit DDL schema for the department file, applied instead of schema inference.
_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

# blob_url ends with "/", so strip it before appending the file name to avoid
# a "//" in the resulting wasbs:// path.
dept = (
    spark.read.format("csv")
    .schema(_dept_schema)
    .option("header", "true")
    .option("delimiter", ",")
    .load(f"{blob_url.rstrip('/')}/department_data.csv")
)

Write the join result in noop format for performance benchmarking

In [0]:
# Left-outer join of employees to departments on department_id.
# The job description labels the resulting Spark job.
sc.setJobDescription("A: SortMerge Join")
df_joined = emp.join(
    dept,
    on=(emp.department_id == dept.department_id),
    how="left_outer",
)

# Write data in noop format for performance benchmarking: the join is executed
# without persisting any output.
(
    df_joined.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [0]:
# Print the query plan of df_joined to inspect which join strategy was chosen.
df_joined.explain()

In [0]:
# Same employee/department join, this time with an explicit broadcast hint on dept.
from pyspark.sql.functions import broadcast

# The hint is given explicitly because automatic broadcasting is switched off
# via spark.sql.autoBroadcastJoinThreshold = -1 earlier in the notebook.
df_joined = emp.join(
    broadcast(dept),
    on=(emp.department_id == dept.department_id),
    how="left_outer",
)

In [0]:
# Label the Spark job, then execute the broadcast join through the noop format
# so that it runs without persisting any output.
sc.setJobDescription("B: Broadcast join")
(
    df_joined.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [0]:
# Print the query plan of the broadcast-hinted df_joined for comparison.
df_joined.explain()

#### Join Big and Big table - SortMerge without Buckets

In [0]:
# Read Sales data

# Explicit DDL schema for the sales file, applied instead of schema inference.
sales_schema = "transacted_at string, trx_id string, retailer_id string, description string, amount double, city_id string"

# blob_url ends with "/", so strip it before appending the file name to avoid
# a "//" in the resulting wasbs:// path.
sales = (
    spark.read.format("csv")
    .schema(sales_schema)
    .option("header", "true")
    .option("delimiter", ",")
    .load(f"{blob_url.rstrip('/')}/new_sales.csv")
)

In [0]:
# Read City data

# Explicit DDL schema for the cities file, applied instead of schema inference.
city_schema = "city_id string, city string, state string, state_abv string, country string"

# blob_url ends with "/", so strip it before appending the file name to avoid
# a "//" in the resulting wasbs:// path.
city = (
    spark.read.format("csv")
    .schema(city_schema)
    .option("header", "true")
    .option("delimiter", ",")
    .load(f"{blob_url.rstrip('/')}/cities.csv")
)

In [0]:
# Left-outer join of sales to cities on city_id, executed via the noop format
# so the join runs without persisting any output.
sc.setJobDescription("C: SortMerge Big Tables")
df_sales_joined = sales.join(
    city,
    on=(sales.city_id == city.city_id),
    how="left_outer",
)
(
    df_sales_joined.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [0]:
# Print the query plan of the sales/city join built from the unbucketed CSV reads.
df_sales_joined.explain()


##### Write Sales and City data in Buckets

In [0]:
%sql
-- Remove any existing bucketed tables before they are re-created in the cell below.
DROP TABLE IF EXISTS sales_bucket;
DROP TABLE IF EXISTS city_bucket;

In [0]:
# Persist sales and city as tables bucketed on city_id.
# Both use 4 buckets on the same column.
sc.setJobDescription("D: create BucketLargeTables")

(
    sales.write
    .format("csv")
    .mode("overwrite")
    .bucketBy(4, "city_id")
    .option("header", True)
    .option("path", "/data/input/datasets/sales_bucket.csv")
    .saveAsTable("sales_bucket")
)

(
    city.write
    .format("csv")
    .mode("overwrite")
    .bucketBy(4, "city_id")
    .option("header", True)
    .option("path", "/data/input/datasets/city_bucket.csv")
    .saveAsTable("city_bucket")
)

#### Join Sales and City data - SortMerge with Bucket

In [0]:
# Join the bucketed sales and city tables on their shared bucket column.
sc.setJobDescription("Db: join BucketLargeTables")

# Load the two bucketed tables by name.
sales_bucket = spark.read.table("sales_bucket")
city_bucket = spark.read.table("city_bucket")

# Left-outer join on city_id, the column both tables are bucketed by.
df_joined_bucket = sales_bucket.join(
    city_bucket,
    on=(sales_bucket.city_id == city_bucket.city_id),
    how="left_outer",
)

# Execute the join through the noop format so it runs without persisting any output.
(
    df_joined_bucket.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [0]:
# Print the query plan of the bucketed join to compare against the unbucketed one.
df_joined_bucket.explain()

#### Points to note

1. Joining Column different than Bucket Column, Same Bucket Size - Shuffle on Both table
2. Joining Column Same, One table in Bucket - Shuffle on non Bucket table
3. Joining Column Same, Different Bucket Size - Shuffle on Smaller Bucket Side
4. Joining Column Same, Same Bucket Size - No Shuffle (Faster Join)

1. So it's very important to choose the correct bucket column and bucket size.
2. Decide carefully on the number of buckets, as too many buckets with not enough data can lead to the small-file issue.
3. If the datasets are small, you can prefer Shuffle Hash Join.

Shuffle Hash Join
Steps:
	- Both ds will be shuffled among the executors based on the key column
	- The smaller ds will be hashed in each executor so that its keys can be matched against the hashed keys of the bigger ds
	- Once hashed key match, joining works

Points:
	- Supports all equi-joins except full outer join
	- No sort steps, thus keys are not sorted
	- Suitable for joins where one dataset is small enough to fit in memory
	- To make sure spark used shuffle hash joins, we need to set:
		spark.sql.join.preferSortMergeJoin=false
		
Sort merge join
Steps:
	- Both ds will be shuffled among the executors based on the key column
	- The joining keys from both datasets will be sorted in same order
	- Once the joining keys are sorted, the merging happens (hence the name sort-merge)

Points:
	- It supports all join types including full outer join
	- Since there is a sort step, it can be expensive join if not optimized properly
	- Preferred when we have two big datasets to join
	- We can set to use sort merge joins
		spark.sql.join.preferSortMergeJoin=true
