In [1]:
# findspark.init is called before the pyspark import below so that the Spark
# installation under /opt/manual/spark is the one Python picks up.
import findspark
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession, functions as F

In [2]:
# Create (or reuse) the notebook's SparkSession: application name "Joins",
# submitted against the "yarn" master.
spark = SparkSession.builder.appName("Joins").master("yarn").getOrCreate()

# Read big table

In [3]:
# Load the big side of the join: a comma-separated CSV with a header row,
# letting Spark infer the column types.
order_items = (
    spark.read
    .options(header=True, inferSchema=True, sep=",")
    .csv("/user/train/datasets/retail_db/order_items.csv")
)

In [4]:
# Count the rows of order_items to gauge the size of the big table.
order_items.count()

172198

In [5]:
# Preview the first 3 rows, converted to pandas for a readable tabular display.
order_items.limit(3).toPandas()

Unnamed: 0,orderItemName,orderItemOrderId,orderItemProductId,orderItemQuantity,orderItemSubTotal,orderItemProductPrice
0,1,1,957,1,299.98,299.98
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0


# Read small table

In [6]:
# Load the small side of the join: a comma-separated CSV with a header row,
# letting Spark infer the column types.
products = (
    spark.read
    .options(header=True, inferSchema=True, sep=",")
    .csv("/user/train/datasets/retail_db/products.csv")
)

In [7]:
# Count the rows of products to confirm it is the small table.
products.count()

1345

In [8]:
# Preview the first 3 rows, converted to pandas for a readable tabular display.
products.limit(3).toPandas()

Unnamed: 0,productId,productCategoryId,productName,productDescription,productPrice,productImage
0,1,2,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,,59.98,http://images.acmesports.sports/Quest+Q64+10+F...
1,2,2,Under Armour Men's Highlight MC Football Clea,,129.99,http://images.acmesports.sports/Under+Armour+M...
2,3,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+M...


# Broadcast join

In [9]:
# Look up the session's current spark.sql.autoBroadcastJoinThreshold setting.
spark.conf.get("spark.sql.autoBroadcastJoinThreshold")

'10485760b'

In [10]:
# 10485760b = 10485760 bytes = 10 MB (10 * 1024 * 1024), the auto-broadcast size threshold

In [11]:
# Inner-join each order item to its product, explicitly hinting Spark to
# broadcast the small products table rather than relying on the size threshold.
bcast_join_df = order_items.join(
    other=F.broadcast(products),
    on=order_items["orderItemProductId"] == products["productId"],
    how="inner",
)

In [12]:
# Preview 3 joined rows: the order item columns followed by the matching product columns.
bcast_join_df.limit(3).toPandas()

Unnamed: 0,orderItemName,orderItemOrderId,orderItemProductId,orderItemQuantity,orderItemSubTotal,orderItemProductPrice,productId,productCategoryId,productName,productDescription,productPrice,productImage
0,1,1,957,1,299.98,299.98,957,43,Diamondback Women's Serene Classic Comfort Bi,,299.98,http://images.acmesports.sports/Diamondback+Wo...
1,2,2,1073,1,199.99,199.99,1073,48,Pelican Sunstream 100 Kayak,,199.99,http://images.acmesports.sports/Pelican+Sunstr...
2,3,2,502,5,250.0,50.0,502,24,Nike Men's Dri-FIT Victory Golf Polo,,50.0,http://images.acmesports.sports/Nike+Men%27s+D...


In [13]:
# Print the physical plan to confirm the join is executed as a BroadcastHashJoin.
bcast_join_df.explain()

== Physical Plan ==
*(2) BroadcastHashJoin [orderItemProductId#18], [productId#61], Inner, BuildRight
:- *(2) Project [orderItemName#16, orderItemOrderId#17, orderItemProductId#18, orderItemQuantity#19, orderItemSubTotal#20, orderItemProductPrice#21]
:  +- *(2) Filter isnotnull(orderItemProductId#18)
:     +- FileScan csv [orderItemName#16,orderItemOrderId#17,orderItemProductId#18,orderItemQuantity#19,orderItemSubTotal#20,orderItemProductPrice#21] Batched: false, DataFilters: [isnotnull(orderItemProductId#18)], Format: CSV, Location: InMemoryFileIndex[hdfs://localhost:9000/user/train/datasets/retail_db/order_items.csv], PartitionFilters: [], PushedFilters: [IsNotNull(orderItemProductId)], ReadSchema: struct<orderItemName:int,orderItemOrderId:int,orderItemProductId:int,orderItemQuantity:int,orderI...
+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#153]
   +- *(1) Project [productId#61, productCategoryId#62, productName#63, productDescrip

In [14]:
# Stop the SparkSession now that the notebook is done.
spark.stop()