In [1]:
from pyspark.sql import SparkSession
import pandas as pd

spark = SparkSession.builder.getOrCreate() # session handle used by every cell below (createDataFrame, sql)

## 1. Join Types

In [2]:
# Sample data for the join examples.
# person.graduate_program refers to graduateProgram.id, and the values inside
# person.spark_status refer to sparkStatus.id.
person = (
    spark.createDataFrame([
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
    ])
    .toDF("id", "name", "graduate_program", "spark_status")
)

graduateProgram = (
    spark.createDataFrame([
        (0, "Masters", "School of Information", "UC Berkeley"),
        (2, "Masters", "EECS", "UC Berkeley"),
        (1, "Ph.D.", "EECS", "UC Berkeley"),
    ])
    .toDF("id", "degree", "department", "school")
)

sparkStatus = (
    spark.createDataFrame([
        (500, "Vice President"),
        (250, "PMC Member"),
        (100, "Contributor"),
    ])
    .toDF("id", "status")
)

In [3]:
# Show the person rows as a pandas DataFrame.
person.toPandas()

Unnamed: 0,id,name,graduate_program,spark_status
0,0,Bill Chambers,0,[100]
1,1,Matei Zaharia,1,"[500, 250, 100]"
2,2,Michael Armbrust,1,"[250, 100]"


In [4]:
# Show the graduateProgram rows as a pandas DataFrame.
graduateProgram.toPandas()

Unnamed: 0,id,degree,department,school
0,0,Masters,School of Information,UC Berkeley
1,2,Masters,EECS,UC Berkeley
2,1,Ph.D.,EECS,UC Berkeley


In [5]:
# Show the sparkStatus rows as a pandas DataFrame.
sparkStatus.toPandas()

Unnamed: 0,id,status
0,500,Vice President
1,250,PMC Member
2,100,Contributor


### Inner join

In [6]:
joinExpression = person["graduate_program"] == graduateProgram['id'] # person.graduate_program is the foreign key to graduateProgram.id
wrongJoinExpression = person["name"] == graduateProgram["school"] # deliberately nonsensical: compares two unrelated columns

In [7]:
# No join type is given here; the recorded result is the same as the explicit
# "inner" join in the next cell.
person.join(graduateProgram, on=joinExpression).toPandas()

Unnamed: 0,id,name,graduate_program,spark_status,id.1,degree,department,school
0,0,Bill Chambers,0,[100],0,Masters,School of Information,UC Berkeley
1,1,Matei Zaharia,1,"[500, 250, 100]",1,Ph.D.,EECS,UC Berkeley
2,2,Michael Armbrust,1,"[250, 100]",1,Ph.D.,EECS,UC Berkeley


In [8]:
# The join type can also be spelled out explicitly.
joinType = "inner"
person.join(graduateProgram, on=joinExpression, how=joinType).toPandas()

Unnamed: 0,id,name,graduate_program,spark_status,id.1,degree,department,school
0,0,Bill Chambers,0,[100],0,Masters,School of Information,UC Berkeley
1,1,Matei Zaharia,1,"[500, 250, 100]",1,Ph.D.,EECS,UC Berkeley
2,2,Michael Armbrust,1,"[250, 100]",1,Ph.D.,EECS,UC Berkeley


### Outer join

In [9]:
# Outer join: rows from both sides are kept. Graduate program 2 has no matching
# person, so its person columns come back empty.
joinType = "outer"
person.join(graduateProgram, on=joinExpression, how=joinType).toPandas()

Unnamed: 0,id,name,graduate_program,spark_status,id.1,degree,department,school
0,0.0,Bill Chambers,0.0,[100],0,Masters,School of Information,UC Berkeley
1,1.0,Matei Zaharia,1.0,"[500, 250, 100]",1,Ph.D.,EECS,UC Berkeley
2,2.0,Michael Armbrust,1.0,"[250, 100]",1,Ph.D.,EECS,UC Berkeley
3,,,,,2,Masters,EECS,UC Berkeley


### Left Outer Joins

In [10]:
# Left outer join with graduateProgram on the left: every program is kept, and
# the program without a matching person gets empty person columns.
joinType = "left_outer"
graduateProgram.join(person, on=joinExpression, how=joinType).toPandas()

Unnamed: 0,id,degree,department,school,id.1,name,graduate_program,spark_status
0,0,Masters,School of Information,UC Berkeley,0.0,Bill Chambers,0.0,[100]
1,2,Masters,EECS,UC Berkeley,,,,
2,1,Ph.D.,EECS,UC Berkeley,2.0,Michael Armbrust,1.0,"[250, 100]"
3,1,Ph.D.,EECS,UC Berkeley,1.0,Matei Zaharia,1.0,"[500, 250, 100]"


### Right Outer Joins

In [11]:
# Same rows as the left outer join above: the DataFrames are swapped, so the
# right side (graduateProgram) is now the one that is fully preserved.
joinType = "right_outer"
person.join(graduateProgram, on=joinExpression, how=joinType).toPandas()

Unnamed: 0,id,name,graduate_program,spark_status,id.1,degree,department,school
0,0.0,Bill Chambers,0.0,[100],0,Masters,School of Information,UC Berkeley
1,,,,,2,Masters,EECS,UC Berkeley
2,2.0,Michael Armbrust,1.0,"[250, 100]",1,Ph.D.,EECS,UC Berkeley
3,1.0,Matei Zaharia,1.0,"[500, 250, 100]",1,Ph.D.,EECS,UC Berkeley


### Left Semi Joins
- 오른쪽 데이터 프레임의 값은 포함하지 않는다
- 두 번째 데이터 프레임에 값이 있는지 확인하기 위해 비교

In [12]:
# Left semi join, driven from the graduateProgram side: keep only the programs
# that have a matching person. No person columns appear in the result.
joinType = "left_semi"
graduateProgram.join(person, on=joinExpression, how=joinType).toPandas()

Unnamed: 0,id,degree,department,school
0,0,Masters,School of Information,UC Berkeley
1,1,Ph.D.,EECS,UC Berkeley


In [13]:
# Show graduateProgram again for comparison with the semi-join result.
graduateProgram.toPandas()

Unnamed: 0,id,degree,department,school
0,0,Masters,School of Information,UC Berkeley
1,2,Masters,EECS,UC Berkeley
2,1,Ph.D.,EECS,UC Berkeley


In [14]:
person.toPandas() # graduate_program only takes the values 0 and 1 here

Unnamed: 0,id,name,graduate_program,spark_status
0,0,Bill Chambers,0,[100]
1,1,Matei Zaharia,1,"[500, 250, 100]"
2,2,Michael Armbrust,1,"[250, 100]"


In [15]:
# Append one extra row that reuses id 0, then register the combined data as a
# temp view and display it.
duplicate_row = spark.createDataFrame([
    (0, "Masters", "Duplicated Row", "Duplicated School"),
])
gradProgram2 = graduateProgram.union(duplicate_row)
gradProgram2.createOrReplaceTempView("gradProgram2")
gradProgram2.toPandas()

Unnamed: 0,id,degree,department,school
0,0,Masters,School of Information,UC Berkeley
1,2,Masters,EECS,UC Berkeley
2,1,Ph.D.,EECS,UC Berkeley
3,0,Masters,Duplicated Row,Duplicated School


In [16]:
# person.graduate_program only holds 0 and 1, so program 2 is filtered out.
# A semi join only filters the left side, so the duplicated id-0 row is kept.
# The join type is passed explicitly instead of being read from `joinType`,
# which other cells reassign; the result no longer depends on which cell ran last.
gradProgram2.join(person, joinExpression, "left_semi").toPandas()

Unnamed: 0,id,degree,department,school
0,0,Masters,School of Information,UC Berkeley
1,1,Ph.D.,EECS,UC Berkeley
2,0,Masters,Duplicated Row,Duplicated School


### Left Anti Joins
- semi와 비슷하지만 거꾸로, 오른쪽 데이터 프레임에 값이 없는 경우만 살린다

In [17]:
# Left anti join: keep only the graduate programs with no matching person
# (program 2 in this data).
joinType = "left_anti"
graduateProgram.join(person, on=joinExpression, how=joinType).toPandas()

Unnamed: 0,id,degree,department,school
0,2,Masters,EECS,UC Berkeley


### Natural Joins
- 키가 될 열을 암묵Implicit적으로 추측
- 암묵Implicit적인 건 항상 위험하니 **조심**. 아래 쿼리는 두 테이블에 모두 있는 `id` 열을 기준으로 결합되어(`person.graduate_program`이 아님) 틀린 결과를 낸다
```SELECT * FROM graduateProgram NATURAL JOIN person```

In [18]:
# Register both DataFrames as temp views so they can be queried with SQL.
graduateProgram.createOrReplaceTempView("graduateProgram")
person.createOrReplaceTempView("person")

# The only column name the two tables share is `id`, so the natural join pairs
# person.id with graduateProgram.id rather than using graduate_program.
# The recorded result is therefore misleading (e.g. Michael Armbrust is paired
# with program 2 although his graduate_program is 1).
natural_join_sql = "SELECT * FROM graduateProgram NATURAL JOIN person"
spark.sql(natural_join_sql).toPandas()

Unnamed: 0,id,degree,department,school,name,graduate_program,spark_status
0,0,Masters,School of Information,UC Berkeley,Bill Chambers,0,[100]
1,1,Ph.D.,EECS,UC Berkeley,Matei Zaharia,1,"[500, 250, 100]"
2,2,Masters,EECS,UC Berkeley,Michael Armbrust,1,"[250, 100]"


### Cross (Cartesian) Join
- Key가 따로 없이 결합
- 왼쪽 데이터 프레임의 모든 행에, 오른쪽 데이터 프레임의 모든 행을 결합
- 각 데이터 프레임이 N, M개의 행을 가진다면, 교차 결합시 N*M개의 행이 된다
- 수가 폭발적으로 증가하니 조심해서 사용. 100% 확실할 때만 사용할 것.
- 고급 사용자는 session-level의 configuration에서 `spark.sql.crossJoin.enabled`를 `true`로 설정할 수 있다

In [19]:
# "cross" combined with a join condition: the recorded result contains only the
# three matching rows (the same rows as the inner join), not the N*M product.
joinType = "cross"
graduateProgram.join(person, on=joinExpression, how=joinType).toPandas()

Unnamed: 0,id,degree,department,school,id.1,name,graduate_program,spark_status
0,0,Masters,School of Information,UC Berkeley,0,Bill Chambers,0,[100]
1,1,Ph.D.,EECS,UC Berkeley,1,Matei Zaharia,1,"[500, 250, 100]"
2,1,Ph.D.,EECS,UC Berkeley,2,Michael Armbrust,1,"[250, 100]"


In [20]:
# Cross join without a condition: every person row is paired with every
# graduateProgram row (3 x 3 = 9 rows in the recorded output).
person.crossJoin(graduateProgram).toPandas()

Unnamed: 0,id,name,graduate_program,spark_status,id.1,degree,department,school
0,0,Bill Chambers,0,[100],0,Masters,School of Information,UC Berkeley
1,0,Bill Chambers,0,[100],2,Masters,EECS,UC Berkeley
2,0,Bill Chambers,0,[100],1,Ph.D.,EECS,UC Berkeley
3,1,Matei Zaharia,1,"[500, 250, 100]",0,Masters,School of Information,UC Berkeley
4,1,Matei Zaharia,1,"[500, 250, 100]",2,Masters,EECS,UC Berkeley
5,1,Matei Zaharia,1,"[500, 250, 100]",1,Ph.D.,EECS,UC Berkeley
6,2,Michael Armbrust,1,"[250, 100]",0,Masters,School of Information,UC Berkeley
7,2,Michael Armbrust,1,"[250, 100]",2,Masters,EECS,UC Berkeley
8,2,Michael Armbrust,1,"[250, 100]",1,Ph.D.,EECS,UC Berkeley


## 2. Challenges When Using Joins

In [21]:
from pyspark.sql.functions import expr

# Join on a complex type: a row matches when person's spark_status array
# contains sparkStatus's id. person.id is renamed to personId first so that the
# bare `id` in the expression refers to sparkStatus.id only.
(
    person
    .withColumnRenamed("id", "personId")
    .join(sparkStatus, on=expr("array_contains(spark_status, id)"))
    .toPandas()
)

Unnamed: 0,personId,name,graduate_program,spark_status,id,status
0,0,Bill Chambers,0,[100],100,Contributor
1,1,Matei Zaharia,1,"[500, 250, 100]",500,Vice President
2,1,Matei Zaharia,1,"[500, 250, 100]",250,PMC Member
3,1,Matei Zaharia,1,"[500, 250, 100]",100,Contributor
4,2,Michael Armbrust,1,"[250, 100]",250,PMC Member
5,2,Michael Armbrust,1,"[250, 100]",100,Contributor


In [22]:
# Show sparkStatus for comparison with the array_contains join result.
sparkStatus.toPandas()

Unnamed: 0,id,status
0,500,Vice President
1,250,PMC Member
2,100,Contributor


In [23]:
# Show person for comparison with the array_contains join result.
person.toPandas()

Unnamed: 0,id,name,graduate_program,spark_status
0,0,Bill Chambers,0,[100]
1,1,Matei Zaharia,1,"[500, 250, 100]"
2,2,Michael Armbrust,1,"[250, 100]"


### Handling Duplicate Column Names

In [24]:
# Rename graduateProgram.id to "graduate_program" so that both sides of the
# upcoming join carry a column with the same name.
gradProgramDupe = graduateProgram.withColumnRenamed("id", "graduate_program")
gradProgramDupe.toPandas()

Unnamed: 0,graduate_program,degree,department,school
0,0,Masters,School of Information,UC Berkeley
1,2,Masters,EECS,UC Berkeley
2,1,Ph.D.,EECS,UC Berkeley


In [25]:
# Show person again; it also has a column named graduate_program.
person.toPandas()

Unnamed: 0,id,name,graduate_program,spark_status
0,0,Bill Chambers,0,[100]
1,1,Matei Zaharia,1,"[500, 250, 100]"
2,2,Michael Armbrust,1,"[250, 100]"


In [26]:
# Join condition between the two identically named graduate_program columns.
joinExpr = gradProgramDupe["graduate_program"] == person["graduate_program"]

In [27]:
# Joining on a boolean expression keeps both graduate_program columns, so the
# result has two columns with the same name.
person.join(gradProgramDupe, on=joinExpr).toPandas()

Unnamed: 0,id,name,graduate_program,spark_status,graduate_program.1,degree,department,school
0,0,Bill Chambers,0,[100],0,Masters,School of Information,UC Berkeley
1,1,Matei Zaharia,1,"[500, 250, 100]",1,Ph.D.,EECS,UC Berkeley
2,2,Michael Armbrust,1,"[250, 100]",1,Ph.D.,EECS,UC Berkeley


In [30]:
# Selecting "graduate_program" after this join is expected to fail: both inputs
# contribute a column with that name, so the reference is ambiguous.
# The error is caught and printed so that the demonstration does not abort a
# "Restart & Run All" of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    person.join(gradProgramDupe, joinExpr).select("graduate_program").show()
except AnalysisException as err:
    print(f"AnalysisException: {err}")

AnalysisException: Reference 'graduate_program' is ambiguous, could be: graduate_program, graduate_program.

In [28]:
# Solution 1: use a different join expression.
# Joining on the column name (a string) instead of a boolean expression lets
# the later select("graduate_program") succeed.
(
    person
    .join(gradProgramDupe, on="graduate_program")
    .select("graduate_program")
    .show()
)

+----------------+
|graduate_program|
+----------------+
|               0|
|               1|
|               1|
+----------------+



In [31]:
# Solution 2: drop one of the duplicated columns after the join.
# NOTE(review): the original note here said this did not work (possibly a Spark
# version issue), but the recorded output shows it succeeding - confirm on the
# Spark version in use.
(
    person
    .join(gradProgramDupe, on=joinExpr)
    .drop(person["graduate_program"])
    .select("graduate_program")
    .toPandas()
)

Unnamed: 0,graduate_program
0,0
1,1
2,1


In [37]:
# Solution 3: rename the clashing column before the join, so the joined result
# never contains two columns with the same name.
gradProgram3 = graduateProgram.withColumnRenamed("id", "grad_id")
joinExpr = person["graduate_program"] == gradProgram3["grad_id"]
person.join(gradProgram3, on=joinExpr).toPandas()

Unnamed: 0,id,name,graduate_program,spark_status,grad_id,degree,department,school
0,0,Bill Chambers,0,[100],0,Masters,School of Information,UC Berkeley
1,1,Matei Zaharia,1,"[500, 250, 100]",1,Ph.D.,EECS,UC Berkeley
2,2,Michael Armbrust,1,"[250, 100]",1,Ph.D.,EECS,UC Berkeley


## 3. How Spark Performs Joins

### Communication Strategies

In [38]:
# Print the physical plan of a plain join. No broadcast happens automatically
# here: the recorded plan is a SortMergeJoin.
joinExpr = person["graduate_program"] == graduateProgram["id"]
person.join(graduateProgram, on=joinExpr).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [graduate_program#10L], [id#24L], Inner
   :- Sort [graduate_program#10L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(graduate_program#10L, 200), ENSURE_REQUIREMENTS, [plan_id=1876]
   :     +- Project [_1#0L AS id#8L, _2#1 AS name#9, _3#2L AS graduate_program#10L, _4#3 AS spark_status#11]
   :        +- Filter isnotnull(_3#2L)
   :           +- Scan ExistingRDD[_1#0L,_2#1,_3#2L,_4#3]
   +- Sort [id#24L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(id#24L, 200), ENSURE_REQUIREMENTS, [plan_id=1877]
         +- Project [_1#16L AS id#24L, _2#17 AS degree#25, _3#18 AS department#26, _4#19 AS school#27]
            +- Filter isnotnull(_1#16L)
               +- Scan ExistingRDD[_1#16L,_2#17,_3#18,_4#19]




In [41]:
from pyspark.sql.functions import broadcast

# Wrap the right-hand DataFrame in broadcast() and print the physical plan.
# The recorded plan switches to a BroadcastHashJoin.
joinExpr = person["graduate_program"] == graduateProgram["id"]
person.join(broadcast(graduateProgram), on=joinExpr).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [graduate_program#10L], [id#24L], Inner, BuildRight, false
   :- Project [_1#0L AS id#8L, _2#1 AS name#9, _3#2L AS graduate_program#10L, _4#3 AS spark_status#11]
   :  +- Filter isnotnull(_3#2L)
   :     +- Scan ExistingRDD[_1#0L,_2#1,_3#2L,_4#3]
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true]),false), [plan_id=1909]
      +- Project [_1#16L AS id#24L, _2#17 AS degree#25, _3#18 AS department#26, _4#19 AS school#27]
         +- Filter isnotnull(_1#16L)
            +- Scan ExistingRDD[_1#16L,_2#17,_3#18,_4#19]


