In [1]:
from pyspark import SparkContext

In [4]:
import pyspark
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise start a new one
# registered under the application name 'jointables'.
spark = (
    SparkSession.builder
    .appName('jointables')
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [5]:
# Employee fixture rows: one tuple per employee, with fields in the order
# listed in empColumns below. year_joined and emp_dept_id are stored as
# strings. NOTE(review): the -1 values and empty gender strings look like
# "unknown" placeholders -- TODO confirm.
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "", -1),
    (6, "Brown", 2, "2010", "50", "", -1),
]

# Column names, matched positionally to the fields of each tuple in emp.
empColumns = [
    "emp_id",
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
]

# Build the employee DataFrame from the in-memory rows; column types are
# inferred from the Python values, column names come from empColumns.
empDF = spark.createDataFrame(emp, schema=empColumns)

# Print the inferred schema, then the full contents without truncating cells.
empDF.printSchema()
empDF.show(truncate=False)

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- emp_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
|6     |Brown   |2              |2010       |50         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+

In [6]:
# Department fixture rows: (department name, department id, employee name).
# Department 30 ("Sales") has no employee in emp, and no department has id 50,
# so both sides of the later joins contain unmatched rows.
dept = [
    ("Finance", 10, "Bill"),
    ("Marketing", 20, "Joe"),
    ("Sales", 30, "Smith"),
    ("IT", 40, "Brown"),
]

In [7]:
# Column names, matched positionally to the fields of each tuple in dept.
deptColumns = [
    "dept_name",
    "dept_id",
    "emp_name",
]

# Build the department DataFrame; column types are inferred from the values.
deptDF = spark.createDataFrame(dept, schema=deptColumns)

# Print the inferred schema, then the full contents without truncating cells.
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)
 |-- emp_name: string (nullable = true)

+---------+-------+--------+
|dept_name|dept_id|emp_name|
+---------+-------+--------+
|Finance  |10     |Bill    |
|Marketing|20     |Joe     |
|Sales    |30     |Smith   |
|IT       |40     |Brown   |
+---------+-------+--------+



In [8]:
# Inner join: keep only employee/department pairs whose department ids match.
# NOTE(review): emp_dept_id is a string column while dept_id is a long (see
# the printed schemas), so this comparison depends on Spark coercing the
# types; consider an explicit cast if the data ever becomes non-numeric.
empDF.join(
    deptDF,
    on=empDF["emp_dept_id"] == deptDF["dept_id"],
    how="inner",
).show(truncate=False)


+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+



In [9]:
# Full outer join: keep every row from both sides and fill the columns of the
# missing side with nulls.
# "outer", "full" and "fullouter" are alternative spellings of the same join
# type, so each iteration is expected to print the same result.
for join_alias in ("outer", "full", "fullouter"):
    empDF.join(
        deptDF,
        on=empDF["emp_dept_id"] == deptDF["dept_id"],
        how=join_alias,
    ).show(truncate=False)


+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |Smith   |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |
+------+--

In [10]:
# Left outer join: every employee row is kept; the department columns are
# null wherever the join condition finds no match.

# Case 1: the two keys are unrelated columns (emp_id vs. dept_id), so no
# employee matches a department and all department columns come back null.
empDF.join(
    deptDF,
    on=empDF["emp_id"] == deptDF["dept_id"],
    how="left",
).show(truncate=False)

# Case 2: join on the shared department key.
empDF.join(
    deptDF,
    on=empDF["emp_dept_id"] == deptDF["dept_id"],
    how="left",
).show(truncate=False)


+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |
|5     |Brown   |2              |2010       |40         |      |-1    |null     |null   |null    |
|1     |Smith   |-1             |2018       |10         |M     |3000  |null     |null   |null    |
|3     |Williams|1              |2010       |10         |M     |1000  |null     |null   |null    |
|2     |Rose    |1              |2010       |20         |M     |4000  |null     |null   |null    |
|4     |Jones   |2              |2005       |10         |F     |2000  |null     |null   |null    |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+

+------+-

In [11]:
# Left join on a compound condition: a department row is attached only when
# both the department id and the employee name match.
same_dept = empDF["emp_dept_id"] == deptDF["dept_id"]
same_name = empDF["name"] == deptDF["emp_name"]
leftjoin = empDF.join(deptDF, on=same_dept & same_name, how="left")

leftjoin.show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|4     |Jones   |2              |2005       |10         |F     |2000  |null     |null   |null    |
|2     |Rose    |1              |2010       |20         |M     |4000  |null     |null   |null    |
|3     |Williams|1              |2010       |10         |M     |1000  |null     |null   |null    |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |null     |null   |null    |
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+



In [12]:
# Right outer join: every department row is kept; the employee columns are
# null for departments that no employee belongs to.
# "right" and "rightouter" are alternative spellings of the same join type,
# so each iteration is expected to print the same result.
for join_alias in ("right", "rightouter"):
    empDF.join(
        deptDF,
        on=empDF["emp_dept_id"] == deptDF["dept_id"],
        how=join_alias,
    ).show(truncate=False)


+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |Smith   |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+

+------+-

## Using SQL Expression

Since PySpark SQL supports native SQL syntax, we can also express joins in SQL: register each DataFrame as a temporary view, then query those views with `spark.sql()`.

In [13]:
# Register both DataFrames as temporary views so that SQL text can refer to
# them by name; an existing view with the same name is replaced.
for view_name, frame in (("EMP", empDF), ("DEPT", deptDF)):
    frame.createOrReplaceTempView(view_name)

In [14]:
# Two SQL spellings of the same inner join between the EMP and DEPT views.

# Implicit join: comma-separated tables filtered by a WHERE clause.
implicit_join_sql = "select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id"
joinDF = spark.sql(implicit_join_sql)

# Explicit join: INNER JOIN ... ON syntax.
explicit_join_sql = "select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id"
joinDF2 = spark.sql(explicit_join_sql)

In [15]:
# Display the result of the implicit (comma + WHERE) SQL join without
# truncating cell contents; joinDF2 is built above but not displayed here.
joinDF.show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+

