## 08-pyspark-join.py

In [0]:
# 08-pyspark-join.py
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain (or reuse) the SparkSession used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('PySparkExamples')
    .getOrCreate()
)

# Employee rows, one tuple per employee, in the column order given by
# empColumns. Note that year_joined and emp_dept_id are supplied as strings,
# while the ids and salary are integers.
emp = [
    (1, "Sunit", 1, "2018", "10", "M", 3000),
    (2, "Rohit", 1, "2010", "20", "M", 4000),
    (3, "Alam", 1, "2010", "10", "M", 1000),
    (4, "Joy", 2, "2005", "10", "F", 2000),
    (5, "Bidhun", 2, "2010", "40", "M", 3000),
    (6, "Jayita", 2, "2010", "50", "F", 4000),
]
empColumns = [
    "emp_id",
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
]

# Build the employee DataFrame, then print its inferred schema and its rows.
empDF = spark.createDataFrame(emp, schema=empColumns)
empDF.printSchema()
empDF.show(truncate=False)

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- emp_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+------+---------------+-----------+-----------+------+------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+------+---------------+-----------+-----------+------+------+
|1     |Sunit |1              |2018       |10         |M     |3000  |
|2     |Rohit |1              |2010       |20         |M     |4000  |
|3     |Alam  |1              |2010       |10         |M     |1000  |
|4     |Joy   |2              |2005       |10         |F     |2000  |
|5     |Bidhun|2              |2010       |40         |M     |3000  |
|6     |Jayita|2              |2010       |50         |F     |4000  |
+------+------+---------------+-----------+-----------+------+------+



In [0]:
# Department lookup rows as (dept_name, dept_id) pairs; dept_id is an integer.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptColumns = ["dept_name", "dept_id"]

# Build the department DataFrame, then print its inferred schema and its rows.
deptDF = spark.createDataFrame(dept, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



**The default join type is `inner`. The `how` argument must be one of: inner, cross, outer, full, fullouter, full_outer, left, leftouter, left_outer, right, rightouter, right_outer, semi, leftsemi, left_semi, anti, leftanti and left_anti.**<br><br>

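* **inner - INNER JOIN in SQL**
* **cross - CROSS JOIN in SQL**
* **outer, full, fullouter, full_outer - FULL OUTER JOIN in SQL**
* **left, leftouter, left_outer - LEFT JOIN in SQL**
* **right, rightouter, right_outer - RIGHT JOIN in SQL**
* **semi, leftsemi, left_semi - LEFT SEMI JOIN in SQL**
* **anti, leftanti, left_anti - LEFT ANTI JOIN in SQL**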

In [0]:
# Inner join: keep only employees whose emp_dept_id matches a dept_id.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="inner")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# Outer join: keep matched rows plus unmatched rows from both sides;
# the columns of the missing side come back as null.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="outer")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|null  |null  |null           |null       |null       |null  |null  |Sales    |30     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
|6     |Jayita|2              |2010       |50         |F     |4000  |null     |null   |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# "cross" join type combined with an explicit join condition.
# NOTE(review): because a condition is supplied, the output recorded below
# contains the same five matched rows as the inner join, not the full
# Cartesian product of 6 employees x 4 departments. If an unconditioned
# cross join was intended, empDF.crossJoin(deptDF) is presumably the call
# to use -- confirm before changing, since the stored output would change.
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "cross") \
     .show(truncate = False)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# Full join ("full"): produces the same rows as the "outer" join type --
# unmatched rows from either side are kept with nulls for the other side.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="full")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|null  |null  |null           |null       |null       |null  |null  |Sales    |30     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
|6     |Jayita|2              |2010       |50         |F     |4000  |null     |null   |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# Full outer join ("fullouter"): another spelling of the full outer join;
# unmatched rows from either side are kept with nulls for the other side.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="fullouter")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|null  |null  |null           |null       |null       |null  |null  |Sales    |30     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
|6     |Jayita|2              |2010       |50         |F     |4000  |null     |null   |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# Left join: keep every employee; department columns are null when the
# employee's emp_dept_id has no matching dept_id.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="left")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
|6     |Jayita|2              |2010       |50         |F     |4000  |null     |null   |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# Left outer join ("leftouter"): another spelling of the left join; every
# employee is kept and unmatched department columns are null.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="leftouter")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
|6     |Jayita|2              |2010       |50         |F     |4000  |null     |null   |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# Right join: keep every department; employee columns are null when no
# employee references that dept_id.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="right")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|null  |null  |null           |null       |null       |null  |null  |Sales    |30     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# Right outer join ("rightouter"): another spelling of the right join; every
# department is kept and unmatched employee columns are null.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="rightouter")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|null  |null  |null           |null       |null       |null  |null  |Sales    |30     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# Left semi join: return only the employee columns, and only for employees
# whose emp_dept_id has a matching dept_id.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="leftsemi")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+------+---------------+-----------+-----------+------+------+
|1     |Sunit |1              |2018       |10         |M     |3000  |
|3     |Alam  |1              |2010       |10         |M     |1000  |
|4     |Joy   |2              |2005       |10         |F     |2000  |
|2     |Rohit |1              |2010       |20         |M     |4000  |
|5     |Bidhun|2              |2010       |40         |M     |3000  |
+------+------+---------------+-----------+-----------+------+------+



In [0]:
# Left anti join: return only the employee columns, and only for employees
# whose emp_dept_id has NO matching dept_id.
(
    empDF
    .join(deptDF, on=empDF["emp_dept_id"] == deptDF["dept_id"], how="leftanti")
    .show(truncate=False)
)

+------+------+---------------+-----------+-----------+------+------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+------+---------------+-----------+-----------+------+------+
|6     |Jayita|2              |2010       |50         |F     |4000  |
+------+------+---------------+-----------+-----------+------+------+



In [0]:
# Self join: pair each employee (alias emp1) with the employee row (alias
# emp2) whose emp_id equals emp1's superior_emp_id, then project the
# employee's id/name next to the superior's id/name.
(
    empDF.alias("emp1")
    .join(
        empDF.alias("emp2"),
        on=col("emp1.superior_emp_id") == col("emp2.emp_id"),
        how="inner",
    )
    .select(
        col("emp1.emp_id"),
        col("emp1.name"),
        col("emp2.emp_id").alias("superior_emp_id"),
        col("emp2.name").alias("superior_emp_name"),
    )
    .show(truncate=False)
)

# Register both DataFrames as temp views so they can be queried by name
# through spark.sql.
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")

+------+------+---------------+-----------------+
|emp_id|name  |superior_emp_id|superior_emp_name|
+------+------+---------------+-----------------+
|1     |Sunit |1              |Sunit            |
|2     |Rohit |1              |Sunit            |
|3     |Alam  |1              |Sunit            |
|4     |Joy   |2              |Rohit            |
|5     |Bidhun|2              |Rohit            |
|6     |Jayita|2              |Rohit            |
+------+------+---------------+-----------------+



In [0]:
# SQL inner join over the EMP and DEPT temp views, written as an implicit
# join (comma-separated tables filtered by a WHERE clause). The views must
# already be registered via createOrReplaceTempView.
#
# show() prints the rows and returns None, so chaining it onto the assignment
# would leave joinDF as None. Keep the DataFrame in joinDF and display it as
# a separate step so the name stays usable afterwards.
joinDF = spark.sql("select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id")
joinDF.show(truncate = False)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
+------+------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
# SQL inner join over the EMP and DEPT temp views, written with explicit
# INNER JOIN ... ON syntax. The views must already be registered via
# createOrReplaceTempView.
#
# show() prints the rows and returns None, so chaining it onto the assignment
# would leave joinDF2 as None. Keep the DataFrame in joinDF2 and display it
# as a separate step so the name stays usable afterwards.
joinDF2 = spark.sql("select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id")
joinDF2.show(truncate = False)

+------+------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name  |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Sunit |1              |2018       |10         |M     |3000  |Finance  |10     |
|3     |Alam  |1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Joy   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rohit |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Bidhun|2              |2010       |40         |M     |3000  |IT       |40     |
+------+------+---------------+-----------+-----------+------+------+---------+-------+

