In [22]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col, lit, when, \
desc, asc, cast, like, count, min, max, median, percentile
from pyspark.sql.types import *

spark = SparkSession.Builder().appName('join()')\
.config("spark.sql.crossJoin.enabled",True).getOrCreate()

In [12]:
empData = [
    (1,"Rohit", 3000, 1),
    (2,"Ajay", 2000, 1),
    (3,"Hemma", 2000, 2),
    (4,"Arti", 2000, 3),
    (5,"Kanchan", 2000, 4),
]  

empDataSchema = ['empID', 'Name', 'Salary', 'depID']

depData = [
    (1,"Data"),
    (2,"HR"),
    (3,"Marketing"),
]

depDataSchema = ['ID', 'depName']

empDF = spark.createDataFrame(data = empData, schema=empDataSchema)
depDF = spark.createDataFrame(data = depData, schema=depDataSchema)
print('Employee DataFrame -->')
empDF.show()
print('Department DataFrame -->')
depDF.show()

Employee DataFrame -->
+-----+-------+------+-----+
|empID|   Name|Salary|depID|
+-----+-------+------+-----+
|    1|  Rohit|  3000|    1|
|    2|   Ajay|  2000|    1|
|    3|  Hemma|  2000|    2|
|    4|   Arti|  2000|    3|
|    5|Kanchan|  2000|    4|
+-----+-------+------+-----+

Department DataFrame -->
+---+---------+
| ID|  depName|
+---+---------+
|  1|     Data|
|  2|       HR|
|  3|Marketing|
+---+---------+



In [35]:
print('Result after Inner Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'inner').show()
print('Result after Left Outer Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'left').show()
print('Result after Right Outer Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'right').show()
print('Result after Full Outer Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'full').show()
print('Result after Cross Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'cross').show()
print('Result after Left Semi Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'leftsemi').show()
print('Result after Semi Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'semi').show()
print('Result after Left Anti Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'leftanti').show()
print('Result after Anti Join -->')
empDF.join(depDF, empDF.depID == depDF.ID, 'anti').show()

Result after Inner Join -->


                                                                                

+-----+-----+------+-----+---+---------+
|empID| Name|Salary|depID| ID|  depName|
+-----+-----+------+-----+---+---------+
|    1|Rohit|  3000|    1|  1|     Data|
|    2| Ajay|  2000|    1|  1|     Data|
|    3|Hemma|  2000|    2|  2|       HR|
|    4| Arti|  2000|    3|  3|Marketing|
+-----+-----+------+-----+---+---------+

Result after Left Outer Join -->


                                                                                

+-----+-------+------+-----+----+---------+
|empID|   Name|Salary|depID|  ID|  depName|
+-----+-------+------+-----+----+---------+
|    1|  Rohit|  3000|    1|   1|     Data|
|    2|   Ajay|  2000|    1|   1|     Data|
|    3|  Hemma|  2000|    2|   2|       HR|
|    4|   Arti|  2000|    3|   3|Marketing|
|    5|Kanchan|  2000|    4|NULL|     NULL|
+-----+-------+------+-----+----+---------+

Result after Right Outer Join -->
+-----+-----+------+-----+---+---------+
|empID| Name|Salary|depID| ID|  depName|
+-----+-----+------+-----+---+---------+
|    2| Ajay|  2000|    1|  1|     Data|
|    1|Rohit|  3000|    1|  1|     Data|
|    3|Hemma|  2000|    2|  2|       HR|
|    4| Arti|  2000|    3|  3|Marketing|
+-----+-----+------+-----+---+---------+

Result after Full Outer Join -->
+-----+-------+------+-----+----+---------+
|empID|   Name|Salary|depID|  ID|  depName|
+-----+-------+------+-----+----+---------+
|    1|  Rohit|  3000|    1|   1|     Data|
|    2|   Ajay|  2000|    1|   

                                                                                

+-----+-----+------+-----+---+---------+
|empID| Name|Salary|depID| ID|  depName|
+-----+-----+------+-----+---+---------+
|    1|Rohit|  3000|    1|  1|     Data|
|    2| Ajay|  2000|    1|  1|     Data|
|    3|Hemma|  2000|    2|  2|       HR|
|    4| Arti|  2000|    3|  3|Marketing|
+-----+-----+------+-----+---+---------+

Result after Left Semi Join -->


                                                                                

+-----+-----+------+-----+
|empID| Name|Salary|depID|
+-----+-----+------+-----+
|    1|Rohit|  3000|    1|
|    2| Ajay|  2000|    1|
|    3|Hemma|  2000|    2|
|    4| Arti|  2000|    3|
+-----+-----+------+-----+

Result after Semi Join -->
+-----+-----+------+-----+
|empID| Name|Salary|depID|
+-----+-----+------+-----+
|    1|Rohit|  3000|    1|
|    2| Ajay|  2000|    1|
|    3|Hemma|  2000|    2|
|    4| Arti|  2000|    3|
+-----+-----+------+-----+

Result after Left Anti Join -->


                                                                                

+-----+-------+------+-----+
|empID|   Name|Salary|depID|
+-----+-------+------+-----+
|    5|Kanchan|  2000|    4|
+-----+-------+------+-----+

Result after Anti Join -->
+-----+-------+------+-----+
|empID|   Name|Salary|depID|
+-----+-------+------+-----+
|    5|Kanchan|  2000|    4|
+-----+-------+------+-----+



In [25]:
empDF.crossJoin(depDF).show()



+-----+-------+------+-----+---+---------+
|empID|   Name|Salary|depID| ID|  depName|
+-----+-------+------+-----+---+---------+
|    1|  Rohit|  3000|    1|  1|     Data|
|    1|  Rohit|  3000|    1|  2|       HR|
|    1|  Rohit|  3000|    1|  3|Marketing|
|    2|   Ajay|  2000|    1|  1|     Data|
|    2|   Ajay|  2000|    1|  2|       HR|
|    2|   Ajay|  2000|    1|  3|Marketing|
|    3|  Hemma|  2000|    2|  1|     Data|
|    3|  Hemma|  2000|    2|  2|       HR|
|    3|  Hemma|  2000|    2|  3|Marketing|
|    4|   Arti|  2000|    3|  1|     Data|
|    4|   Arti|  2000|    3|  2|       HR|
|    4|   Arti|  2000|    3|  3|Marketing|
|    5|Kanchan|  2000|    4|  1|     Data|
|    5|Kanchan|  2000|    4|  2|       HR|
|    5|Kanchan|  2000|    4|  3|Marketing|
+-----+-------+------+-----+---+---------+



                                                                                

#### Self Join

In [36]:
data = [
    (1,'Rohit', 0),
    (2,'Ajay', 1),
    (3,'Hema', 2),
]

schema = ['empID', 'empName', 'managerID']

df = spark.createDataFrame(data, schema)
df.show()

+-----+-------+---------+
|empID|empName|managerID|
+-----+-------+---------+
|    1|  Rohit|        0|
|    2|   Ajay|        1|
|    3|   Hema|        2|
+-----+-------+---------+



In [55]:
df.alias('empDF').join(df.alias('managerDF'), \
                       col('empDF.managerID') == col('managerDF.empID'),\
                       'left')\
.select(
    col('empDF.empID').alias('empID'),
    col('empDF.empName').alias('empName'),
    col('empDF.managerID').alias('managerID'),
    col('managerDF.empName').alias('managerName'),
       ).show()

+-----+-------+---------+-----------+
|empID|empName|managerID|managerName|
+-----+-------+---------+-----------+
|    1|  Rohit|        0|       NULL|
|    2|   Ajay|        1|      Rohit|
|    3|   Hema|        2|       Ajay|
+-----+-------+---------+-----------+



In [56]:
spark.stop()