In [0]:
%fs
ls dbfs:/FileStore/tables

path,name,size,modificationTime
dbfs:/FileStore/tables/DepartmentRel.csv,DepartmentRel.csv,62,1690344550000
dbfs:/FileStore/tables/EmpPay.csv,EmpPay.csv,97,1689932930000
dbfs:/FileStore/tables/EmployeeRel.csv,EmployeeRel.csv,118,1690304274000
dbfs:/FileStore/tables/FlightData.csv,FlightData.csv,536,1690345771000
dbfs:/FileStore/tables/SalesData.csv,SalesData.csv,617,1689689602000
dbfs:/FileStore/tables/employees.csv,employees.csv,61,1689692632000


In [0]:
%fs
rm -r dbfs:/FileStore/tables/DepartmentRelwwwwwww.csv

In [0]:
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType,StructField, StringType,IntegerType,DoubleType

# Explicit schema for the employee file: column types are declared up front
# instead of being inferred. Every column is nullable.
empSchema = (
    StructType()
    .add("EmpId", IntegerType(), True)
    .add("Emp", StringType(), True)
    .add("DeptId", IntegerType(), True)
    .add("Salary", DoubleType(), True)
)

# The file is read as pipe-delimited with a header row.
empTable = spark.read.csv(
    path="/FileStore/tables/EmployeeRel.csv",
    sep="|",
    header=True,
    schema=empSchema,
)
display(empTable)

EmpId,Emp,DeptId,Salary
1,Rahul,1.0,10000.0
2,Bob,2.0,12000.0
3,Jack,,20000.0
4,Arun,6.0,18000.0
5,Robin,2.0,40000.0
6,Mark,3.0,3000.0


In [0]:
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType,StructField, StringType,IntegerType,DoubleType

# Explicit schema for the department file; both columns are nullable.
deptSchema = (
    StructType()
    .add("DeptId", IntegerType(), True)
    .add("Dept", StringType(), True)
)

# The file is read as pipe-delimited with a header row.
deptTable = spark.read.csv(
    path="/FileStore/tables/DepartmentRel.csv",
    sep="|",
    header=True,
    schema=deptSchema,
)
display(deptTable)

DeptId,Dept
1.0,Sales
2.0,Finance
3.0,Marketing
,HR
5.0,Reporting


#### INNER JOIN

In [0]:
# Inner join on an explicit equality condition between the two DeptId columns.
# Only employees with a matching department survive, and the result carries
# the DeptId column from both sides.
innerDF = empTable.join(
    deptTable,
    on=empTable["DeptId"] == deptTable["DeptId"],
    how="inner",
)
display(innerDF)

EmpId,Emp,DeptId,Salary,DeptId.1,Dept
1,Rahul,1,10000.0,1,Sales
2,Bob,2,12000.0,2,Finance
5,Robin,2,40000.0,2,Finance
6,Mark,3,3000.0,3,Marketing


In [0]:
# Same inner join, but keyed by column name: the result has a single DeptId
# column instead of one from each side.
innerDF = empTable.join(deptTable, on="DeptId", how="inner")
display(innerDF)

DeptId,EmpId,Emp,Salary,Dept
1,1,Rahul,10000.0,Sales
2,2,Bob,12000.0,Finance
2,5,Robin,40000.0,Finance
3,6,Mark,3000.0,Marketing


#### LEFT JOIN

In [0]:
# Left join via an equality condition: every employee row is kept, and the
# department columns are null where no department matches.
leftDF = empTable.join(
    deptTable,
    on=empTable["DeptId"] == deptTable["DeptId"],
    how="left",
)
display(leftDF)

# "left_outer" spelling, joined by column name so DeptId appears only once.
left_outerDF = empTable.join(deptTable, on="DeptId", how="left_outer")
display(left_outerDF)


EmpId,Emp,DeptId,Salary,DeptId.1,Dept
1,Rahul,1.0,10000.0,1.0,Sales
2,Bob,2.0,12000.0,2.0,Finance
3,Jack,,20000.0,,
4,Arun,6.0,18000.0,,
5,Robin,2.0,40000.0,2.0,Finance
6,Mark,3.0,3000.0,3.0,Marketing


DeptId,EmpId,Emp,Salary,Dept
1.0,1,Rahul,10000.0,Sales
2.0,2,Bob,12000.0,Finance
,3,Jack,20000.0,
6.0,4,Arun,18000.0,
2.0,5,Robin,40000.0,Finance
3.0,6,Mark,3000.0,Marketing


#### RIGHT JOIN

In [0]:
# Right join via an equality condition: every department row is kept, and the
# employee columns are null for departments that have no employees.
rightDF = empTable.join(
    deptTable,
    on=empTable["DeptId"] == deptTable["DeptId"],
    how="right",
)
display(rightDF)

# "right_outer" spelling, joined by column name so DeptId appears only once.
right_outerDF = empTable.join(deptTable, on="DeptId", how="right_outer")
display(right_outerDF)


EmpId,Emp,DeptId,Salary,DeptId.1,Dept
1.0,Rahul,1.0,10000.0,1.0,Sales
5.0,Robin,2.0,40000.0,2.0,Finance
2.0,Bob,2.0,12000.0,2.0,Finance
6.0,Mark,3.0,3000.0,3.0,Marketing
,,,,,HR
,,,,5.0,Reporting


DeptId,EmpId,Emp,Salary,Dept
1.0,1.0,Rahul,10000.0,Sales
2.0,5.0,Robin,40000.0,Finance
2.0,2.0,Bob,12000.0,Finance
3.0,6.0,Mark,3000.0,Marketing
,,,,HR
5.0,,,,Reporting


#### OUTER JOIN

In [0]:
# Full outer join via an equality condition: unmatched rows from either side
# are kept, with nulls filling the columns of the missing side.
outerDF = empTable.join(
    deptTable,
    on=empTable["DeptId"] == deptTable["DeptId"],
    how="outer",
)
display(outerDF)

# "full" spelling of the same join type, keyed by column name.
fullouterDF = empTable.join(deptTable, on="DeptId", how="full")
display(fullouterDF)

# "full_outer" spelling of the same join type, keyed by column name.
full_outerDF = empTable.join(deptTable, on="DeptId", how="full_outer")
display(full_outerDF)

EmpId,Emp,DeptId,Salary,DeptId.1,Dept
3.0,Jack,,20000.0,,
,,,,,HR
1.0,Rahul,1.0,10000.0,1.0,Sales
2.0,Bob,2.0,12000.0,2.0,Finance
5.0,Robin,2.0,40000.0,2.0,Finance
6.0,Mark,3.0,3000.0,3.0,Marketing
,,,,5.0,Reporting
4.0,Arun,6.0,18000.0,,


DeptId,EmpId,Emp,Salary,Dept
,3.0,Jack,20000.0,
,,,,HR
1.0,1.0,Rahul,10000.0,Sales
2.0,2.0,Bob,12000.0,Finance
2.0,5.0,Robin,40000.0,Finance
3.0,6.0,Mark,3000.0,Marketing
5.0,,,,Reporting
6.0,4.0,Arun,18000.0,


DeptId,EmpId,Emp,Salary,Dept
,3.0,Jack,20000.0,
,,,,HR
1.0,1.0,Rahul,10000.0,Sales
2.0,2.0,Bob,12000.0,Finance
2.0,5.0,Robin,40000.0,Finance
3.0,6.0,Mark,3000.0,Marketing
5.0,,,,Reporting
6.0,4.0,Arun,18000.0,


#### LEFT ANTI JOIN

In [0]:
# Left anti join: keeps only the employees that have NO matching department,
# and returns just the employee-side columns.
leftantiDF = empTable.join(
    deptTable,
    on=empTable["DeptId"] == deptTable["DeptId"],
    how="leftanti",
)
display(leftantiDF)

EmpId,Emp,DeptId,Salary
3,Jack,,20000.0
4,Arun,6.0,18000.0


#### LEFT SEMI JOIN OR SEMI JOIN

In [0]:
# Left semi join: keeps only the employees that DO have a matching department,
# and returns just the employee-side columns.
leftsemiDF = empTable.join(
    deptTable,
    on=empTable["DeptId"] == deptTable["DeptId"],
    how="leftsemi",
)
display(leftsemiDF)

# "semi" is the short spelling of the same join type.
semiDF = empTable.join(
    deptTable,
    on=empTable["DeptId"] == deptTable["DeptId"],
    how="semi",
)
display(semiDF)


EmpId,Emp,DeptId,Salary
1,Rahul,1,10000.0
2,Bob,2,12000.0
5,Robin,2,40000.0
6,Mark,3,3000.0


EmpId,Emp,DeptId,Salary
1,Rahul,1,10000.0
2,Bob,2,12000.0
5,Robin,2,40000.0
6,Mark,3,3000.0


#### SQL Expression

In [0]:
# Register both DataFrames as temporary views so they can be queried by name
# from spark.sql(); an existing view with the same name is replaced.
for frame, view_name in ((empTable, "empTempVW"), (deptTable, "deptTempVW")):
    frame.createOrReplaceTempView(view_name)


In [0]:

# DataFrame.show() prints the rows and returns None, so chaining it onto the
# assignment would bind joinDF / joinDF2 to None. Keep the DataFrame in the
# variable and print it as a separate step; the printed output is unchanged.

# Implicit (comma) join, with the match condition in the WHERE clause.
joinDF = spark.sql("select * from empTempVW e, deptTempVW d where e.DeptId == d.DeptId")
joinDF.show(truncate=False)

# The same join written with explicit INNER JOIN ... ON syntax.
joinDF2 = spark.sql("select * from empTempVW e INNER JOIN deptTempVW d ON e.DeptId == d.DeptId")
joinDF2.show(truncate=False)

+-----+-----+------+-------+------+---------+
|EmpId|Emp  |DeptId|Salary |DeptId|Dept     |
+-----+-----+------+-------+------+---------+
|1    |Rahul|1     |10000.0|1     |Sales    |
|2    |Bob  |2     |12000.0|2     |Finance  |
|5    |Robin|2     |40000.0|2     |Finance  |
|6    |Mark |3     |3000.0 |3     |Marketing|
+-----+-----+------+-------+------+---------+

+-----+-----+------+-------+------+---------+
|EmpId|Emp  |DeptId|Salary |DeptId|Dept     |
+-----+-----+------+-------+------+---------+
|1    |Rahul|1     |10000.0|1     |Sales    |
|2    |Bob  |2     |12000.0|2     |Finance  |
|5    |Robin|2     |40000.0|2     |Finance  |
|6    |Mark |3     |3000.0 |3     |Marketing|
+-----+-----+------+-------+------+---------+

