# Joins

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml.stat as ml_stat
import pyspark.sql.functions as func
import pyspark.sql.types as types
from pyspark.sql import Window
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
spark = SparkSession.builder.appName('Spark Test App').getOrCreate() 
sc = spark.sparkContext

In [14]:
# Test Data Frames
df1 = spark.createDataFrame([
    ('1', 'Joe', '70000', '1'),
    ('2', 'Henry', '80000', '2'),
    ('3', 'Tom',   '60000', '2'),
    ('3', 'Judy',   '60000', '3'),
    ('3', 'Cathy',   '60000', '5')
    ],
    ['Id', 'Name', 'Salary','DepartmentId']
)
df2 = spark.createDataFrame([
    ('1', 'Headquarter'),
    ('2', 'Finance'),
    ('3', 'Sales'),
    ('4', 'HR')],
    ['DepartmentId', 'DepartmentName']
)

### Join Types

In [22]:
# Inner Join
df1.join(df2, on=df1['DepartmentId']==df2['DepartmentId'], how='inner') \
    .drop(df2['DepartmentId']) \
    .show()

+---+-----+------+------------+--------------+
| Id| Name|Salary|DepartmentId|DepartmentName|
+---+-----+------+------------+--------------+
|  3| Judy| 60000|           3|         Sales|
|  1|  Joe| 70000|           1|   Headquarter|
|  2|Henry| 80000|           2|       Finance|
|  3|  Tom| 60000|           2|       Finance|
+---+-----+------+------------+--------------+



In [23]:
# Left Join
df1.join(df2, on=df1['DepartmentId']==df2['DepartmentId'], how='left') \
    .drop(df2['DepartmentId']) \
    .show()

+---+-----+------+------------+--------------+
| Id| Name|Salary|DepartmentId|DepartmentName|
+---+-----+------+------------+--------------+
|  3| Judy| 60000|           3|         Sales|
|  3|Cathy| 60000|           5|          null|
|  1|  Joe| 70000|           1|   Headquarter|
|  2|Henry| 80000|           2|       Finance|
|  3|  Tom| 60000|           2|       Finance|
+---+-----+------+------------+--------------+



In [24]:
# Right Join
df1.join(df2, on=df1['DepartmentId']==df2['DepartmentId'], how='right') \
    .drop(df2['DepartmentId']) \
    .show()

+----+-----+------+------------+--------------+
|  Id| Name|Salary|DepartmentId|DepartmentName|
+----+-----+------+------------+--------------+
|   3| Judy| 60000|           3|         Sales|
|   1|  Joe| 70000|           1|   Headquarter|
|null| null|  null|        null|            HR|
|   2|Henry| 80000|           2|       Finance|
|   3|  Tom| 60000|           2|       Finance|
+----+-----+------+------------+--------------+



In [27]:
# Full outer join
df1.join(df2, on=df1['DepartmentId']==df2['DepartmentId'], how='full') \
    .drop(df2['DepartmentId']) \
    .show()

+----+-----+------+------------+--------------+
|  Id| Name|Salary|DepartmentId|DepartmentName|
+----+-----+------+------------+--------------+
|   3| Judy| 60000|           3|         Sales|
|   3|Cathy| 60000|           5|          null|
|   1|  Joe| 70000|           1|   Headquarter|
|null| null|  null|        null|            HR|
|   2|Henry| 80000|           2|       Finance|
|   3|  Tom| 60000|           2|       Finance|
+----+-----+------+------------+--------------+



In [28]:
# Left semi join
df1.join(df2, on=df1['DepartmentId']==df2['DepartmentId'], how='left_semi') \
    .drop(df2['DepartmentId']) \
    .show()

+---+-----+------+------------+
| Id| Name|Salary|DepartmentId|
+---+-----+------+------------+
|  3| Judy| 60000|           3|
|  1|  Joe| 70000|           1|
|  2|Henry| 80000|           2|
|  3|  Tom| 60000|           2|
+---+-----+------+------------+



In [30]:
# Right anti join
df1.join(df2, on=df1['DepartmentId']==df2['DepartmentId'], how='left_anti') \
    .drop(df2['DepartmentId']) \
    .show()

+---+-----+------+------------+
| Id| Name|Salary|DepartmentId|
+---+-----+------+------------+
|  3|Cathy| 60000|           5|
+---+-----+------+------------+



In [36]:
# Cross join
df2.join(df2, on=None, how='cross') \
    .show()

IllegalArgumentException: 'requirement failed: Unsupported using join type Cross'