In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Obtain the notebook's Spark session via the builder's getOrCreate(),
# registering the application under the name 'test'.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

In [2]:
# Sample employee rows: (emp_id, name, dept_name, salary).
# John deliberately has no department (None) so the null-key behaviour of
# each join type is visible in the later cells.
employees = [
    (1, "Ravi", "IT", 50000),
    (2, "Priya", "HR", 45000),
    (3, "Ankit", "IT", 55000),
    (4, "Lakshmi", "Finance", 60000),
    (5, "John", None, 40000),
]

# Sample department rows: (dept_name, location).
# "Marketing" has no employees, so it only shows up in right/full outer joins.
departments = [
    ("IT", "Bengaluru"),
    ("HR", "Hyderabad"),
    ("Finance", "Chennai"),
    ("Marketing", "Pune"),
]

# Column types are inferred from the Python values; only names are supplied.
df_emp = spark.createDataFrame(
    employees,
    schema=["emp_id", "name", "dept_name", "salary"],
)
df_dept = spark.createDataFrame(
    departments,
    schema=["dept_name", "location"],
)

# Display both inputs, each preceded by its title.
for title, frame in (
    ("Employees DataFrame:", df_emp),
    ("Departments DataFrame:", df_dept),
):
    print(title)
    frame.show()

Employees DataFrame:
+------+-------+---------+------+
|emp_id|   name|dept_name|salary|
+------+-------+---------+------+
|     1|   Ravi|       IT| 50000|
|     2|  Priya|       HR| 45000|
|     3|  Ankit|       IT| 55000|
|     4|Lakshmi|  Finance| 60000|
|     5|   John|     NULL| 40000|
+------+-------+---------+------+

Departments DataFrame:
+---------+---------+
|dept_name| location|
+---------+---------+
|       IT|Bengaluru|
|       HR|Hyderabad|
|  Finance|  Chennai|
|Marketing|     Pune|
+---------+---------+



### Inner Join
Returns only the rows whose `dept_name` has a match in both DataFrames; unmatched rows from either side are dropped.

In [3]:
# Inner join on the shared "dept_name" key: only employees whose department
# also exists in df_dept are kept. Joining by column name keeps a single
# dept_name column in the result.
inner_join_df = df_emp.join(
    df_dept,
    on="dept_name",
    how="inner",
)
print("Inner Join Result:")
inner_join_df.show()

Inner Join Result:
+---------+------+-------+------+---------+
|dept_name|emp_id|   name|salary| location|
+---------+------+-------+------+---------+
|  Finance|     4|Lakshmi| 60000|  Chennai|
|       HR|     2|  Priya| 45000|Hyderabad|
|       IT|     1|   Ravi| 50000|Bengaluru|
|       IT|     3|  Ankit| 55000|Bengaluru|
+---------+------+-------+------+---------+



### Left Join (Left Outer Join)
Returns all rows from the left DataFrame and the matched rows from the right DataFrame. If there's no match, `null` is used for the right DataFrame's columns.

In [4]:
# Left join: every employee is kept; employees without a matching department
# (here John, whose dept_name is null) get a null location.
left_join_df = df_emp.join(
    df_dept,
    on="dept_name",
    how="left",
)
print("Left Join Result:")
left_join_df.show()

Left Join Result:
+---------+------+-------+------+---------+
|dept_name|emp_id|   name|salary| location|
+---------+------+-------+------+---------+
|       HR|     2|  Priya| 45000|Hyderabad|
|       IT|     1|   Ravi| 50000|Bengaluru|
|     NULL|     5|   John| 40000|     NULL|
|  Finance|     4|Lakshmi| 60000|  Chennai|
|       IT|     3|  Ankit| 55000|Bengaluru|
+---------+------+-------+------+---------+



### Right Join (Right Outer Join)
Returns all rows from the right DataFrame and the matched rows from the left DataFrame. If there's no match, `null` is used for the left DataFrame's columns.

In [5]:
# Right join: every department is kept; departments without employees
# (here Marketing) get nulls in the employee columns.
right_join_df = df_emp.join(
    df_dept,
    on="dept_name",
    how="right",
)
print("Right Join Result:")
right_join_df.show()

Right Join Result:
+---------+------+-------+------+---------+
|dept_name|emp_id|   name|salary| location|
+---------+------+-------+------+---------+
|       HR|     2|  Priya| 45000|Hyderabad|
|       IT|     3|  Ankit| 55000|Bengaluru|
|       IT|     1|   Ravi| 50000|Bengaluru|
|  Finance|     4|Lakshmi| 60000|  Chennai|
|Marketing|  NULL|   NULL|  NULL|     Pune|
+---------+------+-------+------+---------+



### Full Outer Join
Returns all rows from both DataFrames, with `null` values for columns where there is no match.

In [6]:
# Full outer join: keeps unmatched rows from both sides, i.e. both John
# (no department) and Marketing (no employees), padded with nulls.
full_outer_df = df_emp.join(
    df_dept,
    on="dept_name",
    how="outer",
)
print("Full Outer Join Result:")
full_outer_df.show()

Full Outer Join Result:
+---------+------+-------+------+---------+
|dept_name|emp_id|   name|salary| location|
+---------+------+-------+------+---------+
|     NULL|     5|   John| 40000|     NULL|
|  Finance|     4|Lakshmi| 60000|  Chennai|
|       HR|     2|  Priya| 45000|Hyderabad|
|       IT|     1|   Ravi| 50000|Bengaluru|
|       IT|     3|  Ankit| 55000|Bengaluru|
|Marketing|  NULL|   NULL|  NULL|     Pune|
+---------+------+-------+------+---------+



### Left Semi Join
Returns rows from the left DataFrame where there is a match in the right DataFrame, but only includes columns from the left DataFrame.

In [7]:
# Left semi join: acts as a filter on df_emp. Employees with a matching
# department are kept, and no column from df_dept is added to the result.
semi_join_df = df_emp.join(
    df_dept,
    on="dept_name",
    how="left_semi",
)
print("Left Semi Join Result:")
semi_join_df.show()

Left Semi Join Result:
+---------+------+-------+------+
|dept_name|emp_id|   name|salary|
+---------+------+-------+------+
|  Finance|     4|Lakshmi| 60000|
|       HR|     2|  Priya| 45000|
|       IT|     1|   Ravi| 50000|
|       IT|     3|  Ankit| 55000|
+---------+------+-------+------+



### Left Anti Join
Returns rows from the left DataFrame where there is NO match in the right DataFrame. Only includes columns from the left DataFrame.

In [8]:
# Left anti join: the complement of the semi join. Only employees WITHOUT a
# matching department are kept; a null dept_name never matches, so John is
# returned.
anti_join_df = df_emp.join(
    df_dept,
    on="dept_name",
    how="left_anti",
)
print("Left Anti Join Result:")
anti_join_df.show()

Left Anti Join Result:
+---------+------+----+------+
|dept_name|emp_id|name|salary|
+---------+------+----+------+
|     NULL|     5|John| 40000|
+---------+------+----+------+



### Cross Join
Returns the Cartesian product of the two DataFrames, i.e., every row from the left DataFrame is combined with every row from the right DataFrame.

In [9]:
# Cartesian product: no join key is involved, so both dept_name columns are
# carried into the result. Only the first 10 rows are displayed.
cross_join_df = df_emp.crossJoin(
    df_dept
)
print("Cross Join Result:")
cross_join_df.show(n=10)

Cross Join Result:
+------+-----+---------+------+---------+---------+
|emp_id| name|dept_name|salary|dept_name| location|
+------+-----+---------+------+---------+---------+
|     1| Ravi|       IT| 50000|       IT|Bengaluru|
|     1| Ravi|       IT| 50000|       HR|Hyderabad|
|     2|Priya|       HR| 45000|       IT|Bengaluru|
|     2|Priya|       HR| 45000|       HR|Hyderabad|
|     1| Ravi|       IT| 50000|  Finance|  Chennai|
|     1| Ravi|       IT| 50000|Marketing|     Pune|
|     2|Priya|       HR| 45000|  Finance|  Chennai|
|     2|Priya|       HR| 45000|Marketing|     Pune|
|     3|Ankit|       IT| 55000|       IT|Bengaluru|
|     3|Ankit|       IT| 55000|       HR|Hyderabad|
+------+-----+---------+------+---------+---------+
only showing top 10 rows


### Join with Multiple Conditions
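Demonstrates joining on more than one condition: an equality match on `dept_name` combined with a filter on `country`, using a new `df_dept2` DataFrame that adds a `country` column. Because the join is expressed as a column expression rather than a column name, both `dept_name` columns appear in the result.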

In [10]:
# Department rows extended with a country: (dept_name, location, country).
dept_extended = [
    ("IT", "Bengaluru", "India"),
    ("HR", "Hyderabad", "India"),
    ("Finance", "Chennai", "India"),
    ("Marketing", "Pune", "India"),
]

df_dept2 = spark.createDataFrame(
    dept_extended,
    schema=["dept_name", "location", "country"],
)

# Build each predicate separately, then AND them into one join condition.
same_department = df_emp.dept_name == df_dept2.dept_name
located_in_india = df_dept2.country == "India"

# An expression-based join keeps the dept_name column from both inputs.
multi_cond_df = df_emp.join(
    df_dept2,
    on=same_department & located_in_india,
    how="inner",
)
print("Join with Multiple Conditions Result:")
multi_cond_df.show()

Join with Multiple Conditions Result:
+------+-------+---------+------+---------+---------+-------+
|emp_id|   name|dept_name|salary|dept_name| location|country|
+------+-------+---------+------+---------+---------+-------+
|     4|Lakshmi|  Finance| 60000|  Finance|  Chennai|  India|
|     2|  Priya|       HR| 45000|       HR|Hyderabad|  India|
|     1|   Ravi|       IT| 50000|       IT|Bengaluru|  India|
|     3|  Ankit|       IT| 55000|       IT|Bengaluru|  India|
+------+-------+---------+------+---------+---------+-------+



### Joins Using Aliases
Shows how to use aliases for DataFrames to handle column name ambiguities, especially when selecting specific columns after a join.

In [11]:
# Give each side a short alias so columns can be referenced as "e.<col>" and
# "d.<col>" regardless of which DataFrame object they came from.
emp_alias = df_emp.alias("e")
dept_alias = dept_alias = df_dept.alias("d")

# Use alias-qualified column names in the join condition. Attribute access
# (emp_alias.dept_name) resolves to the underlying DataFrame's column and
# ignores the alias, which becomes ambiguous in self-joins; F.col("e.…")
# always refers to the aliased side.
alias_join_df = (
    emp_alias
    .join(
        dept_alias,
        on=F.col("e.dept_name") == F.col("d.dept_name"),
        how="inner",
    )
    # Pick columns through their alias so there is no ambiguity over which
    # input each one comes from.
    .select("e.emp_id", "e.name", "d.location")
)

print("Join Using Aliases Result:")
alias_join_df.show()

Join Using Aliases Result:
+------+-------+---------+
|emp_id|   name| location|
+------+-------+---------+
|     4|Lakshmi|  Chennai|
|     2|  Priya|Hyderabad|
|     1|   Ravi|Bengaluru|
|     3|  Ankit|Bengaluru|
+------+-------+---------+



### Handling Duplicate Columns After Join
Produces a result with a single, unambiguous `dept_name` column and an explicit column order.

In [12]:
# Joining on an expression (rather than on the column name) keeps the
# dept_name column from BOTH inputs, so a plain reference to "dept_name"
# on this result would be ambiguous.
join_with_duplicates = df_emp.join(
    df_dept,
    df_emp.dept_name == df_dept.dept_name,
    "inner",
)

# Remove the right-hand copy by Column reference (a string name would not
# say which of the two to drop), then fix the final column order.
clean_df = (
    join_with_duplicates
    .drop(df_dept.dept_name)
    .select("emp_id", "name", "dept_name", "location", "salary")
)
clean_df.show()

+------+-------+---------+---------+------+
|emp_id|   name|dept_name| location|salary|
+------+-------+---------+---------+------+
|     4|Lakshmi|  Finance|  Chennai| 60000|
|     2|  Priya|       HR|Hyderabad| 45000|
|     1|   Ravi|       IT|Bengaluru| 50000|
|     3|  Ankit|       IT|Bengaluru| 55000|
+------+-------+---------+---------+------+



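### Validate Join Results
Uses a left anti join to count employees whose `dept_name` has no match in `df_dept` (this includes employees with a `null` department).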

In [13]:
# Count the employees that a left anti join returns, i.e. those with no
# matching row in df_dept.
missing_dept_employees = (
    df_emp
    .join(df_dept, on="dept_name", how="left_anti")
    .count()
)
print(f"Employees with missing department: {missing_dept_employees}")

Employees with missing department: 1
