# Joins and Set Operations - Practice Notebook

This notebook covers **DataFrame Joins** and **Set Operations** in Spark SQL.

## Learning Objectives
- Master different types of joins (inner, left, right, full outer)
- Understand join conditions and performance considerations
- Practice set operations (union, intersect, except)
- Handle duplicate columns and join optimization
- Work with complex join scenarios

## Sections
1. **Basic Join Operations**
2. **Complex Join Scenarios**
3. **Set Operations**
4. **Practice Exercises**

---


In [None]:
# Setup
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Start (or reuse) the Spark session that every cell in this notebook shares.
spark = (
    SparkSession.builder
    .appName("Joins and Set Operations")
    .getOrCreate()
)

# ---------------------------------------------------------------------------
# Raw sample rows. Column order matches the schema lists passed to
# createDataFrame further down.
# ---------------------------------------------------------------------------

# (customer_id, name, email, city)
customers_data = [
    (1, "Alice", "alice@email.com", "New York"),
    (2, "Bob", "bob@email.com", "Los Angeles"),
    (3, "Charlie", "charlie@email.com", "Chicago"),
    (4, "Diana", "diana@email.com", "Houston"),
    (5, "Eve", "eve@email.com", "Phoenix"),
]

# (order_id, customer_id, order_date, amount) -- order_date is a plain string.
orders_data = [
    (101, 1, "2024-01-15", 250.00),
    (102, 2, "2024-01-16", 175.50),
    (103, 1, "2024-01-17", 320.00),
    (104, 3, "2024-01-18", 89.99),
    # customer_id 6 has no row in customers_data; this order is deliberately
    # orphaned so the outer/anti joins have something to show.
    (105, 6, "2024-01-19", 150.00),
    (106, 2, "2024-01-20", 200.00),
]

# (product_id, product_name, category, price)
products_data = [
    (1, "Laptop", "Electronics", 1200.00),
    (2, "Book", "Education", 25.00),
    (3, "Shirt", "Clothing", 35.00),
    (4, "Phone", "Electronics", 800.00),
]

# (order_id, product_id, quantity, total)
order_items_data = [
    (101, 1, 1, 1200.00),
    (102, 2, 7, 175.00),
    (103, 1, 1, 1200.00),
    (103, 3, 4, 140.00),
    (104, 2, 3, 75.00),
    (105, 4, 1, 800.00),
    (106, 3, 6, 210.00),
]

# ---------------------------------------------------------------------------
# Turn the row lists into DataFrames; column types are inferred from the data.
# ---------------------------------------------------------------------------
customers = spark.createDataFrame(
    customers_data,
    schema=["customer_id", "name", "email", "city"],
)
orders = spark.createDataFrame(
    orders_data,
    schema=["order_id", "customer_id", "order_date", "amount"],
)
products = spark.createDataFrame(
    products_data,
    schema=["product_id", "product_name", "category", "price"],
)
order_items = spark.createDataFrame(
    order_items_data,
    schema=["order_id", "product_id", "quantity", "total"],
)

# Show each dataset under its own heading.
print("Sample datasets:")
for heading, frame in (
    ("Customers:", customers),
    ("Orders:", orders),
    ("Products:", products),
    ("Order Items:", order_items),
):
    print(heading)
    frame.show()


## 1. Basic Join Operations

Start with fundamental join operations between two DataFrames.


In [None]:
# Basic joins: every example pairs `customers` with `orders` on customer_id.
# In the sample data, customers 4 and 5 have no orders and order 105 points at
# customer 6, who is not in `customers` -- watch how each join type treats them.
print("=== BASIC JOIN OPERATIONS ===")

print("1. Inner Join (default):")
# No join type is given, so the default (inner) is used.
inner_join = customers.join(orders, on="customer_id")
inner_join.show()

print("\n2. Inner Join with explicit condition:")
# Same matching rule, written as a Column expression instead of a column name.
# Joining on an expression keeps customer_id from both inputs in the result,
# whereas joining on the name (as above) yields a single customer_id column.
same_customer = customers["customer_id"] == orders["customer_id"]
inner_join_explicit = customers.join(orders, on=same_customer)
inner_join_explicit.show()

print("\n3. Left (Left Outer) Join:")
# Every customer is kept; order columns are null where no order matches.
left_join = customers.join(orders, on="customer_id", how="left")
left_join.show()

print("\n4. Right (Right Outer) Join:")
# Every order is kept; customer columns are null where no customer matches.
right_join = customers.join(orders, on="customer_id", how="right")
right_join.show()

print("\n5. Full Outer Join:")
# Union of the left and right outer results: unmatched rows from both sides.
full_join = customers.join(orders, on="customer_id", how="full")
full_join.show()

print("\n6. Left Semi Join (like EXISTS):")
# Customers that have at least one order; only customer columns are returned.
left_semi = customers.join(orders, on="customer_id", how="left_semi")
left_semi.show()

print("\n7. Left Anti Join (like NOT EXISTS):")
# Customers that have no order at all; only customer columns are returned.
left_anti = customers.join(orders, on="customer_id", how="left_anti")
left_anti.show()


## 2. Complex Join Scenarios

Handle more complex join scenarios including multiple tables and conditions.


In [None]:
# Complex joins: multi-table chains, a self join, a non-equi condition, and a
# join against an aggregated DataFrame.
print("=== COMPLEX JOIN SCENARIOS ===")

print("1. Multiple table joins:")
# Walk the chain customers -> orders -> order_items -> products, joining each
# hop on the key the two sides share.
multi_join = (
    customers
    .join(orders, on="customer_id")
    .join(order_items, on="order_id")
    .join(products, on="product_id")
)

multi_join.select("name", "order_id", "product_name", "quantity", "total").show()

print("\n2. Self join:")
# Pair up customers who live in the same city. The two aliases let the join
# condition tell the "left" copy of the table from the "right" copy.
customers_aliased = customers.alias("c1")
customers_aliased2 = customers.alias("c2")

same_city = F.col("c1.city") == F.col("c2.city")
# Excludes a customer being paired with themselves. Because the test is `!=`,
# each matching pair appears twice: (A, B) and (B, A).
different_customer = F.col("c1.customer_id") != F.col("c2.customer_id")

self_join = customers_aliased.join(
    customers_aliased2,
    on=same_city & different_customer,
).select(
    F.col("c1.name").alias("customer1"),
    F.col("c2.name").alias("customer2"),
    F.col("c1.city"),
)
self_join.show()

print("\n3. Join with complex conditions:")
# Orders whose amount exceeds the average product price. The average is
# pulled back to the driver as a plain Python number first, then used as a
# literal inside the join condition.
avg_price = products.agg(F.avg("price")).first()[0]

belongs_to_customer = orders["customer_id"] == customers["customer_id"]
above_avg_price = orders["amount"] > avg_price

complex_join = orders.join(
    customers,
    on=belongs_to_customer & above_avg_price,
).select(
    "name",
    "order_id",
    "amount",
    F.lit(avg_price).alias("avg_product_price"),
)
complex_join.show()

print("\n4. Join with aggregations:")
# Per-customer order statistics, computed before the join so that the join
# sees one summary row per customer_id.
order_stats = [
    F.count("*").alias("order_count"),
    F.sum("amount").alias("total_spent"),
    F.avg("amount").alias("avg_order_value"),
]
customer_summary = orders.groupBy("customer_id").agg(*order_stats)

# Left join keeps customers with no orders; fillna(0) replaces their missing
# summary values with 0.
customer_with_summary = customers.join(
    customer_summary, on="customer_id", how="left"
).fillna(0)
customer_with_summary.show()


## 3. Set Operations

Explore set operations like union, intersect, and except (exposed in PySpark as `subtract` / `exceptAll`).


In [None]:
# Set operations: union, intersect and except between DataFrames that share
# the same column layout.
print("=== SET OPERATIONS ===")

# Two small datasets that overlap on exactly one row: (3, "C").
df1 = spark.createDataFrame([(1, "A"), (2, "B"), (3, "C")], ["id", "value"])
df2 = spark.createDataFrame([(3, "C"), (4, "D"), (5, "E")], ["id", "value"])

print("Dataset 1:")
df1.show()
print("Dataset 2:")
df2.show()

print("\n1. Union (includes duplicates):")
# union() behaves like SQL UNION ALL and matches columns by position, so the
# shared row (3, "C") shows up twice.
union_result = df1.union(df2)
union_result.show()

print("\n2. Union with distinct:")
# Adding distinct() gives SQL UNION semantics (duplicates removed).
union_distinct = df1.union(df2).distinct()
union_distinct.show()

print("\n3. Intersect (common rows):")
intersect_result = df1.intersect(df2)
intersect_result.show()

print("\n4. Except (rows in df1 but not in df2):")
# PySpark has no DataFrame.except_ method (`except` is a Python keyword, and
# the Scala `except` is exposed under a different name). subtract() is the
# EXCEPT DISTINCT equivalent; use exceptAll() to keep duplicate rows.
except_result = df1.subtract(df2)
except_result.show()

print("\n5. Except (rows in df2 but not in df1):")
except_result2 = df2.subtract(df1)
except_result2.show()

# Real-world example with customer data
print("\n=== REAL-WORLD SET OPERATIONS ===")

# Both segments start from the same customer/order pairing, so build it once.
customer_orders = customers.join(orders, "customer_id")

# Segment 1: customers whose orders add up to more than 300.
high_value_customers = (
    customer_orders
    .groupBy("customer_id", "name")
    .agg(F.sum("amount").alias("total_spent"))
    .filter(F.col("total_spent") > 300)
    .select("customer_id", "name")
)

# Segment 2: customers with more than one order.
frequent_customers = (
    customer_orders
    .groupBy("customer_id", "name")
    .agg(F.count("*").alias("order_count"))
    .filter(F.col("order_count") > 1)
    .select("customer_id", "name")
)

print("High value customers:")
high_value_customers.show()

print("Frequent customers:")
frequent_customers.show()

# Both segments were projected to (customer_id, name), so they can be
# compared row-for-row with set operations.
print("Customers who are both high value AND frequent:")
high_value_customers.intersect(frequent_customers).show()

print("High value customers who are NOT frequent:")
high_value_customers.subtract(frequent_customers).show()


## 4. Practice Exercises

Complete these exercises to master joins and set operations.


In [None]:
# Extra datasets used only by the practice exercises below.

# (dept_id, dept_name, manager)
departments_data = [
    (1, "Engineering", "Alice Johnson"),
    (2, "Sales", "Bob Smith"),
    (3, "Marketing", "Charlie Brown"),
]

# (emp_id, name, dept_id, salary)
employees_data = [
    (1, "John", 1, 75000),
    (2, "Jane", 1, 80000),
    (3, "Mike", 2, 65000),
    (4, "Sarah", 2, 70000),
    (5, "Tom", 3, 60000),
    # dept_id is None: Lisa belongs to no department, which is what makes the
    # outer-join exercise interesting.
    (6, "Lisa", None, 55000),
]

# (project_id, project_name, dept_id)
projects_data = [
    (101, "Project Alpha", 1),
    (102, "Project Beta", 1),
    (103, "Project Gamma", 2),
    (104, "Project Delta", 3),
]

# (emp_id, project_id, start_date) -- employee 1 appears twice (two projects),
# employee 6 not at all.
assignments_data = [
    (1, 101, "2024-01-01"),
    (2, 101, "2024-01-01"),
    (3, 103, "2024-01-15"),
    (4, 103, "2024-01-15"),
    (5, 104, "2024-02-01"),
    (1, 102, "2024-02-15"),
]

departments = spark.createDataFrame(
    departments_data,
    schema=["dept_id", "dept_name", "manager"],
)
employees = spark.createDataFrame(
    employees_data,
    schema=["emp_id", "name", "dept_id", "salary"],
)
projects = spark.createDataFrame(
    projects_data,
    schema=["project_id", "project_name", "dept_id"],
)
assignments = spark.createDataFrame(
    assignments_data,
    schema=["emp_id", "project_id", "start_date"],
)

# Show each exercise dataset under its own heading.
print("Exercise datasets:")
for heading, frame in (
    ("Departments:", departments),
    ("Employees:", employees),
    ("Projects:", projects),
    ("Assignments:", assignments),
):
    print(heading)
    frame.show()

# Each exercise prints its task list, then leaves a commented starting point
# for the learner to fill in.

print(
    "\n=== EXERCISE 1: Basic Joins ===",
    "TODO: Create a report showing:",
    "1. Employee name, department name, and salary",
    "2. Include employees without departments",
    "3. Show NULL for employees without departments",
    sep="\n",
)

# Your code here:
# employee_dept_report = employees.join(departments, ...)

print(
    "\n=== EXERCISE 2: Multi-table Joins ===",
    "TODO: Create a comprehensive report showing:",
    "1. Employee name, department name, project name, assignment start date",
    "2. Only show employees who are assigned to projects",
    "3. Order by employee name and project name",
    sep="\n",
)

# Your code here:
# Note: employees and projects both have a dept_id column, so be explicit
# about which one you join to departments.
# comprehensive_report = employees.join(assignments, ...).join(projects, ...).join(departments, ...)

print(
    "\n=== EXERCISE 3: Aggregated Joins ===",
    "TODO: Create a department summary showing:",
    "1. Department name, manager name",
    "2. Number of employees in each department",
    "3. Average salary in each department",
    "4. Number of projects in each department",
    sep="\n",
)

# Your code here:
# dept_summary = departments.join(...)

print(
    "\n=== EXERCISE 4: Set Operations ===",
    "TODO: Find:",
    "1. Employees who are assigned to projects",
    "2. Employees who are NOT assigned to any project",
    "3. Departments that have both employees and projects",
    sep="\n",
)

# Your code here:
# Set operations need both sides to have the same columns, so project
# employees down to (emp_id, name) before subtracting. distinct() removes the
# repeat caused by an employee holding more than one assignment.
# assigned_employees = employees.join(assignments, ...).select("emp_id", "name").distinct()
# unassigned_employees = employees.select("emp_id", "name").subtract(assigned_employees)

print(
    "\n=== EXERCISE 5: Complex Analysis ===",
    "TODO: Create an analysis showing:",
    "1. For each department, show the employee with the highest salary",
    "2. Show departments where the manager is not an employee",
    "3. Find projects that have more than one employee assigned",
    sep="\n",
)

# Your code here:
# Use window functions, joins, and aggregations
