In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, dense_rank, max

# Spark session shared by every PySpark cell below.
# Databricks notebooks already expose `spark`; building it here keeps the
# notebook runnable in other environments as well.
spark = (
    SparkSession.builder
    .appName("LeetCode184")
    .getOrCreate()
)




In [0]:
# --- 1. Create Employee DataFrame ---
# Sample rows for the Employee table, one tuple per employee in the order
# given by employee_columns: (id, name, salary, departmentId).
employee_columns = ["id", "name", "salary", "departmentId"]
employee_data = [
    (1, "Joe", 70000, 1),
    (2, "Jim", 90000, 1),
    (3, "Henry", 80000, 2),
    (4, "Sam", 60000, 2),
    (5, "Max", 90000, 1),
]
employee_df = spark.createDataFrame(employee_data, schema=employee_columns)

# Show the rows for a quick visual check.
print("Employee DataFrame:")
employee_df.show()

# Expose the DataFrame to Spark SQL under the view name "Employee"
# (used by the PySpark SQL solution).
employee_df.createOrReplaceTempView("Employee")

In [0]:
# --- 2. Create Department DataFrame ---
# Sample rows for the Department table, one tuple per department in the order
# given by department_columns: (id, name).
department_columns = ["id", "name"]
department_data = [
    (1, "IT"),
    (2, "Sales"),
]
department_df = spark.createDataFrame(department_data, schema=department_columns)

# Show the rows for a quick visual check.
print("Department DataFrame:")
department_df.show()

# Expose the DataFrame to Spark SQL under the view name "Department"
# (used by the PySpark SQL solution).
department_df.createOrReplaceTempView("Department")

In [0]:
# --- Solution 1: Using PySpark SQL ---
# Runs against the "Employee" and "Department" temp views registered above.
# The subquery computes MAX(salary) per departmentId; the outer query keeps
# every employee whose (departmentId, salary) pair matches one of those maxima,
# so employees tied on the top salary are all returned. Rows are ordered by
# department name, then employee name.
print("--- Solution 1: PySpark SQL ---")
spark_sql_sol = spark.sql("""
SELECT d.name AS Department, e.name AS Employee, e.salary AS Salary
FROM Employee e
JOIN Department d ON e.departmentId = d.id
WHERE (e.departmentId, e.salary) IN (
    SELECT departmentId, MAX(salary) FROM Employee GROUP BY departmentId
)
ORDER BY d.name, e.name
""")
print("PySpark SQL Solution Output:")
spark_sql_sol.show()

In [0]:
# --- Solution 2: Using PySpark DataFrame API ---
print("--- Solution 2: PySpark DataFrame API ---")

# col, dense_rank and Window are imported in the notebook's first cell.

# 1. Join each employee to their department. The department id is carried
#    along so the ranking below partitions on the key rather than on the
#    display name: two departments may share a name, and Solution 1 groups by
#    departmentId as well.
joined_with_id_df = (
    employee_df.alias("e")
    .join(department_df.alias("d"), col("e.departmentId") == col("d.id"), "inner")
    .select(
        col("d.id").alias("DepartmentId"),
        col("d.name").alias("Department"),
        col("e.name").alias("Employee"),
        col("e.salary").alias("Salary"),
    )
)
# Same three-column projection (Department, Employee, Salary) as before.
joined_df = joined_with_id_df.drop("DepartmentId")

# 2. Define a window specification:
#    one partition per department id, highest salary first.
window_spec = Window.partitionBy("DepartmentId").orderBy(col("Salary").desc())

# 3. Rank salaries within each department. dense_rank gives every employee
#    tied on the top salary rank 1. The helper id column is dropped afterwards
#    so ranked_df keeps its columns: Department, Employee, Salary, rank.
ranked_df = (
    joined_with_id_df
    .withColumn("rank", dense_rank().over(window_spec))
    .drop("DepartmentId")
)

# 4. Keep only the top-ranked rows and order them like Solution 1.
final_df = (
    ranked_df
    .filter(col("rank") == 1)
    .select("Department", "Employee", "Salary")
    .orderBy("Department", "Employee")
)
print("PySpark DataFrame API Solution Output:")
display(final_df)

In [0]:
import builtins # Import builtins to explicitly use Python's native max function

def department_highest_salary_python(employees_data, departments_data):
    """
    Return the top earner(s) of every department, computed in plain Python.

    Employees tied for a department's highest salary are all reported.
    Employees whose departmentId has no name in ``departments_data`` (unknown
    id, or a name of None) are left out of the result.

    Args:
        employees_data (list of dict): Employee rows; each dict is read through
            its 'departmentId', 'name' and 'salary' keys. Iterated twice.
        departments_data (list of dict): Department rows; each dict is read
            through its 'id' and 'name' keys.

    Returns:
        list of dict: Rows with the keys 'Department', 'Employee' and 'Salary',
        ordered by department name and then by employee name.
    """
    # Department id -> department name; this is the "join" lookup.
    name_by_dept = dict((dept['id'], dept['name']) for dept in departments_data)

    # First pass: record the highest salary seen for each department id.
    top_pay_by_dept = {}
    for record in employees_data:
        dept_key, pay = record['departmentId'], record['salary']
        # Skip salaries that do not beat the maximum already on record.
        if dept_key in top_pay_by_dept and not pay > top_pay_by_dept[dept_key]:
            continue
        top_pay_by_dept[dept_key] = pay

    # Second pass: collect every employee whose salary equals their
    # department's maximum, provided the department has a name.
    winners = []
    for record in employees_data:
        dept_key = record['departmentId']
        worker = record['name']
        pay = record['salary']
        dept_label = name_by_dept.get(dept_key)
        if (
            dept_label is not None
            and dept_key in top_pay_by_dept
            and pay == top_pay_by_dept[dept_key]
        ):
            winners.append({
                "Department": dept_label,
                "Employee": worker,
                "Salary": pay
            })

    # Deterministic ordering: department name first, employee name second.
    return sorted(winners, key=lambda row: (row['Department'], row['Employee']))

# --- Example Usage ---

# Employee data as provided in the LeetCode problem
employee_table = [
    {'id': 1, 'name': 'Joe', 'salary': 70000, 'departmentId': 1},
    {'id': 2, 'name': 'Jim', 'salary': 90000, 'departmentId': 1},
    {'id': 3, 'name': 'Henry', 'salary': 80000, 'departmentId': 2},
    {'id': 4, 'name': 'Sam', 'salary': 60000, 'departmentId': 2},
    {'id': 5, 'name': 'Max', 'salary': 90000, 'departmentId': 1},
]

# Department data as provided in the LeetCode problem
department_table = [
    {'id': 1, 'name': 'IT'},
    {'id': 2, 'name': 'Sales'},
]

# Get the solution
highest_salary_employees = department_highest_salary_python(employee_table, department_table)

print("Python Solution Output:")
for row in highest_salary_employees:
    print(row)

# Expected output, checked rather than only documented: Jim and Max tie for
# the top IT salary, so both are listed; Henry is the top earner in Sales.
expected_highest_salary_employees = [
    {'Department': 'IT', 'Employee': 'Jim', 'Salary': 90000},
    {'Department': 'IT', 'Employee': 'Max', 'Salary': 90000},
    {'Department': 'Sales', 'Employee': 'Henry', 'Salary': 80000},
]
assert highest_salary_employees == expected_highest_salary_employees, (
    "pure-Python solution does not match the expected LeetCode 184 output"
)
