In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Define schema for Employee table
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

employee_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("departmentId", IntegerType(), True)
])

# Define schema for Department table
department_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])

# Create Employee DataFrame
employee_data = [
    (1, "Joe", 85000, 1),
    (2, "Henry", 80000, 2),
    (3, "Sam", 60000, 2),
    (4, "Max", 90000, 1),
    (5, "Janet", 69000, 1),
    (6, "Randy", 85000, 1),
    (7, "Will", 70000, 1)
]

employee_df = spark.createDataFrame(employee_data, schema=employee_schema)

# Create Department DataFrame
department_data = [
    (1, "IT"),
    (2, "Sales")
]

department_df = spark.createDataFrame(department_data, schema=department_schema)

# Create temporary views for SQL queries
employee_df.createOrReplaceTempView("Employee")
department_df.createOrReplaceTempView("Department")

# Create persistent tables (Delta format)
print("Employee and Department tables and views created successfully.")


Employee and Department tables and views created successfully.


In [8]:
spark.sql("""
    With cte as (SELECT id, departmentId, name, salary,
           DENSE_RANK() OVER (PARTITION BY departmentId ORDER BY salary DESC) AS rn
    FROM Employee)
    
    select Department.name AS Department, cte.name AS Employee, cte.salary AS Salary from cte join Department ON cte.departmentId = Department.id where rn < 4
""").show()

+----------+--------+------+
|Department|Employee|Salary|
+----------+--------+------+
|        IT|    Will| 70000|
|        IT|   Randy| 85000|
|        IT|     Joe| 85000|
|        IT|     Max| 90000|
|     Sales|     Sam| 60000|
|     Sales|   Henry| 80000|
+----------+--------+------+



In [17]:
from pyspark.sql.functions import *
from pyspark.sql import Window

windowSpec = Window.partitionBy(col("departmentId")).orderBy(col("salary"))
rownumberdf = employee_df.withColumn(
        "rn", dense_rank().over(windowSpec)
)


In [20]:
result_df = rownumberdf.alias("r").join(department_df.alias("d"), col("r.departmentId") == col("d.id"), "inner")


In [21]:
result_df.show()

+---+-----+------+------------+---+---+-----+
| id| name|salary|departmentId| rn| id| name|
+---+-----+------+------------+---+---+-----+
|  4|  Max| 90000|           1|  4|  1|   IT|
|  6|Randy| 85000|           1|  3|  1|   IT|
|  1|  Joe| 85000|           1|  3|  1|   IT|
|  7| Will| 70000|           1|  2|  1|   IT|
|  5|Janet| 69000|           1|  1|  1|   IT|
|  2|Henry| 80000|           2|  2|  2|Sales|
|  3|  Sam| 60000|           2|  1|  2|Sales|
+---+-----+------+------------+---+---+-----+

