In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType
from pyspark.sql.functions import col, avg, round

# Start SparkSession
spark = SparkSession.builder.appName("SalaryComparison").getOrCreate()

# Define schema
schema = StructType([
    StructField("age", LongType(), True),
    StructField("department", StringType(), True),
    StructField("employee_title", StringType(), True),
    StructField("first_name", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("id", LongType(), False),  # employee ID
    StructField("last_name", StringType(), True),
    StructField("manager_id", LongType(), True),
    StructField("salary", LongType(), True)
])

# Sample data
data = [
    (30, "Engineering", "Engineer", "Alice", "F", 1, "Smith", 4, 9000),
    (32, "Engineering", "Engineer", "Bob", "M", 2, "Brown", 4, 8500),
    (29, "Engineering", "Engineer", "Charlie", "M", 3, "Davis", 4, 8800),
    (45, "Engineering", "Manager", "David", "M", 4, "Wilson", None, 12000),
    (25, "HR", "HR Specialist", "Eve", "F", 5, "Taylor", 6, 6000),
    (26, "HR", "HR Specialist", "Frank", "M", 6, "Moore", None, 7500),
    (28, "Marketing", "Marketer", "Grace", "F", 7, "Anderson", 8, 6700),
    (35, "Marketing", "Manager", "Henry", "M", 8, "Thomas", None, 9000)
]

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Show the dataset

df.createOrReplaceTempView("Employees")

df.show()


+---+-----------+--------------+----------+------+---+---------+----------+------+
|age| department|employee_title|first_name|gender| id|last_name|manager_id|salary|
+---+-----------+--------------+----------+------+---+---------+----------+------+
| 30|Engineering|      Engineer|     Alice|     F|  1|    Smith|         4|  9000|
| 32|Engineering|      Engineer|       Bob|     M|  2|    Brown|         4|  8500|
| 29|Engineering|      Engineer|   Charlie|     M|  3|    Davis|         4|  8800|
| 45|Engineering|       Manager|     David|     M|  4|   Wilson|      null| 12000|
| 25|         HR| HR Specialist|       Eve|     F|  5|   Taylor|         6|  6000|
| 26|         HR| HR Specialist|     Frank|     M|  6|    Moore|      null|  7500|
| 28|  Marketing|      Marketer|     Grace|     F|  7| Anderson|         8|  6700|
| 35|  Marketing|       Manager|     Henry|     M|  8|   Thomas|      null|  9000|
+---+-----------+--------------+----------+------+---+---------+----------+------+



In [5]:
# md Oracle is comparing the monthly wages of their employees in each department to those of their managers and co-workers.
# You have been tasked with creating a table that compares an employee's salary to that of their manager and to the average salary of their department.
# It is expected that the department manager's salary and the average salary of employee's from that department are in their own separate column.
# Order the employee's salary from highest to lowest based on their department.
# Your output should contain the department, employee id, salary of that employee, salary of that employee's manager and the average salary from employee's within that department rounded to the nearest whole number.


In [9]:
spark.sql("""
    SELECT 
        e1.department, 
        e1.id AS employee_id, 
        e1.salary AS employee_salary, 
        e2.salary AS manager_salary,
        ROUND(AVG(e1.salary) OVER(PARTITION BY e1.department)) AS avg_department_salary
    FROM Employees e1
    LEFT JOIN Employees e2
        ON e1.manager_id = e2.id
""").show()


+-----------+-----------+---------------+--------------+---------------------+
| department|employee_id|employee_salary|manager_salary|avg_department_salary|
+-----------+-----------+---------------+--------------+---------------------+
|Engineering|          1|           9000|         12000|               9575.0|
|Engineering|          2|           8500|         12000|               9575.0|
|Engineering|          3|           8800|         12000|               9575.0|
|Engineering|          4|          12000|          null|               9575.0|
|         HR|          5|           6000|          7500|               6750.0|
|         HR|          6|           7500|          null|               6750.0|
|  Marketing|          7|           6700|          9000|               7850.0|
|  Marketing|          8|           9000|          null|               7850.0|
+-----------+-----------+---------------+--------------+---------------------+

