You are given the data of employees along with their salary and department. Write an SQL query to find the list of employees whose salary is greater than the average employee salary of the company. However, when calculating the company average salary to compare with an employee's salary, do not consider the salaries of that employee's own department. Display the output in ascending order of employee IDs.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the Spark session for this notebook (created if none exists yet)
spark = (
    SparkSession.builder
    .appName("Employee Data")
    .getOrCreate()
)

# Employee records, one tuple per employee: (emp_id, salary, department)
data = [
    (100, 40000, "Analytics"),
    (101, 30000, "Analytics"),
    (102, 50000, "Analytics"),
    (103, 45000, "Engineering"),
    (104, 48000, "Engineering"),
    (105, 51000, "Engineering"),
    (106, 46000, "Science"),
    (107, 38000, "Science"),
    (108, 37000, "Science"),
    (109, 42000, "Analytics"),
    (110, 55000, "Engineering"),
    (111, 39000, "Science"),
    (112, 47000, "Analytics"),
    (113, 43000, "Engineering"),
    (114, 36000, "Science"),
]

# Column names, listed in the same order as the tuple fields above
columns = ["emp_id", "salary", "department"]

# Build the employee DataFrame from the rows and column names, then preview it
df = spark.createDataFrame(data=data, schema=columns)
df.show()


+------+------+-----------+
|emp_id|salary| department|
+------+------+-----------+
|   100| 40000|  Analytics|
|   101| 30000|  Analytics|
|   102| 50000|  Analytics|
|   103| 45000|Engineering|
|   104| 48000|Engineering|
|   105| 51000|Engineering|
|   106| 46000|    Science|
|   107| 38000|    Science|
|   108| 37000|    Science|
|   109| 42000|  Analytics|
|   110| 55000|Engineering|
|   111| 39000|    Science|
|   112| 47000|  Analytics|
|   113| 43000|Engineering|
|   114| 36000|    Science|
+------+------+-----------+



In [0]:

# Per-department average salary. Despite the variable name, each row holds the
# average of that department itself; it is kept here because a later cell
# displays it.
avg_salary_other_departments = (
    df.groupBy("department")
    .agg(F.avg("salary").alias("avg_salary"))
)

# Per-department salary total and head count. Averaging the per-department
# averages would weight every department equally and only gives the right
# answer when all departments have the same size, so the "company average
# excluding my department" is derived from sums and counts instead.
dept_salary_totals = (
    df.groupBy("department")
    .agg(
        F.sum("salary").alias("total_salary"),
        F.count("salary").alias("emp_count"),
    )
)

# For every employee, pair them with the totals of every OTHER department,
# compute the employee-weighted average salary over those departments, and keep
# only employees earning strictly more than that average.
result_df = (
    df.alias("e1")
    .join(
        dept_salary_totals.alias("e2"),
        F.col("e1.department") != F.col("e2.department"),
    )
    .groupBy("e1.emp_id", "e1.salary", "e1.department")
    .agg(
        (F.sum("e2.total_salary") / F.sum("e2.emp_count")).alias(
            "avg_other_department_salary"
        )
    )
    # The task asks for "greater than", so the comparison is strict.
    .filter(F.col("e1.salary") > F.col("avg_other_department_salary"))
    .select("e1.emp_id", "e1.salary", "e1.department")
    .orderBy("emp_id")
)

# Show the result
result_df.show()


+------+------+-----------+
|emp_id|salary| department|
+------+------+-----------+
|   102| 50000|  Analytics|
|   103| 45000|Engineering|
|   104| 48000|Engineering|
|   105| 51000|Engineering|
|   106| 46000|    Science|
|   110| 55000|Engineering|
|   112| 47000|  Analytics|
|   113| 43000|Engineering|
+------+------+-----------+



In [0]:
# Inspect the intermediate DataFrame. Despite its name, it holds one row per
# department with that department's own average salary (column "avg_salary").
avg_salary_other_departments.show()

+-----------+----------+
| department|avg_salary|
+-----------+----------+
|  Analytics|   41800.0|
|Engineering|   48400.0|
|    Science|   39200.0|
+-----------+----------+



In [0]:
# `avg_salary_dept` was not defined in any earlier cell, so this cell failed
# with a NameError on a fresh top-to-bottom run. The definition below is
# reconstructed from the output this cell displays (columns "department" and
# "avg-salary"): the average salary of each department.
avg_salary_dept = (
    df.groupBy("department")
    .agg(F.avg("salary").alias("avg-salary"))
)
avg_salary_dept.show()

+-----------+----------+
| department|avg-salary|
+-----------+----------+
|  Analytics|   41800.0|
|Engineering|   48400.0|
|    Science|   39200.0|
+-----------+----------+

