In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session for this notebook (an already-running session is
# reused by getOrCreate), labelled "EmployeeComparison".
spark = (
    SparkSession.builder
    .appName("EmployeeComparison")
    .getOrCreate()
)

In [2]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ['Id', 'Name', 'Salary', 'ManagerId']

# Sample employee rows. ManagerId holds the Id of another row, or None where
# no manager is recorded.
data = [
    (1, 'Joe', 70000, 3),
    (2, 'Henry', 80000, 4),
    (3, 'Sam', 60000, None),
    (4, 'Max', 90000, None),
]

# Build the DataFrame (column types are inferred from the tuples) and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+-----+------+---------+
| Id| Name|Salary|ManagerId|
+---+-----+------+---------+
|  1|  Joe| 70000|        3|
|  2|Henry| 80000|        4|
|  3|  Sam| 60000|     null|
|  4|  Max| 90000|     null|
+---+-----+------+---------+



In [3]:
# Register df as the temporary view `employee` so it can be referenced from
# SQL text passed to spark.sql (an existing view of that name is replaced).
df.createOrReplaceTempView('employee')

In [4]:
# Self-join of the `employee` view: e1 is the employee row, e2 is the row of
# the manager that e1.ManagerId points to. Only employees earning more than
# that manager are kept.
# - No trailing ';' inside the SQL text: the Spark 2.x parser rejects it.
# - The view name is written exactly as it was registered (`employee`), so the
#   lookup also resolves when spark.sql.caseSensitive is enabled.
query = """
SELECT e1.Id, e1.Name, e1.Salary AS EmployeeSalary, e2.Name AS ManagerName, e2.Salary AS ManagerSalary
FROM employee e1
JOIN employee e2 ON e1.ManagerId = e2.Id
WHERE e1.Salary > e2.Salary
"""

In [5]:
# Execute the SQL text held in `query` and print the resulting DataFrame.
result = spark.sql(query)
result.show()

+---+----+--------------+-----------+-------------+
| Id|Name|EmployeeSalary|ManagerName|ManagerSalary|
+---+----+--------------+-----------+-------------+
|  1| Joe|         70000|        Sam|        60000|
+---+----+--------------+-----------+-------------+



The same comparison using the PySpark DataFrame API

In [6]:
# Two aliased handles on the same DataFrame, named 'e1' and 'e2', so that the
# two sides of a self-join can be told apart by qualifier.
# NOTE(review): this cell only creates the aliases; no join happens here.
e1, e2 = (df.alias(tag) for tag in ('e1', 'e2'))

In [11]:
# Inner self-join: the 'employee' side is matched to the 'manager' side whose
# Id equals the employee's ManagerId. Rows with a null ManagerId find no match
# and are therefore absent from the result. Both sides keep all their columns,
# so column names repeat and must be addressed via the alias qualifier.
join_df = (
    df.alias('employee')
    .join(
        df.alias('manager'),
        on=col('employee.ManagerId') == col('manager.Id'),
        how='inner',
    )
)
join_df.show()

+---+-----+------+---------+---+----+------+---------+
| Id| Name|Salary|ManagerId| Id|Name|Salary|ManagerId|
+---+-----+------+---------+---+----+------+---------+
|  1|  Joe| 70000|        3|  3| Sam| 60000|     null|
|  2|Henry| 80000|        4|  4| Max| 90000|     null|
+---+-----+------+---------+---+----+------+---------+



In [12]:
# Keep only the joined pairs in which the employee's salary is strictly higher
# than the salary of the matched manager, then print them.
filter_df = join_df.where(
    col('employee.Salary') > col('manager.Salary')
)
filter_df.show()

+---+----+------+---------+---+----+------+---------+
| Id|Name|Salary|ManagerId| Id|Name|Salary|ManagerId|
+---+----+------+---------+---+----+------+---------+
|  1| Joe| 70000|        3|  3| Sam| 60000|     null|
+---+----+------+---------+---+----+------+---------+

