In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DoubleType
from datetime import datetime

# Build (or reuse) the SparkSession that every later cell refers to as `spark`.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("EmployeeSQLPractice")
    .getOrCreate()
)

# 1. Employees Table
# Column layout of the `employees` view; all four fields are declared non-nullable.
employees_schema = StructType(
    [
        StructField("employee_id", IntegerType(), False),
        StructField("name", StringType(), False),
        StructField("join_date", DateType(), False),
        StructField("department", StringType(), False),
    ]
)

# Raw rows: (employee_id, name, join_date as 'YYYY-MM-DD' text, department).
employees_data = [
    (1, 'Alice', '2018-06-15', 'IT'),
    (2, 'Bob', '2019-02-10', 'Finance'),
    (3, 'Charlie', '2017-09-20', 'HR'),
    (4, 'David', '2020-01-05', 'IT'),
    (5, 'Eve', '2016-07-30', 'Finance'),
    (6, 'Sumit', '2016-06-30', 'Finance'),
]

# Turn the text dates into datetime.date values for the DateType column.
employees_data_typed = [
    (
        employee_id,
        employee_name,
        datetime.strptime(joined_on, "%Y-%m-%d").date(),
        department,
    )
    for employee_id, employee_name, joined_on, department in employees_data
]

# Materialise the DataFrame and expose it to Spark SQL as `employees`.
employees_df = spark.createDataFrame(employees_data_typed, schema=employees_schema)
employees_df.createOrReplaceTempView("employees")


# 2. Salary History Table
# Column layout of the `salary_history` view; only `promotion` is nullable.
salary_schema = StructType(
    [
        StructField("employee_id", IntegerType(), False),
        StructField("change_date", DateType(), False),
        StructField("salary", DoubleType(), False),
        StructField("promotion", StringType(), True),
    ]
)

# Raw rows: (employee_id, change_date as 'YYYY-MM-DD' text, salary, promotion flag).
salary_data = [
    (1, '2018-06-15', 50000, 'No'),
    (1, '2019-08-20', 55000, 'No'),
    (1, '2021-02-10', 70000, 'Yes'),
    (2, '2019-02-10', 48000, 'No'),
    (2, '2020-05-15', 52000, 'Yes'),
    (2, '2023-01-25', 68000, 'Yes'),
    (3, '2017-09-20', 60000, 'No'),
    (3, '2019-12-10', 65000, 'No'),
    (3, '2022-06-30', 72000, 'Yes'),
    (4, '2020-01-05', 45000, 'No'),
    (4, '2021-07-18', 49000, 'No'),
    (5, '2016-07-30', 55000, 'No'),
    (5, '2018-11-22', 62000, 'Yes'),
    (5, '2021-09-10', 75000, 'Yes'),
    (6, '2016-06-30', 55000, 'No'),
    (6, '2017-11-22', 50000, 'No'),
    (6, '2018-11-22', 40000, 'No'),
    (6, '2021-09-10', 75000, 'Yes'),
]

# Parse the text dates into datetime.date values and widen the integer
# salaries to float so the rows line up with DateType / DoubleType.
salary_data_typed = [
    (
        employee_id,
        datetime.strptime(changed_on, "%Y-%m-%d").date(),
        float(amount),
        promoted,
    )
    for employee_id, changed_on, amount, promoted in salary_data
]

# Materialise the DataFrame and expose it to Spark SQL as `salary_history`.
salary_df = spark.createDataFrame(salary_data_typed, schema=salary_schema)
salary_df.createOrReplaceTempView("salary_history")


In [5]:
# Sanity check: display the rows registered under the salary_history temp view.
spark.sql(
    """
    select * from salary_history
"""
).show()

+-----------+-----------+-------+---------+
|employee_id|change_date| salary|promotion|
+-----------+-----------+-------+---------+
|          1| 2018-06-15|50000.0|       No|
|          1| 2019-08-20|55000.0|       No|
|          1| 2021-02-10|70000.0|      Yes|
|          2| 2019-02-10|48000.0|       No|
|          2| 2020-05-15|52000.0|      Yes|
|          2| 2023-01-25|68000.0|      Yes|
|          3| 2017-09-20|60000.0|       No|
|          3| 2019-12-10|65000.0|       No|
|          3| 2022-06-30|72000.0|      Yes|
|          4| 2020-01-05|45000.0|       No|
|          4| 2021-07-18|49000.0|       No|
|          5| 2016-07-30|55000.0|       No|
|          5| 2018-11-22|62000.0|      Yes|
|          5| 2021-09-10|75000.0|      Yes|
|          6| 2016-06-30|55000.0|       No|
|          6| 2017-11-22|50000.0|       No|
|          6| 2018-11-22|40000.0|       No|
|          6| 2021-09-10|75000.0|      Yes|
+-----------+-----------+-------+---------+



In [6]:
# Sanity check: display the rows registered under the employees temp view.
spark.sql(
    """
    select * from employees
"""
).show()

+-----------+-------+----------+----------+
|employee_id|   name| join_date|department|
+-----------+-------+----------+----------+
|          1|  Alice|2018-06-15|        IT|
|          2|    Bob|2019-02-10|   Finance|
|          3|Charlie|2017-09-20|        HR|
|          4|  David|2020-01-05|        IT|
|          5|    Eve|2016-07-30|   Finance|
|          6|  Sumit|2016-06-30|   Finance|
+-----------+-------+----------+----------+



In [32]:
# Per-employee salary analysis over the `salary_history` temp view.
#
# The final SELECT reports, for every employee, whether the salary ever dropped
# from one change record to the next (NeverDecreased = 'Yes' / 'No'), ordered by
# employee_id so the printed table is stable between runs.
#
# NOTE(review): the CTEs latest_salary, promotion and max_salary_cte are defined
# but not referenced by the final SELECT. They are kept as building blocks;
# join them into the final SELECT or delete them once the analysis is settled.
spark.sql(
"""
        with cte as (
            -- rn = 1 marks each employee's most recent salary record.
            -- row_number() yields exactly one rn = 1 row per employee even if two
            -- changes share a date (rank() would give both rows rn = 1).
            select *,
                   row_number() over (partition by employee_id order by change_date desc) as rn
            from salary_history
        )

        , latest_salary as (
            select employee_id, salary as latest_salary
            from cte
            where rn = 1
        )

        , promotion as (
            -- Conditional count instead of a WHERE filter, so employees without
            -- any promotion still get a row (no_of_promotions = 0).
            select employee_id,
                   count(case when promotion = 'Yes' then 1 end) as no_of_promotions
            from cte
            group by employee_id
        )

        , previous_salary_cte as (
            -- previous_salary is the salary of the chronologically preceding record;
            -- it is NULL on each employee's first record.
            select *,
                   lag(salary, 1) over (partition by employee_id order by change_date) as previous_salary
            from cte
        )

        , max_salary_cte as (
            -- Largest percentage raise per employee. decimal(10,2) leaves room for
            -- raises of 100% or more (decimal(4,2) tops out at 99.99), and nullif
            -- avoids dividing by a zero previous salary.
            select employee_id,
                   max(cast((salary - previous_salary) * 100.0 / nullif(previous_salary, 0) as decimal(10,2))) as max_salary_growth
            from previous_salary_cte
            group by employee_id
        )

        -- An employee is flagged 'No' as soon as one record is lower than the one
        -- before it. Comparisons against a NULL previous_salary (first record) do
        -- not count as a decrease.
        select employee_id,
               case
                   when count(case when salary < previous_salary then 1 end) > 0 then 'No'
                   else 'Yes'
               end as NeverDecreased
        from previous_salary_cte
        group by employee_id
        order by employee_id
""").show()

+-----------+--------------+
|employee_id|NeverDecreased|
+-----------+--------------+
|          1|           Yes|
|          2|           Yes|
|          3|           Yes|
|          4|           Yes|
|          5|           Yes|
|          6|            No|
+-----------+--------------+

