In [0]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("EmployeeData").getOrCreate()
spark


In [0]:
data = [
("Ananya", "HR", 52000),
("Rahul", "Engineering", 65000),
("Priya", "Engineering", 60000),
("Zoya", "Marketing", 48000),
("Karan", "HR", 53000),
("Naveen", "Engineering", 70000),
("Fatima", "Marketing", 45000)
]
columns = ["Name", "Department", "Salary"]
empdf = spark.createDataFrame(data, columns)
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
perfdf = spark.createDataFrame(performance, columns_perf)


In [0]:
# GroupBy and Aggregations
# 1. Get the average salary by department.
# 2. Count of employees per department.
# 3. Maximum and minimum salary in Engineering.
from pyspark.sql.functions import col, max, min 
empdf.groupBy("Department").avg("Salary").show()
empdf.groupBy("Department").count().show()
empdf.filter(col("Department") == "Engineering").agg(max("Salary").alias("MaxSalary"), min("Salary").alias("MinSalary")).show()


+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|         HR|    52500.0|
|Engineering|    65000.0|
|  Marketing|    46500.0|
+-----------+-----------+

+-----------+-----+
| Department|count|
+-----------+-----+
|         HR|    2|
|Engineering|    3|
|  Marketing|    2|
+-----------+-----+

+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    70000|    60000|
+---------+---------+



In [0]:
# Join and Combine Data
# 4. Perform an inner join between employee_data and performance_data on Name .
# 5. Show each employee’s salary and performance rating.
# 6. Filter employees with rating > 4.5 and salary > 60000.
from pyspark.sql.functions import col
joined = empdf.join(perfdf, on="Name", how="inner")
joined.show()
joined.select("Name", "Salary", "Rating").show()
joined.filter((col("Rating") > 4.5) & (col("Salary") > 60000)).show()

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|         HR| 52000|2023|   4.5|
|Fatima|  Marketing| 45000|2023|   3.9|
| Karan|         HR| 53000|2023|   4.1|
|Naveen|Engineering| 70000|2023|   4.7|
| Priya|Engineering| 60000|2023|   4.3|
| Rahul|Engineering| 65000|2023|   4.9|
|  Zoya|  Marketing| 48000|2023|   3.8|
+------+-----------+------+----+------+

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



In [0]:
# Window & Rank (Bonus Challenge)
# 7. Rank employees by salary department-wise.
# 8. Calculate cumulative salary in each department.
from pyspark.sql.functions import rank,_sum
from pyspark.sql.window import Window
rank = Window.partitionBy("Department").orderBy(col("Salary").desc())
empdf.withColumn("Rank", rank().over(rank)).show()
cumulative = Window.partitionBy("Department").orderBy("Salary").rowsBetween(Window.unboundedPreceding, Window.currentRow)
empdf.withColumn("CumulativeSalary", _sum("Salary").over(cumulative)).show()



+------+-----------+------+----+
|  Name| Department|Salary|Rank|
+------+-----------+------+----+
|Naveen|Engineering| 70000|   1|
| Rahul|Engineering| 65000|   2|
| Priya|Engineering| 60000|   3|
| Karan|         HR| 53000|   1|
|Ananya|         HR| 52000|   2|
|  Zoya|  Marketing| 48000|   1|
|Fatima|  Marketing| 45000|   2|
+------+-----------+------+----+

+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
| Priya|Engineering| 60000|           60000|
| Rahul|Engineering| 65000|          125000|
|Naveen|Engineering| 70000|          195000|
|Ananya|         HR| 52000|           52000|
| Karan|         HR| 53000|          105000|
|Fatima|  Marketing| 45000|           45000|
|  Zoya|  Marketing| 48000|           93000|
+------+-----------+------+----------------+



In [0]:
# Date Operations
# 9. Add a new column JoinDate with random dates between 2020 and 2023.
# 10. Add column YearsWithCompany using current_date() and datediff() .
from pyspark.sql.functions import current_date, datediff, monotonically_increasing_id
import random
from datetime import date, timedelta

def random_date():
    start_date = date(2020, 1, 1)
    end_date = date(2023, 12, 31)
    delta = end_date - start_date
    random_days = random.randint(0, delta.days)
    return start_date + timedelta(days=random_days)
 
 
join_dates = [(random_date(),) for _ in range(empdf.count())]
date_df = spark.createDataFrame(join_dates, ["JoinDate"]).withColumn("id", monotonically_increasing_id())
df_emp_id = empdf.withColumn("id", monotonically_increasing_id())
df_emp_with_date = df_emp_id.join(date_df, on="id").drop("id")

df_emp_with_date.show()



+------+-----------+------+----------+
|  Name| Department|Salary|  JoinDate|
+------+-----------+------+----------+
|Ananya|         HR| 52000|2023-10-19|
| Rahul|Engineering| 65000|2023-01-02|
| Priya|Engineering| 60000|2020-10-07|
|  Zoya|  Marketing| 48000|2020-04-07|
| Karan|         HR| 53000|2022-09-30|
|Naveen|Engineering| 70000|2020-07-13|
|Fatima|  Marketing| 45000|2021-01-23|
+------+-----------+------+----------+



In [0]:
# Writing to Files
# 11. Write the full employee DataFrame to CSV with headers.
# 12. Save the joined DataFrame to a Parquet file.

empdf.write.option("header", True).csv("output/employee_data_csv", mode="overwrite")
joined.write.parquet("output/employee_performance_parquet", mode="overwrite")


