In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session that the rest of the notebook runs against.
spark = (
    SparkSession.builder
    .appName("EmployeeSalesAnalysis")
    .getOrCreate()
)

# Field order of every record in `data`.
columns = ["EmpID", "Name", "Department", "UnitsSold", "UnitPrice"]

# One tuple per employee, laid out to match `columns`.
# NOTE(review): "E162" breaks the otherwise sequential E101..E105 IDs —
# possibly a typo for "E102"; confirm against the source data.
data = [
    ("E101", "Anit", "Sales", 10, 1200),
    ("E162", "Sneha", "Marketing", 8, 1500),
    ("E103", "Ravi", "Sales", 12, 1300),
    ("E104", "Anjali", "HR", 7, 1100),
    ("E105", "Raj", "Sales", 5, 1000),
]

# Only column names are supplied, so Spark infers each column's type
# from the Python values in `data`.
df = spark.createDataFrame(data, schema=columns)

# Preview the loaded rows.
df.show()


+-----+------+----------+---------+---------+
|EmpID|  Name|Department|UnitsSold|UnitPrice|
+-----+------+----------+---------+---------+
| E101|  Anit|     Sales|       10|     1200|
| E162| Sneha| Marketing|        8|     1500|
| E103|  Ravi|     Sales|       12|     1300|
| E104|Anjali|        HR|        7|     1100|
| E105|   Raj|     Sales|        5|     1000|
+-----+------+----------+---------+---------+



In [2]:
# Each row of `df` is one employee, so the row count is the headcount.
total_employees = df.count()

summary_line = f"Total number of employees: {total_employees}"
print(summary_line)


Total number of employees: 5


In [3]:
from pyspark.sql.functions import col
# TotalSales is the per-row product of UnitsSold and UnitPrice.
total_sales_expr = col("UnitsSold") * col("UnitPrice")

# Rebind `df` so every later cell sees the extra TotalSales column.
df = df.withColumn("TotalSales", total_sales_expr)
df.show()


+-----+------+----------+---------+---------+----------+
|EmpID|  Name|Department|UnitsSold|UnitPrice|TotalSales|
+-----+------+----------+---------+---------+----------+
| E101|  Anit|     Sales|       10|     1200|     12000|
| E162| Sneha| Marketing|        8|     1500|     12000|
| E103|  Ravi|     Sales|       12|     1300|     15600|
| E104|Anjali|        HR|        7|     1100|      7700|
| E105|   Raj|     Sales|        5|     1000|      5000|
+-----+------+----------+---------+---------+----------+



In [4]:
# Sales-department employees whose TotalSales is strictly above 12000
# (a total of exactly 12000 does not qualify).
in_sales = col("Department") == "Sales"
above_threshold = col("TotalSales") > 12000

df.filter(in_sales & above_threshold).show()


+-----+----+----------+---------+---------+----------+
|EmpID|Name|Department|UnitsSold|UnitPrice|TotalSales|
+-----+----+----------+---------+---------+----------+
| E103|Ravi|     Sales|       12|     1300|     15600|
+-----+----+----------+---------+---------+----------+



In [5]:
from pyspark.sql.functions import desc

# Show the single employee with the highest TotalSales.
# EmpID is a secondary sort key so that, if several employees share the top
# total, the row kept by limit(1) is the same on every run instead of
# depending on how Spark happens to order the tied rows.
df.orderBy(desc("TotalSales"), "EmpID").limit(1).show()


+-----+----+----------+---------+---------+----------+
|EmpID|Name|Department|UnitsSold|UnitPrice|TotalSales|
+-----+----+----------+---------+---------+----------+
| E103|Ravi|     Sales|       12|     1300|     15600|
+-----+----+----------+---------+---------+----------+



In [6]:
# Rank all employees from highest to lowest TotalSales.
# Two employees share a total of 12000, and sorting on TotalSales alone leaves
# their relative order unspecified; EmpID (ascending) breaks the tie so the
# ranking is reproducible across runs.
df.orderBy(col("TotalSales").desc(), col("EmpID").asc()).show()


+-----+------+----------+---------+---------+----------+
|EmpID|  Name|Department|UnitsSold|UnitPrice|TotalSales|
+-----+------+----------+---------+---------+----------+
| E103|  Ravi|     Sales|       12|     1300|     15600|
| E101|  Anit|     Sales|       10|     1200|     12000|
| E162| Sneha| Marketing|        8|     1500|     12000|
| E104|Anjali|        HR|        7|     1100|      7700|
| E105|   Raj|     Sales|        5|     1000|      5000|
+-----+------+----------+---------+---------+----------+

