In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark entry point for the rest of the notebook: getOrCreate()
# hands back an existing session when there is one, otherwise it starts one.
spark = (
    SparkSession.builder
    .getOrCreate()
)
spark  # last expression of the cell, so the session object is displayed

In [3]:
# Column names for the sample data. No types are given, so Spark infers them
# from the Python values in each tuple.
columns = ["Name", "Department", "Salary"]

# One tuple per employee, with fields in the same order as `columns`.
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]

# Base DataFrame that every exercise below starts from.
df = spark.createDataFrame(data, schema=columns)

Exercise Set 1: Basics

In [4]:
# 1. Display all records in the DataFrame.
# Prints the DataFrame as a text table; all 7 sample rows appear in the output.
df.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



In [5]:
# 2. Print the schema of the DataFrame.
# Prints each column's name, type and nullability as a tree. The types were
# inferred from the sample tuples, since only column names were supplied.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



In [6]:
# 3. Count total number of employees.
# Each row is one employee, so the row count is the head count. As the last
# expression in the cell, the returned number is displayed.
df.count()

7

Exercise Set 2: Column Operations

In [8]:
# 4. Add a new column Bonus which is 15% of Salary.
from pyspark.sql.functions import col

# `df` itself is left untouched; the result is a new DataFrame with one extra
# column. Multiplying the integer Salary by 0.15 gives a fractional Bonus.
df_bonus = df.withColumn(
    "Bonus",
    col("Salary") * 0.15,
)
df_bonus.show()

+------+-----------+------+-------+
|  Name| Department|Salary|  Bonus|
+------+-----------+------+-------+
|Ananya|         HR| 52000| 7800.0|
| Rahul|Engineering| 65000| 9750.0|
| Priya|Engineering| 60000| 9000.0|
|  Zoya|  Marketing| 48000| 7200.0|
| Karan|         HR| 53000| 7950.0|
|Naveen|Engineering| 70000|10500.0|
|Fatima|  Marketing| 45000| 6750.0|
+------+-----------+------+-------+



In [9]:
# 5. Add a new column NetPay = Salary + Bonus.
# Builds on df_bonus (not df) because the Bonus column only exists there.
df_netpay = df_bonus.withColumn(
    "NetPay",
    col("Salary") + col("Bonus"),
)
df_netpay.show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



Exercise Set 3: Filtering and Conditions

In [10]:
# 6. Display only employees from the "Engineering" department.
# where() is an alias of filter(); only rows whose Department equals the
# given string are kept.
df.where(col("Department") == "Engineering").show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|Naveen|Engineering| 70000|
+------+-----------+------+



In [11]:
# 7. Display employees whose salary is greater than 60000.
# Strictly greater than: an employee earning exactly 60000 is not included.
df.where(col("Salary") > 60000).show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
| Rahul|Engineering| 65000|
|Naveen|Engineering| 70000|
+------+-----------+------+



In [12]:
# 8. Display employees who are not in the "Marketing" department.
# Keeps every row whose Department differs from "Marketing".
df.where(col("Department") != "Marketing").show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
+------+-----------+------+



Exercise Set 4: Sorting and Limiting

In [13]:
# 9. Show top 3 highest paid employees.
# Sort by Salary from highest to lowest, then print only the first three rows.
df.orderBy("Salary", ascending=False).show(3)

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Naveen|Engineering| 70000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
+------+-----------+------+
only showing top 3 rows



In [14]:
# 10. Sort the data by Department ascending and Salary descending.
# One ascending flag per sort key, in order: Department ascending, then
# Salary descending within each department.
df.orderBy(
    ["Department", "Salary"],
    ascending=[True, False],
).show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Naveen|Engineering| 70000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
| Karan|         HR| 53000|
|Ananya|         HR| 52000|
|  Zoya|  Marketing| 48000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



Exercise Set 5: String and Case Logic

In [17]:
# 11. Add a new column Level:
#   "Senior" if salary > 60000
#   "Mid"    if salary between 50000 and 60000 (both bounds included)
#   "Junior" otherwise
from pyspark.sql.functions import when

# Branches are checked in order and the first match wins; rows that match
# neither condition fall through to otherwise().
df_level = df.withColumn(
    "Level",
    when(col("Salary") > 60000, "Senior")
    .when(col("Salary").between(50000, 60000), "Mid")  # inclusive on both ends
    .otherwise("Junior"),
)
df_level.show()

+------+-----------+------+------+
|  Name| Department|Salary| Level|
+------+-----------+------+------+
|Ananya|         HR| 52000|   Mid|
| Rahul|Engineering| 65000|Senior|
| Priya|Engineering| 60000|   Mid|
|  Zoya|  Marketing| 48000|Junior|
| Karan|         HR| 53000|   Mid|
|Naveen|Engineering| 70000|Senior|
|Fatima|  Marketing| 45000|Junior|
+------+-----------+------+------+



In [16]:
# 12. Convert all names to uppercase.
from pyspark.sql.functions import upper

# The original Name column is kept; the uppercase version is added beside it
# as Name_Upper. Starts from df_level, so the Level column is carried along.
df_upper = df_level.withColumn(
    "Name_Upper",
    upper(col("Name")),
)
df_upper.show()

+------+-----------+------+------+----------+
|  Name| Department|Salary| Level|Name_Upper|
+------+-----------+------+------+----------+
|Ananya|         HR| 52000|   Mid|    ANANYA|
| Rahul|Engineering| 65000|Senior|     RAHUL|
| Priya|Engineering| 60000|   Mid|     PRIYA|
|  Zoya|  Marketing| 48000|Junior|      ZOYA|
| Karan|         HR| 53000|   Mid|     KARAN|
|Naveen|Engineering| 70000|Senior|    NAVEEN|
|Fatima|  Marketing| 45000|Junior|    FATIMA|
+------+-----------+------+------+----------+

