In [0]:
from pyspark.sql import SparkSession
# Build (or reuse) the session for this notebook under the app name "Exercise_1".
spark = (
    SparkSession.builder
    .appName("Exercise_1")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session object.
spark

<pyspark.sql.connect.session.SparkSession at 0x7f3a9631b790>

## Dataset (Inline): 
employee_data

In [0]:
# Inline employee records, one (name, department, salary) tuple per employee.
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
columns = ["Name", "Department", "Salary"]

# Only column names are supplied; Spark infers the column types from the tuples.
df = spark.createDataFrame(data, schema=columns)

## PySpark Exercises
### Exercise Set 1: Basic

1.  Display all records in the DataFrame.

+----+------+------+
|Name|Salary| Bonus|
+----+------+------+
| Ali|  4500| 450.0|
|Neha|  5200| 520.0|
|John| 61000|6100.0|
+----+------+------+



In [0]:
# show() prints at most n rows (20 by default) and truncates long cell values;
# the defaults are spelled out here so the row cap is visible to the reader.
df.show(n=20, truncate=True)

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



2. Print the schema of the DataFrame.

In [0]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



3. Count the total number of employees.

In [0]:
# count() is an action: it runs the plan and returns the number of rows.
employee_count = df.count()
print(f"Total Number of Employees: {employee_count}")

Total Number of Employees: 7


### Exercise Set 2: Column Operations

4. Add a new column Bonus which is 15% of Salary.

In [0]:
# Bonus is a flat 15% of each employee's salary.
BONUS_RATE = 0.15
df = df.withColumn("Bonus", df["Salary"] * BONUS_RATE)
df.show()

+------+-----------+------+-------+
|  Name| Department|Salary|  Bonus|
+------+-----------+------+-------+
|Ananya|         HR| 52000| 7800.0|
| Rahul|Engineering| 65000| 9750.0|
| Priya|Engineering| 60000| 9000.0|
|  Zoya|  Marketing| 48000| 7200.0|
| Karan|         HR| 53000| 7950.0|
|Naveen|Engineering| 70000|10500.0|
|Fatima|  Marketing| 45000| 6750.0|
+------+-----------+------+-------+



5. Add a new column NetPay = Salary + Bonus.

In [0]:
# NetPay is the base salary plus the existing Bonus column.
net_pay = df["Salary"] + df["Bonus"]
df = df.withColumn("NetPay", net_pay)
df.show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



### Exercise Set 3: Filtering and Conditions

6. Display only employees from the “Engineering” department.

In [0]:
# Keep only rows whose Department is exactly "Engineering" (where() is an alias of filter()).
df.where(df["Department"] == "Engineering").show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



7. Display employees whose salary is greater than 60000.

In [0]:
# Strictly greater than 60000, so an employee earning exactly 60000 is excluded.
df.where(df["Salary"] > 60000).show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



8. Display employees who are not in the “Marketing” department.

In [0]:
# Keep every row whose Department differs from "Marketing".
df.where(df["Department"] != "Marketing").show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



### Exercise Set 4: Sorting and Limiting

9. Show the top 3 highest-paid employees.

In [0]:
from pyspark.sql.functions import desc

# Highest salary first, then keep only the first three rows.
top_paid = df.orderBy(desc("Salary")).limit(3)
top_paid.show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
+------+-----------+------+-------+-------+



10. Sort the data by Department ascending and Salary descending.

In [0]:
# Department A→Z; within each department, highest salary first.
df.orderBy(["Department", "Salary"], ascending=[True, False]).show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Ananya|         HR| 52000| 7800.0|59800.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



### Exercise Set 5: String and Case Logic

11. Add a new column \
Level: “Senior” if salary > 60000 \
        “Mid” if salary is between 50000 and 60000 (inclusive) \
        “Junior” otherwise

In [0]:
from pyspark.sql.functions import when

# Branches are checked top to bottom and the first match wins:
#   Salary > 60000            -> "Senior"
#   50000 <= Salary <= 60000  -> "Mid"
#   anything else             -> "Junior"
salary_col = df["Salary"]
level_expr = (
    when(salary_col > 60000, "Senior")
    .when(salary_col.between(50000, 60000), "Mid")  # between() includes both bounds
    .otherwise("Junior")
)
df = df.withColumn("Level", level_expr)
df.select("Name", "Department", "Salary", "Level").show()

+------+-----------+------+------+
|  Name| Department|Salary| Level|
+------+-----------+------+------+
|Ananya|         HR| 52000|   Mid|
| Rahul|Engineering| 65000|Senior|
| Priya|Engineering| 60000|   Mid|
|  Zoya|  Marketing| 48000|Junior|
| Karan|         HR| 53000|   Mid|
|Naveen|Engineering| 70000|Senior|
|Fatima|  Marketing| 45000|Junior|
+------+-----------+------+------+



12. Convert all names to uppercase.

In [0]:
from pyspark.sql.functions import upper

# Replace Name with its upper-cased value. Because `df` is rebound, any earlier
# cell re-run after this one will display the upper-cased names as well.
df = df.withColumn("Name", upper(df["Name"]))
df.show()

+------+-----------+------+-------+-------+------+
|  Name| Department|Salary|  Bonus| NetPay| Level|
+------+-----------+------+-------+-------+------+
|ANANYA|         HR| 52000| 7800.0|59800.0|   Mid|
| RAHUL|Engineering| 65000| 9750.0|74750.0|Senior|
| PRIYA|Engineering| 60000| 9000.0|69000.0|   Mid|
|  ZOYA|  Marketing| 48000| 7200.0|55200.0|Junior|
| KARAN|         HR| 53000| 7950.0|60950.0|   Mid|
|NAVEEN|Engineering| 70000|10500.0|80500.0|Senior|
|FATIMA|  Marketing| 45000| 6750.0|51750.0|Junior|
+------+-----------+------+-------+-------+------+

