In [0]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (application name "FirstNotebook").
spark = (
    SparkSession.builder
    .appName("FirstNotebook")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session object.
spark


In [0]:
from pyspark.sql import SparkSession

# Sample employee records; each tuple is (ID, Name, Department, Salary).
data = [
    (1, "Alice", "Engineering", 70000),
    (2, "Bob", "HR", 48000),
    (3, "Charlie", "Marketing", 52000),
    (4, "David", "Engineering", 63000),
    (5, "Eve", "Marketing", 45000),
    (6, "Frank", "Engineering", 59000),
]
columns = ["ID", "Name", "Department", "Salary"]

# Only column names are supplied here; no explicit column types are given.
df = spark.createDataFrame(data, schema=columns)

# Exercise Set 1: Basics

# 1. Display all records in the DataFrame.
df.show()

# 2. Print the schema of the DataFrame.
df.printSchema()

# 3. Count total number of employees.
#    Left as the cell's last expression so the notebook displays the value.
df.count()


+---+-------+-----------+------+
| ID|   Name| Department|Salary|
+---+-------+-----------+------+
|  1|  Alice|Engineering| 70000|
|  2|    Bob|         HR| 48000|
|  3|Charlie|  Marketing| 52000|
|  4|  David|Engineering| 63000|
|  5|    Eve|  Marketing| 45000|
|  6|  Frank|Engineering| 59000|
+---+-------+-----------+------+

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



6

In [0]:
from pyspark.sql.functions import col

# Exercise Set 2: Column Operations

# 4. Add a new column Bonus which is 15% of Salary.
#    The source frame `df` is not modified; `bonus` is a new DataFrame.
bonus = df.withColumn("Bonus", df["Salary"] * 0.15)
bonus.show()

# 5. Add a new column NetPay = Salary + Bonus.
#    Built on top of `bonus`, since that is the frame that carries the Bonus column.
netpay = bonus.withColumn("NetPay", bonus["Salary"] + bonus["Bonus"])
netpay.show()


+---+-------+-----------+------+-------+
| ID|   Name| Department|Salary|  Bonus|
+---+-------+-----------+------+-------+
|  1|  Alice|Engineering| 70000|10500.0|
|  2|    Bob|         HR| 48000| 7200.0|
|  3|Charlie|  Marketing| 52000| 7800.0|
|  4|  David|Engineering| 63000| 9450.0|
|  5|    Eve|  Marketing| 45000| 6750.0|
|  6|  Frank|Engineering| 59000| 8850.0|
+---+-------+-----------+------+-------+

+---+-------+-----------+------+-------+-------+
| ID|   Name| Department|Salary|  Bonus| NetPay|
+---+-------+-----------+------+-------+-------+
|  1|  Alice|Engineering| 70000|10500.0|80500.0|
|  2|    Bob|         HR| 48000| 7200.0|55200.0|
|  3|Charlie|  Marketing| 52000| 7800.0|59800.0|
|  4|  David|Engineering| 63000| 9450.0|72450.0|
|  5|    Eve|  Marketing| 45000| 6750.0|51750.0|
|  6|  Frank|Engineering| 59000| 8850.0|67850.0|
+---+-------+-----------+------+-------+-------+



In [0]:
# Exercise Set 3: Filtering and Conditions
# (`where` is an alias of `filter`; each statement prints its own result table.)

# 6. Display only employees from the "Engineering" department.
df.where(col("Department") == "Engineering").show()

# 7. Display employees whose salary is greater than 60000.
df.where(col("Salary") > 60000).show()

# 8. Display employees who are not in the "Marketing" department.
df.where(~(col("Department") == "Marketing")).show()


+---+-----+-----------+------+
| ID| Name| Department|Salary|
+---+-----+-----------+------+
|  1|Alice|Engineering| 70000|
|  4|David|Engineering| 63000|
|  6|Frank|Engineering| 59000|
+---+-----+-----------+------+

+---+-----+-----------+------+
| ID| Name| Department|Salary|
+---+-----+-----------+------+
|  1|Alice|Engineering| 70000|
|  4|David|Engineering| 63000|
+---+-----+-----------+------+

+---+-----+-----------+------+
| ID| Name| Department|Salary|
+---+-----+-----------+------+
|  1|Alice|Engineering| 70000|
|  2|  Bob|         HR| 48000|
|  4|David|Engineering| 63000|
|  6|Frank|Engineering| 59000|
+---+-----+-----------+------+



In [0]:
# Exercise Set 4: Sorting and Limiting

# 9. Show top 3 highest paid employees.
#    Sort by Salary descending and print only the first three rows.
df.orderBy("Salary", ascending=False).show(3)

# 10. Sort the data by Department ascending and Salary descending.
#     The `ascending` list pairs up positionally with the column list.
df.orderBy(["Department", "Salary"], ascending=[True, False]).show()


+---+-----+-----------+------+
| ID| Name| Department|Salary|
+---+-----+-----------+------+
|  1|Alice|Engineering| 70000|
|  4|David|Engineering| 63000|
|  6|Frank|Engineering| 59000|
+---+-----+-----------+------+
only showing top 3 rows

+---+-------+-----------+------+
| ID|   Name| Department|Salary|
+---+-------+-----------+------+
|  1|  Alice|Engineering| 70000|
|  4|  David|Engineering| 63000|
|  6|  Frank|Engineering| 59000|
|  2|    Bob|         HR| 48000|
|  3|Charlie|  Marketing| 52000|
|  5|    Eve|  Marketing| 45000|
+---+-------+-----------+------+



In [0]:
from pyspark.sql.functions import when, upper

# Exercise Set 5: String and Case Logic

# 11. Add a new column Level:
#       "Senior" if salary > 60000
#       "Mid"    if salary between 50000 and 60000 (both bounds inclusive)
#       "Junior" otherwise
#     Branches are evaluated in order, so the first matching condition wins.
level = df.withColumn(
    "Level",
    when(col("Salary") > 60000, "Senior")
    .when(col("Salary").between(50000, 60000), "Mid")
    .otherwise("Junior"),
)
level.show()

# 12. Convert all names to uppercase.
#     The original Name column is kept; the uppercase value goes into Name_Upper.
uppercase = df.withColumn("Name_Upper", upper(df["Name"]))
uppercase.show()


+---+-------+-----------+------+------+
| ID|   Name| Department|Salary| Level|
+---+-------+-----------+------+------+
|  1|  Alice|Engineering| 70000|Senior|
|  2|    Bob|         HR| 48000|Junior|
|  3|Charlie|  Marketing| 52000|   Mid|
|  4|  David|Engineering| 63000|Senior|
|  5|    Eve|  Marketing| 45000|Junior|
|  6|  Frank|Engineering| 59000|   Mid|
+---+-------+-----------+------+------+

+---+-------+-----------+------+----------+
| ID|   Name| Department|Salary|Name_Upper|
+---+-------+-----------+------+----------+
|  1|  Alice|Engineering| 70000|     ALICE|
|  2|    Bob|         HR| 48000|       BOB|
|  3|Charlie|  Marketing| 52000|   CHARLIE|
|  4|  David|Engineering| 63000|     DAVID|
|  5|    Eve|  Marketing| 45000|       EVE|
|  6|  Frank|Engineering| 59000|     FRANK|
+---+-------+-----------+------+----------+

