**Initialization of SparkSession**

In [0]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook via the builder's getOrCreate(),
# registering it under the application name "Employee Data Analysis".
spark = (
    SparkSession.builder
    .appName("Employee Data Analysis")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session object.
spark


**Creating Data Manually**

In [0]:
# Hand-written employee records; each tuple is (Name, Department, Salary).
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
# Column names, in the same order as the tuple fields above.
columns = ["Name", "Department", "Salary"]
# Build the DataFrame from the records; only column names are supplied,
# so the column types are left for Spark to infer from the values.
df = spark.createDataFrame(data, schema=columns)

**Exercise Set 1: Basics**

In [0]:
# 1. Show every record in the DataFrame.
df.show()

# 2. Print the column names and types.
df.printSchema()

# 3. Count the employees (one row per employee) and report the total.
num_employees = df.count()
print("Total employees:", num_employees)

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

Total employees: 7


**Exercise Set 2: Column Operations**

In [0]:
from pyspark.sql.functions import col, round
# 4. Bonus = 15% of Salary, rounded to 2 decimal places.
#    Note: `round` here is the pyspark.sql.functions import from the line
#    above (it operates on Columns), not Python's builtin round().
bonus = df.withColumn(
    "Bonus",
    round(df["Salary"] * 0.15, 2),
)
bonus.show()

# 5. NetPay = Salary + Bonus, added on top of the frame that already has Bonus.
netpay = bonus.withColumn(
    "NetPay",
    bonus["Salary"] + bonus["Bonus"],
)
netpay.show()

+------+-----------+------+-------+
|  Name| Department|Salary|  Bonus|
+------+-----------+------+-------+
|Ananya|         HR| 52000| 7800.0|
| Rahul|Engineering| 65000| 9750.0|
| Priya|Engineering| 60000| 9000.0|
|  Zoya|  Marketing| 48000| 7200.0|
| Karan|         HR| 53000| 7950.0|
|Naveen|Engineering| 70000|10500.0|
|Fatima|  Marketing| 45000| 6750.0|
+------+-----------+------+-------+

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



**Exercise Set 3: Filtering and Conditions**

In [0]:
# 6. Employees whose Department is exactly "Engineering".
print("Employees from Engineering")
netpay.where(netpay["Department"] == "Engineering").show()

# 7. Employees whose Salary is strictly greater than 60000.
print("Employees with Salary > 60000")
netpay.where(netpay["Salary"] > 60000).show()

# 8. Employees in any department other than "Marketing".
print("Employees not in Marketing")
netpay.where(netpay["Department"] != "Marketing").show()

Employees from Engineering
+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+

Employees with Salary > 60000
+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+

Employees not in Marketing
+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+

**Exercise Set 4: Sorting and Limiting**

In [0]:
# 9. Top 3 earners: sort by Salary descending and display only 3 rows.
print("Top 3 highest paid employees")
netpay.sort("Salary", ascending=False).show(3)

# 10. Department ascending first, then Salary descending within each department.
print("Sort by Department ASC and Salary DESC")
netpay.sort(["Department", "Salary"], ascending=[True, False]).show()

Top 3 highest paid employees
+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
+------+-----------+------+-------+-------+
only showing top 3 rows

Sort by Department ASC and Salary DESC
+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Ananya|         HR| 52000| 7800.0|59800.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



**Exercise Set 5: String and Case Logic**

In [0]:
from pyspark.sql.functions import when, upper
# 11. Derive a Level label from Salary:
#       Salary > 60000            -> "Senior"
#       50000 <= Salary <= 60000  -> "Mid"   (between() is inclusive on both ends)
#       anything else             -> "Junior"
level = netpay.withColumn(
    "Level",
    when(netpay["Salary"] > 60000, "Senior")
    .when(netpay["Salary"].between(50000, 60000), "Mid")
    .otherwise("Junior"),
)
print("Add Level column based on Salary")
level.show()

# 12. Overwrite the Name column with its uppercase form.
final = level.withColumn("Name", upper(level["Name"]))
print("Converted all names to uppercase")
final.show()



Add Level column based on Salary
+------+-----------+------+-------+-------+------+
|  Name| Department|Salary|  Bonus| NetPay| Level|
+------+-----------+------+-------+-------+------+
|Ananya|         HR| 52000| 7800.0|59800.0|   Mid|
| Rahul|Engineering| 65000| 9750.0|74750.0|Senior|
| Priya|Engineering| 60000| 9000.0|69000.0|   Mid|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|Junior|
| Karan|         HR| 53000| 7950.0|60950.0|   Mid|
|Naveen|Engineering| 70000|10500.0|80500.0|Senior|
|Fatima|  Marketing| 45000| 6750.0|51750.0|Junior|
+------+-----------+------+-------+-------+------+

Converted all names to uppercase
+------+-----------+------+-------+-------+------+
|  Name| Department|Salary|  Bonus| NetPay| Level|
+------+-----------+------+-------+-------+------+
|ANANYA|         HR| 52000| 7800.0|59800.0|   Mid|
| RAHUL|Engineering| 65000| 9750.0|74750.0|Senior|
| PRIYA|Engineering| 60000| 9000.0|69000.0|   Mid|
|  ZOYA|  Marketing| 48000| 7200.0|55200.0|Junior|
| KARAN|       