In [1]:
import findspark
findspark.init()
import pyspark
sc=pyspark.SparkContext(appName="MyAppName")
sc

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create Spark Session
spark = SparkSession.builder.appName("AssignmentOne").getOrCreate()


In [3]:
from pyspark.sql import Row

emp_data = [
    (1, "Amit", "Manager", "Pune", 50000, 10),
    (2, "Priya", "Analyst", "Mumbai", 25000, 20),
    (3, "Ravi", "Developer", "Delhi", 18000, 10),
    (4, "Sneha", "Manager", "Pune", 60000, 30),
    (5, "Karan", "Analyst", "Nagpur", 15000, 20)
]

emp_columns = ["eno", "ename", "designation", "city", "salary", "dno"]

emp_df = spark.createDataFrame(emp_data, emp_columns)

dept_data = [
    (10, "IT"),
    (20, "HR"),
    (30, "Finance")
]

dept_columns = ["dno", "dname"]

dept_df = spark.createDataFrame(dept_data, dept_columns)


In [4]:
# Print schema
emp_df.printSchema()
dept_df.printSchema()

root
 |-- eno: long (nullable = true)
 |-- ename: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- dno: long (nullable = true)

root
 |-- dno: long (nullable = true)
 |-- dname: string (nullable = true)



In [5]:
# Filter emp based on designation
emp_df.filter(col("designation") == "Analyst").show()

+---+-----+-----------+------+------+---+
|eno|ename|designation|  city|salary|dno|
+---+-----+-----------+------+------+---+
|  2|Priya|    Analyst|Mumbai| 25000| 20|
|  5|Karan|    Analyst|Nagpur| 15000| 20|
+---+-----+-----------+------+------+---+



In [6]:
# Filter emp based on salary
emp_df.filter(col("salary") > 20000).show()

+---+-----+-----------+------+------+---+
|eno|ename|designation|  city|salary|dno|
+---+-----+-----------+------+------+---+
|  1| Amit|    Manager|  Pune| 50000| 10|
|  2|Priya|    Analyst|Mumbai| 25000| 20|
|  4|Sneha|    Manager|  Pune| 60000| 30|
+---+-----+-----------+------+------+---+



In [7]:
# Show data of departments for female employees (assume ename ending with 'a' are female)
female_emp = emp_df.filter(col("ename").endswith("a"))
female_emp.show()

+---+-----+-----------+------+------+---+
|eno|ename|designation|  city|salary|dno|
+---+-----+-----------+------+------+---+
|  2|Priya|    Analyst|Mumbai| 25000| 20|
|  4|Sneha|    Manager|  Pune| 60000| 30|
+---+-----+-----------+------+------+---+



In [14]:
from pyspark.sql.functions import when, col

# Increase salary of managers by 10%
emp_df = emp_df.withColumn(
    "salary",
    when(col("designation") == "Manager", col("salary") * 1.1)  # +10% for Managers
    .otherwise(col("salary"))  # keep same for others
)

emp_df.show()


+---+-----+-----------+---------+-----------------+---+
|eno|ename|designation|     city|           salary|dno|
+---+-----+-----------+---------+-----------------+---+
|  1| Amit|    Manager|     Pune|55000.00000000001| 10|
|  2|Priya|    Analyst|   Mumbai|          25000.0| 20|
|  3| Ravi|  Developer|    Delhi|          18000.0| 10|
|  4|Sneha|    Manager|     Pune|          66000.0| 30|
|  5|Karan|    Analyst|   Nagpur|          15000.0| 20|
|  6|Rahul|     Tester|Bangalore|          22000.0| 10|
|  7|Meena|         HR|   Mumbai|          19000.0| 20|
|  8|Vikas|  Developer|     Pune|          17000.0| 30|
+---+-----+-----------+---------+-----------------+---+



In [10]:
# Add 3 more records
extra_data = [
    (6, "Rahul", "Tester", "Bangalore", 22000, 10),
    (7, "Meena", "HR", "Mumbai", 19000, 20),
    (8, "Vikas", "Developer", "Pune", 17000, 30)
]
extra_df = spark.createDataFrame(extra_data, emp_columns)
emp_df = emp_df.union(extra_df)

In [11]:
# Join employee and dept
emp_with_dept = emp_df.join(dept_df, "dno")
emp_with_dept.show()


+---+---+-----+-----------+---------+------+-------+
|dno|eno|ename|designation|     city|salary|  dname|
+---+---+-----+-----------+---------+------+-------+
| 10|  1| Amit|    Manager|     Pune| 50000|     IT|
| 10|  3| Ravi|  Developer|    Delhi| 18000|     IT|
| 10|  6|Rahul|     Tester|Bangalore| 22000|     IT|
| 20|  2|Priya|    Analyst|   Mumbai| 25000|     HR|
| 20|  5|Karan|    Analyst|   Nagpur| 15000|     HR|
| 20|  7|Meena|         HR|   Mumbai| 19000|     HR|
| 30|  4|Sneha|    Manager|     Pune| 60000|Finance|
| 30|  8|Vikas|  Developer|     Pune| 17000|Finance|
+---+---+-----+-----------+---------+------+-------+



In [12]:
# Department wise employees
emp_with_dept.groupBy("dname").count().show()

+-------+-----+
|  dname|count|
+-------+-----+
|     HR|    3|
|Finance|    2|
|     IT|    3|
+-------+-----+



In [13]:
# Employees with salary < 20000 and designation = 'Analyst'
emp_df.filter((col("salary") < 20000) & (col("designation") == "Analyst")).show()

+---+-----+-----------+------+------+---+
|eno|ename|designation|  city|salary|dno|
+---+-----+-----------+------+------+---+
|  5|Karan|    Analyst|Nagpur| 15000| 20|
+---+-----+-----------+------+------+---+

