In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below, registered under the
# application name "DataxBootcamp".
spark = (
    SparkSession.builder
    .appName("DataxBootcamp")
    .getOrCreate()
)


In [0]:
# A small example for lazy evaluation.
# Each row is (Name, Department, Salary). The rows are held in a list rather
# than a set: a set has no defined iteration order and silently collapses
# duplicated rows, so the DataFrame could differ from what is written here.
data = [
    ("Alice", "IT", 50000),
    ("Bob", "HR", 40000),
    ("Charlie", "IT", 60000),
    ("David", "HR", 55000),
    ("Eve", "IT", 70000),
]

# Column names, in the same order as the tuple fields above.
column = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, column)

In [0]:
# Lazy evaluation in practice: a Spark job only runs once the DataFrame is
# rendered with display().
# NOTE(review): display() is not defined in this notebook; presumably the
# notebook environment provides it - confirm before running elsewhere.

display(df)

In [0]:
# Select on the DataFrame created above: keep only the Salary and Name
# columns, in that order.
salary_and_name = df.select("Salary", "Name")
display(salary_and_name)

In [0]:
# Method 1 - reference the column through attribute access (df.Salary)
# and keep only the rows with salary > 50000.
over_50k_by_attribute = df.Salary > 50000
display(df.filter(over_50k_by_attribute))

In [0]:
# Method 2 - the same salary > 50000 filter, with the column looked up
# by name using bracket notation (df['Salary']).
over_50k_by_key = df['Salary'] > 50000
display(df.filter(over_50k_by_key))

In [0]:
# Filter people who are in HR and whose salary > 30000.
# Each comparison is built on its own before being combined with `&`,
# because `&` binds more tightly than `==` and `>` in Python.
in_hr = df['Department'] == 'HR'
earns_over_30k = df['Salary'] > 30000

display(df.filter(in_hr & earns_over_30k))

In [0]:
# Create a derived column - a column computed from another column.
from pyspark.sql.functions import col

# Give a 10% bonus to everyone, stored in a "Bonus" column.
bonus_amount = col("Salary") * 0.1
df = df.withColumn("Bonus", bonus_amount)
display(df)

In [0]:
# Group by and aggregation:
# group the rows by department and take the average salary of each group.
avg_salary_by_dept = df.groupBy("Department").avg("Salary")
avg_salary_by_dept.show()

In [0]:
# Apply multiple aggregations to each department group in one agg() call.

# Import the functions module under the conventional alias `F` rather than
# importing min/max by name: `from pyspark.sql.functions import min, max`
# would replace Python's built-in min() and max() for every later cell.
from pyspark.sql import functions as F

display(
    df.groupBy("Department").agg(
        F.min("Salary").alias("Min_salary"),
        F.max("Salary").alias("Max_salary"),
        F.avg("Salary").alias("Avg_salary"),
    )
)

In [0]:
# Join two Spark DataFrames - step 1: create the second DataFrame.

# Rows are (Department, Dept_id). A list is used instead of a set so the
# rows keep the order written here and a duplicated row is never dropped.
dept_data = [("HR", 101), ("IT", 102)]
dept_column = ["Department", "Dept_id"]
df_dept = spark.createDataFrame(dept_data, dept_column)
display(df_dept)

In [0]:
# Inner-join df with df_dept on the "Department" column they share.
df_joined = df.join(
    df_dept,
    on="Department",
    how="inner",
)
display(df_joined)

In [0]:
# Transformation vs. action

# Transformation - only describes the operation to perform.
salary_over_40k = df["Salary"] > 40000
filtered = df.filter(salary_over_40k)

# Action - the result becomes visible only when an action such as show()
# is called.
filtered.show()

Student practice assignment

📂 Build a DataFrame with the following schema: Name, Department, Salary, Location

In [0]:
# Sample data for the practice tasks.
# Column names, in the same order as the fields of each row tuple below.
columns = ["Name", "Department", "Salary", "Location"]

data = [
    ("Anurag", "IT", 70000, "Bangalore"),
    ("Priya", "HR", 55000, "Mumbai"),
    ("Ravi", "Finance", 65000, "Delhi"),
    ("Sneha", "IT", 60000, "Hyderabad"),
]

df_task = spark.createDataFrame(data, schema=columns)
display(df_task)

In [0]:
# Task 1: filter IT employees with salary > 60K.
is_it = df_task.Department == "IT"
earns_over_60k = df_task.Salary > 60000
display(df_task.filter(is_it & earns_over_60k))


In [0]:
# Task 2: add column "Hike_Amount" as 15% of salary.
from pyspark.sql.functions import col

# The column is named "Hike_Amount" (underscore, no space) exactly as the
# task states, so the alternative approach in the next cell overwrites this
# column instead of adding a second, near-duplicate one.
df_task = df_task.withColumn("Hike_Amount", df_task.Salary * 0.15)
display(df_task)

In [0]:
# Another approach: build the 15% hike from a col("Salary") expression
# instead of going through the DataFrame attribute.
hike_amount = col("Salary") * 0.15
df_task = df_task.withColumn("Hike_Amount", hike_amount)
df_task.show()

In [0]:
# Task 3: group by Department and show the average salary of each group.
avg_salary_by_department = df_task.groupby("Department").avg("Salary")
avg_salary_by_department.show()