In [None]:
Intermediate-Level (Aggregations & Joins)
Aggregations:
    
    Find the average salary per department.
    Find the highest and lowest salary in each department.
    Count the number of employees per department.

Joins (Create another DataFrame for department details like dept_id, dept_name, location):

    Join the employee DataFrame with department details based on dept_id.
    Find employees who belong to the "Finance" department.


Grouping & Sorting:

    Get the total salary expense per department.
    Sort employees by salary in descending order.
    Sort employees by department first, then by salary.

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col,avg,min,max,count
from pyspark.sql.window import Window

In [3]:
# Build the notebook's Spark session (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("aggrJoin")
    .getOrCreate()
)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Sample payroll rows, written grouped by department and then flattened
# into (department, employee, salary) tuples.
employee_data = [
    (dept, name, pay)
    for dept, staff in [
        ("HR", [("Alice", 60000), ("Bob", 75000), ("Charlie", 80000), ("David", 72000)]),
        ("IT", [("Eve", 90000), ("Frank", 85000), ("Grace", 95000), ("Hank", 87000)]),
        ("Finance", [("Ivy", 70000), ("Jack", 65000), ("Kevin", 72000), ("Liam", 71000)]),
    ]
    for name, pay in staff
]

# Column names, in the same order as the tuple fields above.
columns = ["department", "employee", "salary"]

In [6]:
# Build the employee DataFrame; only column names are supplied, so Spark
# infers the column types from the Python values.
emp_df = spark.createDataFrame(employee_data, schema=columns)

In [7]:
# Register emp_df as the temp view "employee_data" so the spark.sql(...)
# cells can query it by name.
emp_df.createOrReplaceTempView("employee_data")

In [None]:
Aggregations:
    
    Find the average salary per department.
    Find the highest and lowest salary in each department.
    Count the number of employees per department.

In [8]:
# Inspect the column names and the types Spark inferred for them.
emp_df.printSchema()

root
 |-- department: string (nullable = true)
 |-- employee: string (nullable = true)
 |-- salary: long (nullable = true)



In [10]:
# Peek at the first two rows (returned as a list of Row objects).
emp_df.head(2)

[Row(department='HR', employee='Alice', salary=60000),
 Row(department='HR', employee='Bob', salary=75000)]

In [15]:
# Find the average salary per department (Spark SQL against the
# "employee_data" temp view).

spark.sql("""
    select department,avg(salary) as avg_sal from employee_data group by department
""").show()

+----------+-------+
|department|avg_sal|
+----------+-------+
|        HR|71750.0|
|        IT|89250.0|
|   Finance|69500.0|
+----------+-------+



In [16]:
# Average salary per department, using the DataFrame API.

(
    emp_df
    .groupBy("department")
    .agg(avg("salary").alias("avg_sal"))
    .show()
)

+----------+-------+
|department|avg_sal|
+----------+-------+
|        HR|71750.0|
|        IT|89250.0|
|   Finance|69500.0|
+----------+-------+



In [19]:
# Find the highest and lowest salary in each department (Spark SQL against
# the "employee_data" temp view).
spark.sql("""
        select department,min(salary) as min_sal,max(salary) as max_sal from employee_data group by department
""").show()

+----------+-------+-------+
|department|min_sal|max_sal|
+----------+-------+-------+
|        HR|  60000|  80000|
|        IT|  85000|  95000|
|   Finance|  65000|  72000|
+----------+-------+-------+



In [22]:
# Highest and lowest salary in each department, using the DataFrame API.
# The aggregates are called through the `F` namespace so they cannot be
# mistaken for (or silently replaced by) Python's built-in min/max.
(
    emp_df
    .groupBy("department")
    .agg(
        F.min("salary").alias("min_sal"),
        F.max("salary").alias("max_sal"),
    )
    .show()
)

+----------+-------+-------+
|department|min_sal|max_sal|
+----------+-------+-------+
|        HR|  60000|  80000|
|        IT|  85000|  95000|
|   Finance|  65000|  72000|
+----------+-------+-------+



In [24]:
# Count the number of employees per department (Spark SQL against the
# "employee_data" temp view).
spark.sql("""
        select department,count(employee) as no_of_employee from employee_data group by department
""").show()

+----------+--------------+
|department|no_of_employee|
+----------+--------------+
|        HR|             4|
|        IT|             4|
|   Finance|             4|
+----------+--------------+



In [25]:
# Number of employees per department, using the DataFrame API.
(
    emp_df
    .groupBy("department")
    .agg(count("employee").alias("no_of_employee"))
    .show()
)

+----------+--------------+
|department|no_of_employee|
+----------+--------------+
|        HR|             4|
|        IT|             4|
|   Finance|             4|
+----------+--------------+



In [26]:
# Same per-department count, but without .alias(): the result column gets
# the generated name "count(employee)".
(
    emp_df
    .groupBy("department")
    .agg(count("employee"))
    .show()
)

+----------+---------------+
|department|count(employee)|
+----------+---------------+
|        HR|              4|
|        IT|              4|
|   Finance|              4|
+----------+---------------+



In [None]:
Grouping & Sorting:

    Get the total salary expense per department.
    Sort employees by salary in descending order.
    Sort employees by department first, then by salary.

In [27]:
# Get the total salary expense per department (Spark SQL against the
# "employee_data" temp view).
spark.sql("""
    select department,sum(salary) as expense from employee_data group by department
""").show()

+----------+-------+
|department|expense|
+----------+-------+
|        HR| 287000|
|        IT| 357000|
|   Finance| 278000|
+----------+-------+



In [31]:
from pyspark.sql.functions import sum

In [32]:
# Total salary expense per department, using the DataFrame API.
# F.sum is the Spark aggregate; going through the `F` namespace keeps it
# distinct from Python's built-in sum and makes this cell rely only on the
# notebook's top-level imports.
(
    emp_df
    .groupBy("department")
    .agg(F.sum("salary").alias("expense"))
    .show()
)

+----------+-------+
|department|expense|
+----------+-------+
|        HR| 287000|
|        IT| 357000|
|   Finance| 278000|
+----------+-------+



In [33]:
# Re-check the schema before sorting (salary is the column sorted on below).
emp_df.printSchema()

root
 |-- department: string (nullable = true)
 |-- employee: string (nullable = true)
 |-- salary: long (nullable = true)



In [34]:
# Sort employees by salary in descending order (Spark SQL against the
# "employee_data" temp view).
spark.sql("""
        select department,employee,salary from employee_data order by salary desc
""").show()

+----------+--------+------+
|department|employee|salary|
+----------+--------+------+
|        IT|   Grace| 95000|
|        IT|     Eve| 90000|
|        IT|    Hank| 87000|
|        IT|   Frank| 85000|
|        HR| Charlie| 80000|
|        HR|     Bob| 75000|
|        HR|   David| 72000|
|   Finance|   Kevin| 72000|
|   Finance|    Liam| 71000|
|   Finance|     Ivy| 70000|
|   Finance|    Jack| 65000|
|        HR|   Alice| 60000|
+----------+--------+------+



In [36]:
# Employees ordered from highest to lowest salary, using the DataFrame API.
(
    emp_df
    .orderBy(col("salary").desc())
    .show()
)

+----------+--------+------+
|department|employee|salary|
+----------+--------+------+
|        IT|   Grace| 95000|
|        IT|     Eve| 90000|
|        IT|    Hank| 87000|
|        IT|   Frank| 85000|
|        HR| Charlie| 80000|
|        HR|     Bob| 75000|
|        HR|   David| 72000|
|   Finance|   Kevin| 72000|
|   Finance|    Liam| 71000|
|   Finance|     Ivy| 70000|
|   Finance|    Jack| 65000|
|        HR|   Alice| 60000|
+----------+--------+------+



In [37]:
# Sort employees by department first, then by salary; no direction is
# given, so both keys sort ascending (Spark SQL against the
# "employee_data" temp view).
spark.sql("""
        select department,employee,salary from employee_data order by department,salary 
""").show()

+----------+--------+------+
|department|employee|salary|
+----------+--------+------+
|   Finance|    Jack| 65000|
|   Finance|     Ivy| 70000|
|   Finance|    Liam| 71000|
|   Finance|   Kevin| 72000|
|        HR|   Alice| 60000|
|        HR|   David| 72000|
|        HR|     Bob| 75000|
|        HR| Charlie| 80000|
|        IT|   Frank| 85000|
|        IT|    Hank| 87000|
|        IT|     Eve| 90000|
|        IT|   Grace| 95000|
+----------+--------+------+



In [38]:
# Employees ordered by department, then by salary within each department
# (both ascending), using the DataFrame API.
(
    emp_df
    .orderBy("department", "salary")
    .show()
)

+----------+--------+------+
|department|employee|salary|
+----------+--------+------+
|   Finance|    Jack| 65000|
|   Finance|     Ivy| 70000|
|   Finance|    Liam| 71000|
|   Finance|   Kevin| 72000|
|        HR|   Alice| 60000|
|        HR|   David| 72000|
|        HR|     Bob| 75000|
|        HR| Charlie| 80000|
|        IT|   Frank| 85000|
|        IT|    Hank| 87000|
|        IT|     Eve| 90000|
|        IT|   Grace| 95000|
+----------+--------+------+



In [None]:
Joins (Create another DataFrame for department details like dept_id, dept_name, location):

    Join the employee DataFrame with department details based on dept_id.
    Find employees who belong to the "Finance" department.

In [39]:
# Employee rows for the join exercises: (emp_id, name, dept_id, salary).
emp_columns_new = ["emp_id", "name", "dept_id", "salary"]
emp_data_new = [
    (1, "Alice", 101, 50000),
    (2, "Bob", 102, 60000),
    (3, "Charlie", 101, 70000),
    (4, "David", 103, 80000),
]
employees = spark.createDataFrame(emp_data_new, schema=emp_columns_new)

In [40]:
# Register the employees DataFrame as the temp view "employees" so it can
# be referenced by name from spark.sql(...).
employees.createOrReplaceTempView("employees")

In [41]:
# Inspect the employees schema; dept_id is the column used as the join key.
employees.printSchema()

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept_id: long (nullable = true)
 |-- salary: long (nullable = true)



In [42]:
# Department lookup rows: (dept_id, dept_name).
dept_columns = ["dept_id", "dept_name"]
dept_data = [
    (101, "HR"),
    (102, "IT"),
    (103, "Finance"),
    (104, "Marketing"),  # no employee row uses dept_id 104
]
departments = spark.createDataFrame(dept_data, schema=dept_columns)

In [43]:
# Inspect the departments schema; it shares the dept_id column name with
# the employees DataFrame.
departments.printSchema()

root
 |-- dept_id: long (nullable = true)
 |-- dept_name: string (nullable = true)



In [53]:
# Inner join on the shared column name: passing on="dept_id" yields a
# single dept_id column in the result.
(
    employees
    .join(departments, on="dept_id", how="inner")
    .show()
)

+-------+------+-------+------+---------+
|dept_id|emp_id|   name|salary|dept_name|
+-------+------+-------+------+---------+
|    101|     1|  Alice| 50000|       HR|
|    101|     3|Charlie| 70000|       HR|
|    102|     2|    Bob| 60000|       IT|
|    103|     4|  David| 80000|  Finance|
+-------+------+-------+------+---------+



In [48]:
# Department lookup rows again, but with the key column named
# "department_id" instead of "dept_id": (department_id, dept_name).
# Used to demonstrate joining on differently named key columns.
dept_columns_new = ["department_id", "dept_name"]
dept_data_new = [
    (101, "HR"),
    (102, "IT"),
    (103, "Finance"),
    (104, "Marketing"),  # no employee row uses department 104
]
departments_new = spark.createDataFrame(dept_data_new, schema=dept_columns_new)

In [50]:
# Confirm the key column is now called department_id.
departments_new.printSchema()

root
 |-- department_id: long (nullable = true)
 |-- dept_name: string (nullable = true)



In [51]:
# Inner join where the key columns are named differently on each side
# (dept_id vs. department_id), keeping only the columns of interest.
emp_dept_df = (
    employees
    .join(
        departments_new,
        on=employees.dept_id == departments_new.department_id,
        how="inner",
    )
    .select("emp_id", "name", "salary", "dept_name")
)

# Display the joined result.
emp_dept_df.show()

+------+-------+------+---------+
|emp_id|   name|salary|dept_name|
+------+-------+------+---------+
|     1|  Alice| 50000|       HR|
|     3|Charlie| 70000|       HR|
|     2|    Bob| 60000|       IT|
|     4|  David| 80000|  Finance|
+------+-------+------+---------+



In [54]:
  #Find employees who belong to the "Finance" department.
emp_dept_df_withoutSelect = employees.join(departments_new, employees.dept_id == departments_new.department_id, "inner") 

In [55]:
# Show the full join result; both key columns (dept_id and department_id)
# are present because no select was applied.
emp_dept_df_withoutSelect.show()

+------+-------+-------+------+-------------+---------+
|emp_id|   name|dept_id|salary|department_id|dept_name|
+------+-------+-------+------+-------------+---------+
|     1|  Alice|    101| 50000|          101|       HR|
|     3|Charlie|    101| 70000|          101|       HR|
|     2|    Bob|    102| 60000|          102|       IT|
|     4|  David|    103| 80000|          103|  Finance|
+------+-------+-------+------+-------------+---------+



In [57]:
# Keep only the joined rows whose dept_name is "Finance".
(
    emp_dept_df_withoutSelect
    .filter(col("dept_name") == "Finance")
    .show()
)

+------+-----+-------+------+-------------+---------+
|emp_id| name|dept_id|salary|department_id|dept_name|
+------+-----+-------+------+-------------+---------+
|     4|David|    103| 80000|          103|  Finance|
+------+-----+-------+------+-------------+---------+

