In [3]:
import sys
import os

# Make the project-local modules importable from the notebook kernel.
# PYTHONPATH may hold several directories separated by os.pathsep, so each one is
# added on its own instead of appending the whole string as a single (invalid) entry.
# Entries already on sys.path are skipped so re-running this cell does not pile up duplicates.
# Falls back to "/app" when PYTHONPATH is not set.
for _search_dir in os.getenv("PYTHONPATH", "/app").split(os.pathsep):
    if _search_dir and _search_dir not in sys.path:
        sys.path.append(_search_dir)
from utils import Utils

In [4]:
# Instantiate the project helper, then use it to obtain the Spark session and the
# employee dataset that the rest of the notebook works with.
# NOTE(review): presumably get_spark_session() has to run before get_employee_data() — confirm in utils.
utils = Utils()
spark = utils.get_spark_session()
emp = utils.get_employee_data()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/07 18:59:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Two DataFrames are needed for the union examples, so the employee data is split in two:
# limit() takes the first ten rows and subtract() keeps the rows that are not in that first part.
# limit() on an unordered DataFrame is not guaranteed to return the same rows on every
# evaluation, so the data is ordered by employee_id first to make the split reproducible.
# NOTE: subtract() behaves like SQL EXCEPT DISTINCT, so fully duplicated rows in emp
# would be collapsed in emp_data_2.
emp_data_1 = emp.orderBy("employee_id").limit(10)
emp_data_2 = emp.subtract(emp_data_1)

emp_data_1.show()
emp_data_2.show()

In [None]:
# union() appends the rows of one DataFrame to another, like UNION ALL in SQL.
# Columns are matched by position, not by name, so both DataFrames must list their
# columns in the same order with compatible types.
union_emp = emp_data_1.union(emp_data_2)
union_emp.show()

In [None]:
# union() keeps every row from both inputs, including rows that appear in both
# (here emp_data_1 is unioned with itself, so every row shows up twice).
# distinct() then removes the duplicated rows:
emp_data_1_union = emp_data_1.union(emp_data_1)
emp_data_1_deduplicated = emp_data_1_union.distinct()
# emp_data_1_union.show()  # uncomment to inspect the duplicated rows
emp_data_1_deduplicated.show()

In [None]:
# unionByName() matches columns by name instead of by position, so the two
# DataFrames may list their columns in a different order.
reordered_columns = ["department_id", "gender", "age", "name", "salary", "hire_date", "employee_id"]
emp_data_2_unordered = emp_data_2.select(*reordered_columns)

# emp_data_1.union(emp_data_2_unordered).show()  # Wrong union: columns would be combined by position

emp_data_1.unionByName(emp_data_2_unordered).show()

In [None]:
# Sorting is done with orderBy(); calling .desc() or .asc() on a column picks the direction.
# NOTE(review): if salary is stored as a string the sort is lexicographic — confirm the schema.
from pyspark.sql.functions import col

salary_descending = col("salary").desc()
hire_date_ascending = col("hire_date").asc()

emp_data_1.orderBy(salary_descending).show()
emp_data_2.orderBy(hire_date_ascending).show()

In [None]:
# Aggregations are built in two steps: groupBy() on the grouping column(s), then agg()
# with one or more aggregate functions.
# Here count() gives the number of employees per department id.
# alias() renames the aggregated column; without it the column is named function(column_name).
from pyspark.sql.functions import count

employees_per_department = (
    union_emp
    .groupBy("department_id")
    .agg(count("employee_id").alias("number of employees"))
)
employees_per_department.orderBy(col("number of employees").desc()).show()

In [7]:
# DataFrame.count() (as opposed to the count() aggregate function) returns the total
# number of rows in the DataFrame.
union_emp.count()

20

In [None]:
# Optional: cast salary to double before aggregating.
# NOTE(review): presumably only needed if salary is not already numeric — confirm the schema.
# emp_salary_double = union_emp.withColumn("salary", col("salary").cast("double"))
# emp_salary_double.printSchema()

# Gotcha: without this import, sum() below would resolve to Python's built-in sum().
# The import shadows the built-in for the rest of the notebook.
from pyspark.sql.functions import sum

salary_by_department = union_emp.groupBy("department_id").agg(sum("salary").alias("department_salary"))
salary_by_department.show()


In [None]:
# An aggregated column can still be filtered with where() once agg() has produced it.
# Keep only the departments whose total salary is above 50000, highest total first:
(
    union_emp
    .groupBy("department_id")
    .agg(sum("salary").alias("department_salary"))
    .where("department_salary > 50000")
    .orderBy(col("department_salary").desc())
    .show()
)