In [1]:
import sys
import os
# Make the project root importable so the `from utils import Utils` below resolves.
# PYTHONPATH may hold several directories joined by os.pathsep, so each one is
# added as its own sys.path entry (the joined string is not a valid entry).
# Empty segments and entries already present are skipped, which keeps the cell
# idempotent when it is re-run in the same kernel. Falls back to "/app" when
# PYTHONPATH is not set.
for _path_entry in os.getenv("PYTHONPATH", "/app").split(os.pathsep):
    if _path_entry and _path_entry not in sys.path:
        sys.path.append(_path_entry)
from utils import Utils

In [2]:
# Instantiate the project helper class imported from the local `utils` module.
utils = Utils()
# Obtain the Spark session through the helper.
# NOTE(review): presumably creates or reuses a SparkSession — confirm in Utils.
spark = utils.get_spark_session()
# Load the employee DataFrame that the following cells transform.
# NOTE(review): later cells read the columns `gender`, `name` and `hire_date`
# from it — confirm the schema in Utils.get_employee_data().
emp = utils.get_employee_data()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/06 20:14:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# The PySpark SQL functions reference is available at: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html

In [None]:
# SQL-style CASE WHEN logic is written in Spark by chaining when() calls and
# closing the chain with otherwise(); rows matching no branch get null here.
from pyspark.sql.functions import col, when

emp.withColumn(
    "gender_classified",
    when(col("gender") == "Male", "M")
    .when(col("gender") == "Female", "F")
    .otherwise(None),
).show()

In [None]:
# The same classification, this time written as a SQL CASE expression and
# parsed by expr(). The result is kept in `emp_gender` for the null-handling
# cells further down.
from pyspark.sql.functions import expr

emp_gender = emp.withColumn(
    "gender_classified",
    expr("case when gender = 'Male' then 'M' when gender = 'Female' then 'F' else null end"),
)
emp_gender.show()

In [None]:
# String replacement is done with regexp_replace(): every match of the pattern
# ("J") in the `name` column is replaced by "Z".
from pyspark.sql.functions import regexp_replace

(
    emp
    .withColumn("altered_name", regexp_replace(col("name"), "J", "Z"))
    .show()
)

In [None]:
# to_date() parses a string column into a 'date' column; it takes the column
# object and the pattern the strings are written in (here yyyy-MM-dd).
from pyspark.sql.functions import to_date

emp_date = emp.withColumn(
    "dateColumn",
    to_date(col("hire_date"), "yyyy-MM-dd"),
)
emp_date.show()
# The schema shows the type of the newly added column.
emp_date.printSchema()

In [None]:
# date_format() renders a date as a string in the requested pattern: first the
# full date as dd/MM/yyyy, then only the year.
from pyspark.sql.functions import date_format

(
    emp
    .withColumn("date_format", date_format(col("hire_date"), "dd/MM/yyyy"))
    .withColumn("year_date", date_format(col("hire_date"), "yyyy"))
    .show()
)

In [None]:
# current_date() and current_timestamp() return the current date and the
# current timestamp as columns.
# Passing truncate=False to show() prints each cell value in full instead of
# cutting long values short, so the whole timestamp stays visible.
from pyspark.sql.functions import current_date, current_timestamp

(
    emp
    .withColumn("current_date", current_date())
    .withColumn("current_timestamp", current_timestamp())
    .show(5, truncate=False)
)

In [None]:
# Null handling lives under the DataFrame's `na` accessor, which exposes
# further functions. drop() is one of them: it removes every row that has a
# null in any column.
rows_without_nulls = emp_gender.na
rows_without_nulls.drop().show()

In [None]:
# coalesce() returns the first non-null value among its arguments, so nulls can
# be replaced by a default (here the literal "-") instead of dropping the rows.
from pyspark.sql.functions import coalesce, col, lit

emp_gender.withColumn(
    "gender_classified",
    coalesce(col("gender_classified"), lit("-")),
).show()