In [0]:
# Load the retail orders dataset (JSON files) into a DataFrame.
orders = spark.read.format('json').load('/public/retail_db_json/orders')

In [0]:
# UDF (User Defined Function)

# Register the function under a SQL name. `returnType` defaults to string, so
# 'int' is requested explicitly; otherwise the Python int ends up as a string.
# The `None` guard keeps the UDF from raising TypeError on null order dates.
dc = spark.udf.register(
    'date_convert',
    lambda d: int(d[:10].replace('-', '')) if d is not None else None,
    'int',
)

# Usage in DataFrame syntax: call the object returned by register()
orders.select(dc('order_date').alias('order_date')).show(n=5)

# Usage in SQL (the same registered name also works in selectExpr).
# The DataFrame has to be exposed as a temp view before SQL can reference it.
orders.createOrReplaceTempView('orders')
spark.sql("""
    SELECT o.*, date_convert(order_date) AS order_date_as_int
    FROM orders AS o
""").show(n=5, truncate=False)

In [0]:
# New Spark-friendly functions are defined with .register(name, func, returnType).
# returnType defaults to string, so 'int' is passed to get an integer column.
# Null inputs are returned as null instead of raising TypeError inside the UDF.

dc = spark.udf.register(
    'date_convert',
    lambda d: int(d[:10].replace('-', '')) if d is not None else None,
    'int',
)

In [0]:
# In DataFrame syntax the UDF is called through the Python variable
# returned by register() (here: dc), not through its registered SQL name.

orders.show(5, False)
orders.select(dc(orders['order_date']).alias('order_date')).show(5)

+-----------------+---------------------+--------+---------------+
|order_customer_id|order_date           |order_id|order_status   |
+-----------------+---------------------+--------+---------------+
|11599            |2013-07-25 00:00:00.0|1       |CLOSED         |
|256              |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111            |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827             |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318            |2013-07-25 00:00:00.0|5       |COMPLETE       |
+-----------------+---------------------+--------+---------------+
only showing top 5 rows

+----------+
|order_date|
+----------+
|  20130725|
|  20130725|
|  20130725|
|  20130725|
|  20130725|
+----------+
only showing top 5 rows



In [0]:
# In SQL expressions the UDF is referenced by its registered name instead.

# Expose the DataFrame as a temp view so the query can refer to it by name.
orders.createOrReplaceTempView('orders')
orders_with_converted_date = spark.sql("""
    SELECT o.*, date_convert(order_date) AS order_date_as_int
    FROM orders AS o
""")
orders_with_converted_date.show(5, False)

+-----------------+---------------------+--------+---------------+-----------------+
|order_customer_id|order_date           |order_id|order_status   |order_date_as_int|
+-----------------+---------------------+--------+---------------+-----------------+
|11599            |2013-07-25 00:00:00.0|1       |CLOSED         |20130725         |
|256              |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|20130725         |
|12111            |2013-07-25 00:00:00.0|3       |COMPLETE       |20130725         |
|8827             |2013-07-25 00:00:00.0|4       |CLOSED         |20130725         |
|11318            |2013-07-25 00:00:00.0|5       |COMPLETE       |20130725         |
+-----------------+---------------------+--------+---------------+-----------------+
only showing top 5 rows

