In [0]:
# Orders data set, read as CSV with an explicit schema (all dates kept as strings).
orders = spark.read.csv(
    "/public/retail_db/orders",
    schema="order_id INT, order_date STRING, order_customer_id INT, order_status STRING",
)

# Small helper frame (only the rows with order_id = 1): later cells call
# df.select(<expression>) on it just to evaluate and display an expression.
df = orders.select("order_id").filter("order_id = 1")

# Sample (date, timestamp) string pairs used by the date-function demos below.
datetimes = [
    ("2014-02-28", "2014-02-28 10:00:00.123"),
    ("2016-02-29", "2016-02-29 08:08:08.999"),
    ("2017-10-31", "2017-12-31 11:59:59.123"),
    ("2019-11-30", "2019-08-31 00:00:00.000"),
]
datetimesDF = spark.createDataFrame(
    datetimes,
    schema="date STRING, time STRING",
)
from pyspark.sql.functions import *

In [0]:
# current_timestamp(), current_date()
# Each expression is shown in its own single-column table; `df` is only the
# frame the expressions are evaluated on.
for now_expr in (current_timestamp(), current_date()):
    df.select(now_expr).show(truncate=False)

+----------------------+
|current_timestamp()   |
+----------------------+
|2025-04-18 17:53:58.27|
+----------------------+

+--------------+
|current_date()|
+--------------+
|2025-04-18    |
+--------------+



In [0]:
# to_date(), to_timestamp() - convert strings to dates/timestamps.
# Very convenient: we can provide strings in almost any format, as long as the
# second argument is the pattern that describes the input string.

parsed_date = to_date(lit('20250418'), 'yyyyMMdd').alias('to_date')
parsed_ts = to_timestamp(lit('20250418: 1940'), 'yyyyMMdd: HHmm').alias('to_timestamp')

df.select(parsed_date).show()
df.select(parsed_ts).show()

+----------+
|   to_date|
+----------+
|2025-04-18|
+----------+

+-------------------+
|       to_timestamp|
+-------------------+
|2025-04-18 19:40:00|
+-------------------+



In [0]:
# Arithmetic functions: shift a date by days/months and measure the distance
# between two dates (here: from each sample date to today).

(
    datetimesDF
    .withColumn("date_add_date", date_add("date", 10))        # + 10 days
    .withColumn("date_sub_date", date_sub("date", 10))        # - 10 days
    .withColumn("date_months_date", add_months("date", 10))   # + 10 months
    .withColumn("datediff_date", datediff(current_date(), "date"))              # whole days
    .withColumn("months_between_date", months_between(current_date(), "date"))  # fractional months
    .show()
)

+----------+-------------+-------------+----------------+-------------+-------------------+
|      date|date_add_date|date_sub_date|date_months_date|datediff_date|months_between_date|
+----------+-------------+-------------+----------------+-------------+-------------------+
|2014-02-28|   2014-03-10|   2014-02-18|      2014-12-28|         4067|       133.67741935|
|2016-02-29|   2016-03-10|   2016-02-19|      2016-12-29|         3336|       109.64516129|
|2017-10-31|   2017-11-10|   2017-10-21|      2018-08-31|         2726|        89.58064516|
|2019-11-30|   2019-12-10|   2019-11-20|      2020-09-30|         1966|        64.61290323|
+----------+-------------+-------------+----------------+-------------+-------------------+



In [0]:
# Trunc functions (getting the beginning of a month, year, hour, minute)
# trunc() - date as input and date as output
# date_trunc() - timestamp as input and timestamp as output; it can also
#                truncate to coarser units such as day or month.

(
    datetimesDF
    .withColumn("date_first_day_of_month", trunc("date", "MM"))
    .withColumn("date_first_day_of_year", trunc("date", "yy"))
    .withColumn("time_first_day_of_month", date_trunc("MM", "time"))
    # NOTE: despite the column name, "minute" truncates to the start of the
    # minute (seconds are zeroed), not to the first minute of the hour.
    .withColumn("time_first_minute_of_hour", date_trunc("minute", "time"))
    .show()
)

+----------+--------------------+-----------------------+----------------------+-----------------------+-------------------------+
|      date|                time|date_first_day_of_month|date_first_day_of_year|time_first_day_of_month|time_first_minute_of_hour|
+----------+--------------------+-----------------------+----------------------+-----------------------+-------------------------+
|2014-02-28|2014-02-28 10:00:...|             2014-02-01|            2014-01-01|    2014-02-01 00:00:00|      2014-02-28 10:00:00|
|2016-02-29|2016-02-29 08:08:...|             2016-02-01|            2016-01-01|    2016-02-01 00:00:00|      2016-02-29 08:08:00|
|2017-10-31|2017-12-31 11:59:...|             2017-10-01|            2017-01-01|    2017-12-01 00:00:00|      2017-12-31 11:59:00|
|2019-11-30|2019-08-31 00:00:...|             2019-11-01|            2019-01-01|    2019-08-01 00:00:00|      2019-08-31 00:00:00|
+----------+--------------------+-----------------------+----------------------+-----------------------+-------------------------+

In [0]:
# Extract functions: pull individual calendar fields out of a date.

today = current_date()  # displayed as yyyy-MM-dd

df.select(
    today.alias('current_date'),
    year(today).alias('year'),
    month(today).alias('month'),
    weekofyear(today).alias('weekofyear'),
    dayofyear(today).alias('dayofyear'),
    dayofmonth(today).alias('dayofmonth'),
    dayofweek(today).alias('dayofweek'),
).show()

+------------+----+-----+----------+---------+----------+---------+
|current_date|year|month|weekofyear|dayofyear|dayofmonth|dayofweek|
+------------+----+-----+----------+---------+----------+---------+
|  2025-04-18|2025|    4|        16|      108|        18|        6|
+------------+----+-----+----------+---------+----------+---------+



In [0]:
# date_format()
# Convenient way to change the format of dates: renders a date/timestamp
# column as a string using the given pattern.

(
    datetimesDF
    .withColumn("date_ym", date_format("date", "yyyy_MM"))
    .withColumn("time_ym", date_format("time", "yyyyMM"))
    # The `date` column has no time part, so the HHmmss fields come out as 000000.
    .withColumn("date_dt", date_format("date", "yyyyMMdd::HHmmss"))
    # Raw string: "\m" is not a valid Python escape sequence, so a normal
    # literal triggers an invalid-escape warning (an error in future Python
    # versions). The runtime value of the pattern is unchanged.
    .withColumn("date_ts", date_format("time", r"yyyyMMdd HH\mm-ss"))
    .show(truncate=False)
)

+----------+-----------------------+-------+-------+----------------+-----------------+
|date      |time                   |date_ym|time_ym|date_dt         |date_ts          |
+----------+-----------------------+-------+-------+----------------+-----------------+
|2014-02-28|2014-02-28 10:00:00.123|2014_02|201402 |20140228::000000|20140228 10\00-00|
|2016-02-29|2016-02-29 08:08:08.999|2016_02|201602 |20160229::000000|20160229 08\08-08|
|2017-10-31|2017-12-31 11:59:59.123|2017_10|201712 |20171031::000000|20171231 11\59-59|
|2019-11-30|2019-08-31 00:00:00.000|2019_11|201908 |20191130::000000|20190831 00\00-00|
+----------+-----------------------+-------+-------+----------------+-----------------+

