In [0]:
# Orders dataset, read as CSV with an explicit schema so every column has a declared type.
orders = (
    spark.read
    .schema('order_id INT, order_date STRING, order_customer_id INT, order_status STRING')
    .csv('/public/retail_db/orders')
)

# Small helper frame: only the order_id column, restricted to order_id = 1.
# The later cells select literal / current-date expressions from it
# (presumably a single row - depends on the data behind `orders`).
df = orders.select('order_id').where('order_id = 1')

# (date, timestamp) string pairs used as sample input by the later cells.
datetimes = [
    ("2014-02-28", "2014-02-28 10:00:00.123"),
    ("2016-02-29", "2016-02-29 08:08:08.999"),
    ("2017-10-31", "2017-12-31 11:59:59.123"),
    ("2019-11-30", "2019-08-31 00:00:00.000"),
]
datetimesDF = spark.createDataFrame(datetimes, schema="date STRING, time STRING")
from pyspark.sql.functions import *

In [0]:
# current_timestamp() and current_date():
# each one is selected on its own and printed without truncating the value.

current_ts_df = df.select(current_timestamp())
current_ts_df.show(truncate=False)

current_date_df = df.select(current_date())
current_date_df.show(truncate=False)

In [0]:
# to_date() / to_timestamp() - convert strings to dates / timestamps.
# Very convenient: the input string can be in almost any format,
# as long as the matching pattern is passed as the second argument.

parsed_date = to_date(lit('20250418'), 'yyyyMMdd').alias('to_date')
df.select(parsed_date).show()

parsed_ts = to_timestamp(lit('20250418: 1940'), 'yyyyMMdd: HHmm').alias('to_timestamp')
df.select(parsed_ts).show()

In [0]:
# Arithmetic functions on the `date` column:
#   date_add / date_sub - move the date 10 days forward / backward
#   add_months          - move the date 10 months forward
#   datediff            - difference in days between today and `date`
#   months_between      - difference in months between today and `date`

(
    datetimesDF
    .withColumn("date_add_date", date_add("date", 10))
    .withColumn("date_sub_date", date_sub("date", 10))
    .withColumn("date_months_date", add_months("date", 10))
    .withColumn("datediff_date", datediff(current_date(), "date"))
    .withColumn("months_between_date", months_between(current_date(), "date"))
    .show()
)

In [0]:
# Trunc functions (getting the beginning of a month, year, hour, minute)
# trunc() - date as input and date as output
# date_trunc() - timestamp as input and timestamp as output; it also accepts the
#                date-level units (day, month, ...).
# Mind the argument order: trunc(column, unit) but date_trunc(unit, column).

(
    datetimesDF
    .withColumn("date_first_day_of_month", trunc("date", "MM"))
    .withColumn("date_first_day_of_year", trunc("date", "yy"))
    .withColumn("time_first_day_of_month", date_trunc("MM", "time"))
    # "hour" zeroes the minutes and seconds, which is what this column's name promises.
    .withColumn("time_first_minute_of_hour", date_trunc("hour", "time"))
    # "minute" only zeroes the seconds (e.g. 08:08:08.999 -> 08:08:00),
    # so it is shown under its own, accurate label.
    .withColumn("time_first_second_of_minute", date_trunc("minute", "time"))
    .show()
)

In [0]:
# Extract functions: pull individual calendar fields out of today's date.

# Built once and reused for every extracted field below.
today_col = current_date()

# current_date itself is displayed as yyyy-MM-dd.
df.select(
    today_col.alias('current_date'),
    year(today_col).alias('year'),
    month(today_col).alias('month'),
    weekofyear(today_col).alias('weekofyear'),
    dayofyear(today_col).alias('dayofyear'),
    dayofmonth(today_col).alias('dayofmonth'),
    dayofweek(today_col).alias('dayofweek'),
).show()

In [0]:
# date_format()
# Convenient way to render a date / timestamp column as a string in a chosen pattern.

(
    datetimesDF
    .withColumn("date_ym", date_format("date", "yyyy_MM"))
    .withColumn("time_ym", date_format("time", "yyyyMM"))
    # `date` holds no time of day, so the HHmmss part is expected to render as 000000.
    .withColumn("date_dt", date_format("date", "yyyyMMdd::HHmmss"))
    # Raw string: the pattern contains a literal backslash between the hours and minutes.
    # In a normal string literal, a single backslash before `m` is an invalid Python
    # escape sequence (it triggers a warning and is slated to become an error).
    .withColumn("date_ts", date_format("time", r"yyyyMMdd HH\mm-ss"))
    .show(truncate=False)
)