In [2]:
from pyspark.sql import SparkSession
# Session handle used by every cell below; the application is registered
# under the name "Date_functions".
spark = (
    SparkSession.builder
    .appName("Date_functions")
    .getOrCreate()
)

In [3]:
# Base frame for the date examples: a single `id` column holding 0 and 1.
df = spark.range(start=0, end=2)
df.show()

+---+
| id|
+---+
|  0|
|  1|
+---+



In [5]:
from pyspark.sql.functions import current_date,date_format

# Append a date-typed column carrying the current date to every row.
df1 = df.withColumn(
    "current_date",
    current_date(),
)
df1.show()
df1.printSchema()

+---+------------+
| id|current_date|
+---+------------+
|  0|  2024-08-13|
|  1|  2024-08-13|
+---+------------+

root
 |-- id: long (nullable = false)
 |-- current_date: date (nullable = false)



In [8]:
# Render the date column as a string, using dots as the field separator.
df2 = df1.withColumn(
    "current_date_string",
    date_format(df1["current_date"], "yyyy.MM.dd"),
)
df2.show()
df2.printSchema()

+---+------------+-------------------+
| id|current_date|current_date_string|
+---+------------+-------------------+
|  0|  2024-08-13|         2024.08.13|
|  1|  2024-08-13|         2024.08.13|
+---+------------+-------------------+

root
 |-- id: long (nullable = false)
 |-- current_date: date (nullable = false)
 |-- current_date_string: string (nullable = false)



In [17]:
from pyspark.sql.functions import current_date,date_format,to_date

# Parse the dotted string back into a date; the pattern mirrors the one
# that was used to produce `current_date_string`.
df3 = df2.withColumn(
    "cur_date",
    to_date(df2["current_date_string"], "yyyy.MM.dd"),
)
df3.show()
df3.printSchema()



+---+------------+-------------------+----------+
| id|current_date|current_date_string|  cur_date|
+---+------------+-------------------+----------+
|  0|  2024-08-13|         2024.08.13|2024-08-13|
|  1|  2024-08-13|         2024.08.13|2024-08-13|
+---+------------+-------------------+----------+

root
 |-- id: long (nullable = false)
 |-- current_date: date (nullable = false)
 |-- current_date_string: string (nullable = false)
 |-- cur_date: date (nullable = true)



In [33]:
from pyspark.sql.functions import datediff,months_between,add_months,date_add


In [24]:
# One sample row with two dates written as yyyy-MM-dd strings.
data = [
    ("2024-08-23", "2024-09-23"),
]
schema = ["d1", "d2"]
df = spark.createDataFrame(data, schema=schema)
df.show()

+----------+----------+
|        d1|        d2|
+----------+----------+
|2024-08-23|2024-09-23|
+----------+----------+



In [27]:
# Number of days from d1 to d2 (the end date is passed first).
df1 = df.withColumn(
    "datediff",
    datediff(df["d2"], df["d1"]),
)
df1.show()
df1.printSchema()

+----------+----------+--------+
|        d1|        d2|datediff|
+----------+----------+--------+
|2024-08-23|2024-09-23|      31|
+----------+----------+--------+

root
 |-- d1: string (nullable = true)
 |-- d2: string (nullable = true)
 |-- datediff: integer (nullable = true)



In [29]:
# Number of months from d1 to d2 (the later date is passed first).
df2 = df1.withColumn(
    "months_between",
    months_between(df["d2"], df["d1"]),
)
df2.show()
df2.printSchema()

+----------+----------+--------+--------------+
|        d1|        d2|datediff|months_between|
+----------+----------+--------+--------------+
|2024-08-23|2024-09-23|      31|           1.0|
+----------+----------+--------+--------------+

root
 |-- d1: string (nullable = true)
 |-- d2: string (nullable = true)
 |-- datediff: integer (nullable = true)
 |-- months_between: double (nullable = true)



In [32]:
# A positive offset shifts d2 forward by three months.
df3 = df2.withColumn("add_months", add_months(df["d2"], 3))
df3.show()

# A negative offset shifts d2 back by six months.
df4 = df3.withColumn("sub_months", add_months(df["d2"], -6))
df4.show()



+----------+----------+--------+--------------+----------+
|        d1|        d2|datediff|months_between|add_months|
+----------+----------+--------+--------------+----------+
|2024-08-23|2024-09-23|      31|           1.0|2024-12-23|
+----------+----------+--------+--------------+----------+

+----------+----------+--------+--------------+----------+----------+
|        d1|        d2|datediff|months_between|add_months|sub_months|
+----------+----------+--------+--------------+----------+----------+
|2024-08-23|2024-09-23|      31|           1.0|2024-12-23|2024-03-23|
+----------+----------+--------+--------------+----------+----------+



In [34]:
# DataFrame.show() only prints and returns None, so its result must not be
# assigned: bind the DataFrame first and display it in a separate step.

# d2 shifted forward by six days (displayed only).
df4.withColumn('days_add', date_add(df.d2, 6)).show()

# d2 shifted back by six days; a negative count subtracts days. The column
# is still named 'days_add', as in the original cell.
df5 = df4.withColumn('days_add', date_add(df.d2, -6))
df5.show()


+----------+----------+--------+--------------+----------+----------+----------+
|        d1|        d2|datediff|months_between|add_months|sub_months|  days_add|
+----------+----------+--------+--------------+----------+----------+----------+
|2024-08-23|2024-09-23|      31|           1.0|2024-12-23|2024-03-23|2024-09-29|
+----------+----------+--------+--------------+----------+----------+----------+

+----------+----------+--------+--------------+----------+----------+----------+
|        d1|        d2|datediff|months_between|add_months|sub_months|  days_add|
+----------+----------+--------+--------------+----------+----------+----------+
|2024-08-23|2024-09-23|      31|           1.0|2024-12-23|2024-03-23|2024-09-17|
+----------+----------+--------+--------------+----------+----------+----------+



In [35]:
from pyspark.sql.functions import datediff,months_between,add_months,date_add,year,month

# DataFrame.show() only prints and returns None, so its result must not be
# assigned: bind the DataFrame first and display it in a separate step.

# Year component of d2 (displayed only).
df4.withColumn('year', year(df.d2)).show()

# Month component of d2; df5 now refers to a real DataFrame, not None.
df5 = df4.withColumn('month', month(df.d2))
df5.show()


+----------+----------+--------+--------------+----------+----------+----+
|        d1|        d2|datediff|months_between|add_months|sub_months|year|
+----------+----------+--------+--------------+----------+----------+----+
|2024-08-23|2024-09-23|      31|           1.0|2024-12-23|2024-03-23|2024|
+----------+----------+--------+--------------+----------+----------+----+

+----------+----------+--------+--------------+----------+----------+-----+
|        d1|        d2|datediff|months_between|add_months|sub_months|month|
+----------+----------+--------+--------------+----------+----------+-----+
|2024-08-23|2024-09-23|      31|           1.0|2024-12-23|2024-03-23|    9|
+----------+----------+--------+--------------+----------+----------+-----+



In [5]:
from pyspark.sql.functions import to_timestamp,lit,col

# Convert a timestamp string to an actual timestamp using a matching pattern.
# The base frame is built here instead of reusing `df`: earlier in the
# notebook `df` is rebound to the one-row d1/d2 frame, whereas this example
# (and its recorded output) uses the two-row `id` frame.
df_with_timestamp = spark.range(2).withColumn(
    'convert_timestamp',
    to_timestamp(lit('2024.08.13 18.03.23.132'), 'yyyy.MM.dd HH.mm.ss.SSS')
)

# Show the result; truncate=False keeps the full timestamp visible.
df_with_timestamp.show(truncate=False)


+---+-----------------------+
|id |convert_timestamp      |
+---+-----------------------+
|0  |2024-08-13 18:03:23.132|
|1  |2024-08-13 18:03:23.132|
+---+-----------------------+



In [7]:
from pyspark.sql.functions import hour,minute,second
# Keep every existing column and append the hour, minute and second
# extracted from the parsed timestamp.
df_final = df_with_timestamp.select(
    '*',
    hour(df_with_timestamp['convert_timestamp']).alias('hour'),
    minute(df_with_timestamp['convert_timestamp']).alias('minute'),
    second(df_with_timestamp['convert_timestamp']).alias('second'),
)
df_final.show(truncate=False)

+---+-----------------------+----+------+------+
|id |convert_timestamp      |hour|minute|second|
+---+-----------------------+----+------+------+
|0  |2024-08-13 18:03:23.132|18  |3     |23    |
|1  |2024-08-13 18:03:23.132|18  |3     |23    |
+---+-----------------------+----+------+------+

