In [1]:
import findspark

# findspark.init() is deliberately called before the pyspark import below;
# presumably it locates the local Spark installation and makes `pyspark`
# importable -- keep this ordering (TODO confirm against the findspark docs).
findspark.init()

from pyspark.sql import SparkSession

# Session handle shared by the rest of the notebook: later cells call
# spark.range(...) and spark.createDataFrame(...) on it.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Two-row starter DataFrame with a single `id` column holding 0 and 1;
# the date columns in the following cells are attached to it.
df = spark.range(0, 2)

df.show()

+---+
| id|
+---+
|  0|
|  1|
+---+



In [3]:
from pyspark.sql.functions import current_date

# A DateType column is displayed in the default yyyy-MM-dd layout
# (see the printed table and schema below).
df1 = df.withColumn(
    'CurrentDate',
    current_date(),
)

df1.show()

df1.printSchema()

+---+-----------+
| id|CurrentDate|
+---+-----------+
|  0| 2023-03-17|
|  1| 2023-03-17|
+---+-----------+

root
 |-- id: long (nullable = false)
 |-- CurrentDate: date (nullable = false)



In [5]:
from pyspark.sql.functions import date_format

df2 = df1.withColumn('CurrentDate', date_format(df1.CurrentDate, 'yyyy.mm.dd') )

df2.show()

df2.printSchema()

+---+-----------+
| id|CurrentDate|
+---+-----------+
|  0| 2023.00.17|
|  1| 2023.00.17|
+---+-----------+

root
 |-- id: long (nullable = false)
 |-- CurrentDate: string (nullable = false)



In [6]:
from pyspark.sql.functions import to_date

df3 = df2.withColumn('todate', to_date(df2.CurrentDate,'yyyy.mm.dd'))

df3.show()

df3.printSchema()

+---+-----------+----------+
| id|CurrentDate|    todate|
+---+-----------+----------+
|  0| 2023.00.17|2023-01-17|
|  1| 2023.00.17|2023-01-17|
+---+-----------+----------+

root
 |-- id: long (nullable = false)
 |-- CurrentDate: string (nullable = false)
 |-- todate: date (nullable = true)



In [8]:
df4= df3.select('*',to_date(df3.CurrentDate,'yyyy.mm.dd').alias('CurrentDataDateType'))

df4.show()

df4.printSchema()

+---+-----------+----------+-------------------+
| id|CurrentDate|    todate|CurrentDataDateType|
+---+-----------+----------+-------------------+
|  0| 2023.00.17|2023-01-17|         2023-01-17|
|  1| 2023.00.17|2023-01-17|         2023-01-17|
+---+-----------+----------+-------------------+

root
 |-- id: long (nullable = false)
 |-- CurrentDate: string (nullable = false)
 |-- todate: date (nullable = true)
 |-- CurrentDataDateType: date (nullable = true)



In [24]:
from pyspark.sql.functions import datediff

# Three rows, one string column `d1`, each holding a yyyy-MM-dd date string.
# Note: this rebinds `df`, replacing the frame built from spark.range earlier.
schema = ['d1']
data = [(day,) for day in ('2015-06-14', '2022-05-07', '2023-09-05')]

df = spark.createDataFrame(data, schema)

df.show()

df.printSchema()

+----------+
|        d1|
+----------+
|2015-06-14|
|2022-05-07|
|2023-09-05|
+----------+

root
 |-- d1: string (nullable = true)



In [25]:
from pyspark.sql.functions import datediff

# A single row with three yyyy-MM-dd date strings, one per column (d1, d2, d3).
# The columns are plain strings, as the printed schema shows; the date
# functions in the following cells are applied to them directly.
schema = ['d1', 'd2', 'd3']
data = [('2015-06-14', '2022-05-07', '2023-09-05')]

df = spark.createDataFrame(data, schema)

df.show()

df.printSchema()

+----------+----------+----------+
|        d1|        d2|        d3|
+----------+----------+----------+
|2015-06-14|2022-05-07|2023-09-05|
+----------+----------+----------+

root
 |-- d1: string (nullable = true)
 |-- d2: string (nullable = true)
 |-- d3: string (nullable = true)



In [27]:
# Number of whole days from d1 to d2 (the later date is passed first,
# giving a positive count).
df.withColumn(
    'datediff',
    datediff(df['d2'], df['d1']),
).show()

+----------+----------+----------+--------+
|        d1|        d2|        d3|datediff|
+----------+----------+----------+--------+
|2015-06-14|2022-05-07|2023-09-05|    2519|
+----------+----------+----------+--------+



In [28]:
from pyspark.sql.functions import months_between

# Fractional number of months separating d1 from d2.
df.withColumn(
    'monthsbetween',
    months_between(df['d2'], df['d1']),
).show()

+----------+----------+----------+-------------+
|        d1|        d2|        d3|monthsbetween|
+----------+----------+----------+-------------+
|2015-06-14|2022-05-07|2023-09-05|  82.77419355|
+----------+----------+----------+-------------+



In [31]:
from pyspark.sql.functions import add_months

# Shift d2 forward by five months.
df.withColumn(
    'addmonths',
    add_months(df['d2'], 5),
).show()

# A negative month count shifts d2 backwards (here: three months earlier).
df.withColumn(
    'submonths',
    add_months(df['d2'], -3),
).show()

+----------+----------+----------+----------+
|        d1|        d2|        d3| addmonths|
+----------+----------+----------+----------+
|2015-06-14|2022-05-07|2023-09-05|2022-10-07|
+----------+----------+----------+----------+

+----------+----------+----------+----------+
|        d1|        d2|        d3| submonths|
+----------+----------+----------+----------+
|2015-06-14|2022-05-07|2023-09-05|2022-02-07|
+----------+----------+----------+----------+



In [33]:
from pyspark.sql.functions import date_add

df.withColumn('daysadd',date_add(df.d1,3)).show()

df.withColumn('subdays',date_add(df.d1, 5)).show()

+----------+----------+----------+----------+
|        d1|        d2|        d3|   daysadd|
+----------+----------+----------+----------+
|2015-06-14|2022-05-07|2023-09-05|2015-06-17|
+----------+----------+----------+----------+

+----------+----------+----------+----------+
|        d1|        d2|        d3|   subdays|
+----------+----------+----------+----------+
|2015-06-14|2022-05-07|2023-09-05|2015-06-19|
+----------+----------+----------+----------+



In [34]:
from pyspark.sql.functions import year,month

# Extract the calendar year and the month number of d1 into two new columns.
(
    df
    .withColumn('year', year(df['d1']))
    .withColumn('month', month(df['d1']))
    .show()
)

+----------+----------+----------+----+-----+
|        d1|        d2|        d3|year|month|
+----------+----------+----------+----+-----+
|2015-06-14|2022-05-07|2023-09-05|2015|    6|
+----------+----------+----------+----+-----+

