In [0]:
########## Run this code snippet when running for the first time and don't repeat it in future (else it will keep on downloading the same stuffs again and again and
########## result in redundant usage of memory)

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apachemirror.wuchna.com/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
import findspark
import pandas as pd
import numpy as np

# Point the environment at the Java runtime and the unpacked Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# Initialise findspark with the absolute SPARK_HOME instead of a relative
# directory name, so the result does not depend on the current working directory.
findspark.init(os.environ["SPARK_HOME"])

# These imports must stay below findspark.init(), which is what makes pyspark importable here.
# NOTE: the star imports are used unqualified throughout the notebook; be aware that
# pyspark.sql.functions shadows Python builtins such as sum, min, max and round.
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Local Spark session using all available cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
# Read the sample CSV: the first row is the header and column types are inferred.
pathname = '/content/date_sample.csv'
df = spark.read.csv(
    path=pathname,
    header=True,
    inferSchema=True,
)
df.show(n=3)

+----------+----------+-----+
|     date1|     date2|date3|
+----------+----------+-----+
|18-07-2032|19-02-2038|45323|
|25-05-2024|30-03-2039|45572|
|25-10-2030|14-07-2035|43492|
+----------+----------+-----+
only showing top 3 rows



**Current Date:-** Inserting a column having current date

In [0]:
# Add a column holding the current date on every row.
today_col = current_date()
df = df.withColumn('current_date', today_col)
df.show(n=3)

+----------+----------+-----+------------+
|     date1|     date2|date3|current_date|
+----------+----------+-----+------------+
|18-07-2032|19-02-2038|45323|  2020-01-15|
|25-05-2024|30-03-2039|45572|  2020-01-15|
|25-10-2030|14-07-2035|43492|  2020-01-15|
+----------+----------+-----+------------+
only showing top 3 rows



**Converting a string to date object:-** The columns date1 and date2 were read as strings (see the schema printed below). We will convert them to DateType by using the 'to_date' function below

In [0]:
# Parse the dd-MM-yyyy strings into DateType columns. to_date accepts the
# source pattern directly, so no detour through unix_timestamp is needed.
df = df.withColumn('date1_date_type', to_date('date1', 'dd-MM-yyyy'))
df = df.withColumn('date2_date_type', to_date('date2', 'dd-MM-yyyy'))
df.show(3)
# printSchema() writes the schema itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
df.printSchema()

+----------+----------+-----+------------+---------------+---------------+
|     date1|     date2|date3|current_date|date1_date_type|date2_date_type|
+----------+----------+-----+------------+---------------+---------------+
|18-07-2032|19-02-2038|45323|  2020-01-15|     2032-07-18|     2038-02-19|
|25-05-2024|30-03-2039|45572|  2020-01-15|     2024-05-25|     2039-03-30|
|25-10-2030|14-07-2035|43492|  2020-01-15|     2030-10-25|     2035-07-14|
+----------+----------+-----+------------+---------------+---------------+
only showing top 3 rows

root
 |-- date1: string (nullable = true)
 |-- date2: string (nullable = true)
 |-- date3: integer (nullable = true)
 |-- current_date: date (nullable = false)
 |-- date1_date_type: date (nullable = true)
 |-- date2_date_type: date (nullable = true)

None


**Converting the format of a particular date:-** Here we can see how to convert a format of a particular date column

In [0]:
# date_format renders a date column as a string in the requested pattern.
# withColumn already names the result, so no additional alias is needed,
# and the four-letter 'yyyy' year pattern is used for a full year.
df = df.withColumn('date1_new_format', date_format('date1_date_type', 'dd/MMM/yyyy'))
df = df.withColumn('date2_new_format', date_format('date2_date_type', 'MMM dd,yyyy'))
df.show(3)

+----------+----------+-----+------------+---------------+---------------+----------------+----------------+
|     date1|     date2|date3|current_date|date1_date_type|date2_date_type|date1_new_format|date2_new_format|
+----------+----------+-----+------------+---------------+---------------+----------------+----------------+
|18-07-2032|19-02-2038|45323|  2020-01-15|     2032-07-18|     2038-02-19|     18/Jul/2032|     Feb 19,2038|
|25-05-2024|30-03-2039|45572|  2020-01-15|     2024-05-25|     2039-03-30|     25/May/2024|     Mar 30,2039|
|25-10-2030|14-07-2035|43492|  2020-01-15|     2030-10-25|     2035-07-14|     25/Oct/2030|     Jul 14,2035|
+----------+----------+-----+------------+---------------+---------------+----------------+----------------+
only showing top 3 rows



**Truncating a date:-** We will truncate a date down to the start of its month, year and day (truncation always rounds down, not to the nearest unit). Other truncation units are available as well. They are :-  <br>
**format – ‘year’, ‘yyyy’, ‘yy’, ‘month’, ‘mon’, ‘mm’, ‘day’, ‘dd’, ‘hour’, ‘minute’, ‘second’, ‘week’, ‘quarter’**





In [0]:
# Truncate date1 to the start of its month, year and day (in that order),
# storing each result in a 'truncated_<unit>' column.
for unit in ('month', 'year', 'day'):
    df = df.withColumn('truncated_' + unit, date_trunc(unit, 'date1_date_type'))

truncated_view = df.select('date1_date_type', 'truncated_month', 'truncated_year', 'truncated_day')
truncated_view.show(4)

+---------------+-------------------+-------------------+-------------------+
|date1_date_type|    truncated_month|     truncated_year|      truncated_day|
+---------------+-------------------+-------------------+-------------------+
|     2032-07-18|2032-07-01 00:00:00|2032-01-01 00:00:00|2032-07-18 00:00:00|
|     2024-05-25|2024-05-01 00:00:00|2024-01-01 00:00:00|2024-05-25 00:00:00|
|     2030-10-25|2030-10-01 00:00:00|2030-01-01 00:00:00|2030-10-25 00:00:00|
|     2025-09-29|2025-09-01 00:00:00|2025-01-01 00:00:00|2025-09-29 00:00:00|
+---------------+-------------------+-------------------+-------------------+
only showing top 4 rows



**Adding days to a particular date:-** We can add days to a particular date by using 'date_add'

In [0]:
# Shift date1 forward by 70 days.
df = df.withColumn(
    '70days_after_date1',
    date_add(df['date1_date_type'], 70),
)
print('The date 70 days after the dates mentioned in date1_date_type is given by:-')
df.select('date1_date_type', '70days_after_date1').show(n=3)

The date 70 days after the dates mentioned in date1_date_type is given by:-
+---------------+------------------+
|date1_date_type|70days_after_date1|
+---------------+------------------+
|     2032-07-18|        2032-09-26|
|     2024-05-25|        2024-08-03|
|     2030-10-25|        2031-01-03|
+---------------+------------------+
only showing top 3 rows



**Subtracting days from a particular date:-** We can find a date some days prior to a particular date using date_sub

In [0]:
# Shift date1 backward by 70 days.
df = df.withColumn(
    '70days_before_date1',
    date_sub(df['date1_date_type'], 70),
)
print('The date 70 days before the dates mentioned in date1_date_type is given by:-')
df.select('date1_date_type', '70days_before_date1').show(n=3)

The date 70 days before the dates mentioned in date1_date_type is given by:-
+---------------+-------------------+
|date1_date_type|70days_before_date1|
+---------------+-------------------+
|     2032-07-18|         2032-05-09|
|     2024-05-25|         2024-03-16|
|     2030-10-25|         2030-08-16|
+---------------+-------------------+
only showing top 3 rows



**Calculating the days between two dates:-** We will use datediff to calculate the number of days between two dates

In [0]:
# Number of days from date1 to date2 (datediff takes the end date first).
end_date = df['date2_date_type']
start_date = df['date1_date_type']
df = df.withColumn('datediff', datediff(end_date, start_date))
df.select('datediff', 'date2_date_type', 'date1_date_type').show(n=3)

+--------+---------------+---------------+
|datediff|date2_date_type|date1_date_type|
+--------+---------------+---------------+
|    2042|     2038-02-19|     2032-07-18|
|    5422|     2039-03-30|     2024-05-25|
|    1723|     2035-07-14|     2030-10-25|
+--------+---------------+---------------+
only showing top 3 rows



Extracting month, year and quarter from a particular date

In [0]:
# Derive the calendar month, year and quarter of date1, one column per part.
date1_parts = [
    ('month_date1', month),
    ('year_date1', year),
    ('quarter_date1', quarter),
]
for part_name, extract in date1_parts:
    df = df.withColumn(part_name, extract(df['date1_date_type']))

df.select('month_date1', 'year_date1', 'quarter_date1', 'date1_date_type').show(n=5)

+-----------+----------+-------------+---------------+
|month_date1|year_date1|quarter_date1|date1_date_type|
+-----------+----------+-------------+---------------+
|          7|      2032|            3|     2032-07-18|
|          5|      2024|            2|     2024-05-25|
|         10|      2030|            4|     2030-10-25|
|          9|      2025|            3|     2025-09-29|
|          2|      2030|            1|     2030-02-13|
+-----------+----------+-------------+---------------+
only showing top 5 rows



**Adding months to an existing date**

In [0]:
# Move date1 five calendar months into the future.
df = df.withColumn(
    'add_5_months',
    add_months(df['date1_date_type'], 5),
)
df.select('add_5_months', 'date1_date_type').show(n=5)

+------------+---------------+
|add_5_months|date1_date_type|
+------------+---------------+
|  2032-12-18|     2032-07-18|
|  2024-10-25|     2024-05-25|
|  2031-03-25|     2030-10-25|
|  2026-02-28|     2025-09-29|
|  2030-07-13|     2030-02-13|
+------------+---------------+
only showing top 5 rows



Months between two given dates

In [0]:
# Fractional number of months from date1 to date2 (later date passed first).
later_date = df['date2_date_type']
earlier_date = df['date1_date_type']
df = df.withColumn('months_between', months_between(later_date, earlier_date))
df.select('months_between', 'date1_date_type', 'date2_date_type').show(n=5)

+--------------+---------------+---------------+
|months_between|date1_date_type|date2_date_type|
+--------------+---------------+---------------+
|   67.03225806|     2032-07-18|     2038-02-19|
|  178.16129032|     2024-05-25|     2039-03-30|
|   56.64516129|     2030-10-25|     2035-07-14|
|  109.12903226|     2025-09-29|     2034-11-02|
|   87.38709677|     2030-02-13|     2037-05-25|
+--------------+---------------+---------------+
only showing top 5 rows



Computing the 
1. Day of month
2. Day of year
3. week of year
4. day of week (1 = Sunday, ..., 7 = Saturday)

In [0]:
# Position of date1 within its year, ISO-style week, week and month.
# Columns are added in the same order as listed here.
day_position_extractors = [
    ('day_of_year_dt1', dayofyear),
    ('week_of_year_dt1', weekofyear),
    ('day_of_week_dt1', dayofweek),
    ('day_of_month_dt1', dayofmonth),
]
for target_name, extract in day_position_extractors:
    df = df.withColumn(target_name, extract(df['date1_date_type']))

df.select(
    'date1_date_type',
    'day_of_year_dt1',
    'week_of_year_dt1',
    'day_of_week_dt1',
    'day_of_month_dt1',
).show(n=5)


+---------------+---------------+----------------+---------------+----------------+
|date1_date_type|day_of_year_dt1|week_of_year_dt1|day_of_week_dt1|day_of_month_dt1|
+---------------+---------------+----------------+---------------+----------------+
|     2032-07-18|            200|              29|              1|              18|
|     2024-05-25|            146|              21|              7|              25|
|     2030-10-25|            298|              43|              6|              25|
|     2025-09-29|            272|              40|              2|              29|
|     2030-02-13|             44|               7|              4|              13|
+---------------+---------------+----------------+---------------+----------------+
only showing top 5 rows



Computing the last day and next day of date. 
1. Here we will compute the last day of the month for the date given using last_day
2. We also compute the first date after a given date that falls on a chosen day of the week (Monday, Tuesday, Wednesday, ..., Sunday) using next_day

In [0]:
# For date2: the last day of its month, and the first Tuesday after it.
date2_column = df['date2_date_type']
df = df.withColumn('last_day_dt2', last_day(date2_column))
df = df.withColumn('next_day_dt2', next_day(df['date2_date_type'], 'Tue'))
df.select('date2_date_type', 'last_day_dt2', 'next_day_dt2').show(n=5)

+---------------+------------+------------+
|date2_date_type|last_day_dt2|next_day_dt2|
+---------------+------------+------------+
|     2038-02-19|  2038-02-28|  2038-02-23|
|     2039-03-30|  2039-03-31|  2039-04-05|
|     2035-07-14|  2035-07-31|  2035-07-17|
|     2034-11-02|  2034-11-30|  2034-11-07|
|     2037-05-25|  2037-05-31|  2037-05-26|
+---------------+------------+------------+
only showing top 5 rows



**Computing a current timestamp column:-**

In [0]:
# Add a column with the current timestamp and show it untruncated.
now_col = current_timestamp()
df = df.withColumn('Current_ts', now_col)
df.select('Current_ts').show(n=4, truncate=False)

+-----------------------+
|Current_ts             |
+-----------------------+
|2020-01-15 20:05:35.352|
|2020-01-15 20:05:35.352|
|2020-01-15 20:05:35.352|
|2020-01-15 20:05:35.352|
+-----------------------+
only showing top 4 rows



Converting Unix epoch values (seconds elapsed since 1970-01-01 00:00:00 UTC) into readable date-time values using from_unixtime

In [0]:
# Build a one-column DataFrame of integer epoch-second values.
Dataset1 = Row("timestamps_int")
epoch_seconds = (2028476499, 1728476470, 3223446490, 1284964980)
data1 = [Dataset1(seconds) for seconds in epoch_seconds]
df11 = spark.createDataFrame(data1)

In [0]:
# Render each epoch-second value as a date-time value.
df11 = df11.withColumn(
    'Timestamp_values',
    from_unixtime('timestamps_int'),
)
df11.show()

+--------------+-------------------+
|timestamps_int|   Timestamp_values|
+--------------+-------------------+
|    2028476499|2034-04-12 17:41:39|
|    1728476470|2024-10-09 12:21:10|
|    3223446490|2072-02-23 09:48:10|
|    1284964980|2010-09-20 06:43:00|
+--------------+-------------------+



Converting a particular date to unix time stamps and retrieving back the original dates and times

In [0]:
# Step 1: convert date2 into Unix epoch seconds.
df = df.withColumn(
    'unix_time_stamps',
    unix_timestamp('date2_date_type'),
)
df.select('unix_time_stamps', 'date2_date_type').show(n=5)

# Step 2: turn the epoch seconds back into timestamps and compare with the original dates.
print('We can convert the unix time stamp values to date and times in following ways')
df = df.withColumn(
    'Timestamp_values',
    to_timestamp('unix_time_stamps'),
)
df.select('unix_time_stamps', 'Timestamp_values', 'date2_date_type').show(n=5)

+----------------+---------------+
|unix_time_stamps|date2_date_type|
+----------------+---------------+
|      2150150400|     2038-02-19|
|      2185056000|     2039-03-30|
|      2067984000|     2035-07-14|
|      2046038400|     2034-11-02|
|      2126822400|     2037-05-25|
+----------------+---------------+
only showing top 5 rows

We can convert the unix time stamp values to date and times in following ways
+----------------+-------------------+---------------+
|unix_time_stamps|   Timestamp_values|date2_date_type|
+----------------+-------------------+---------------+
|      2150150400|2038-02-19 00:00:00|     2038-02-19|
|      2185056000|2039-03-30 00:00:00|     2039-03-30|
|      2067984000|2035-07-14 00:00:00|     2035-07-14|
|      2046038400|2034-11-02 00:00:00|     2034-11-02|
|      2126822400|2037-05-25 00:00:00|     2037-05-25|
+----------------+-------------------+---------------+
only showing top 5 rows



Extracting seconds, hours and minutes from timestamp data. The column must be a timestamp column or a string column that Spark can cast to a timestamp

In [0]:
# Pull the seconds, hours and minutes components out of Timestamp_values,
# adding the columns in that order.
time_part_extractors = [
    ('Seconds', second),
    ('Hours', hour),
    ('Minutes', minute),
]
for part_name, extract in time_part_extractors:
    df11 = df11.withColumn(part_name, extract(df11['Timestamp_values']))

df11.show()

+--------------+-------------------+-------+-----+-------+
|timestamps_int|   Timestamp_values|Seconds|Hours|Minutes|
+--------------+-------------------+-------+-----+-------+
|    2028476499|2034-04-12 17:41:39|     39|   17|     41|
|    1728476470|2024-10-09 12:21:10|     10|   12|     21|
|    3223446490|2072-02-23 09:48:10|     10|    9|     48|
|    1284964980|2010-09-20 06:43:00|      0|    6|     43|
+--------------+-------------------+-------+-----+-------+

