In [1]:
import findspark
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *
spark = SparkSession.builder.getOrCreate() 
findspark.init()
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)



In [2]:
df1 = sqlContext.createDataFrame([("Zara Rehman","10-01-2020"),("Asma Rehman","17-03-2020")],["Name","Birthday"])
df1.show()

+-----------+----------+
|       Name|  Birthday|
+-----------+----------+
|Zara Rehman|10-01-2020|
|Asma Rehman|17-03-2020|
+-----------+----------+



In [3]:
# Convert string column to date
df1 = df1.withColumn('Birthday',to_date(df1.Birthday,"dd-MM-yyyy"))

In [4]:
# Calculate week number of year from date
df1 = df1.withColumn('week_of_year', weekofyear(df1.Birthday))

In [5]:
# Calculate week of month
df1 = df1.withColumn('week_of_month',date_format(df1.Birthday,"W"))

In [6]:
df1.show()

+-----------+----------+------------+-------------+
|       Name|  Birthday|week_of_year|week_of_month|
+-----------+----------+------------+-------------+
|Zara Rehman|2020-01-10|           2|            2|
|Asma Rehman|2020-03-17|          12|            3|
+-----------+----------+------------+-------------+



In [7]:
df = sqlContext.createDataFrame([("Micheal","1990-10-08 03:19:58", "1995-12-08 05:30:40"),("Andrew","1996-03-17 05:33:51","1996-04-17 05:33:51")],["Name","Birthday","datee"])
df.show()

+-------+-------------------+-------------------+
|   Name|           Birthday|              datee|
+-------+-------------------+-------------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|
+-------+-------------------+-------------------+



In [8]:
df.select('Birthday').printSchema()

root
 |-- Birthday: string (nullable = true)



In [9]:
# convert string to timestamp
df = df.withColumn('Birthday',to_timestamp(df.Birthday,'yyyy-MM-dd HH:mm:ss'))
df.show()

+-------+-------------------+-------------------+
|   Name|           Birthday|              datee|
+-------+-------------------+-------------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|
+-------+-------------------+-------------------+



In [10]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Birthday: timestamp (nullable = true)
 |-- datee: string (nullable = true)



In [11]:
df = df.withColumn('datee',to_timestamp(df.datee,'yyyy-MM-dd HH:mm:ss'))
df.show()

+-------+-------------------+-------------------+
|   Name|           Birthday|              datee|
+-------+-------------------+-------------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|
+-------+-------------------+-------------------+



In [12]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Birthday: timestamp (nullable = true)
 |-- datee: timestamp (nullable = true)



In [13]:
# Calculate difference between two timestamp in secs
df = df.withColumn('diff_secs', col('datee').cast("long") - col('Birthday').cast("long"))
df.show()

+-------+-------------------+-------------------+---------+
|   Name|           Birthday|              datee|diff_secs|
+-------+-------------------+-------------------+---------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|163044642|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|  2678400|
+-------+-------------------+-------------------+---------+



In [14]:
# Calculate difference between two timestamp in mins
df = df.withColumn('diff_mins', col('diff_secs')/60)
df.show()

+-------+-------------------+-------------------+---------+---------+
|   Name|           Birthday|              datee|diff_secs|diff_mins|
+-------+-------------------+-------------------+---------+---------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|163044642|2717410.7|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|  2678400|  44640.0|
+-------+-------------------+-------------------+---------+---------+



In [15]:
# Calculate difference between two timestamp in hours
df = df.withColumn('diff_hours', col('diff_mins')/60)
df.show()

+-------+-------------------+-------------------+---------+---------+-----------------+
|   Name|           Birthday|              datee|diff_secs|diff_mins|       diff_hours|
+-------+-------------------+-------------------+---------+---------+-----------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|163044642|2717410.7|45290.17833333334|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|  2678400|  44640.0|            744.0|
+-------+-------------------+-------------------+---------+---------+-----------------+



In [16]:
# diff in days between two dates
df = df.withColumn('diff_in_days',datediff(col('datee'),col('Birthday')))
df.show()

+-------+-------------------+-------------------+---------+---------+-----------------+------------+
|   Name|           Birthday|              datee|diff_secs|diff_mins|       diff_hours|diff_in_days|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|163044642|2717410.7|45290.17833333334|        1887|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|  2678400|  44640.0|            744.0|          31|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+



In [17]:
# diff in months between two dates
df = df.withColumn('diff_in_months',months_between(col('datee'),col('Birthday')))
df.show()

+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+
|   Name|           Birthday|              datee|diff_secs|diff_mins|       diff_hours|diff_in_days|diff_in_months|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|163044642|2717410.7|45290.17833333334|        1887|          62.0|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|  2678400|  44640.0|            744.0|          31|           1.0|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+



In [18]:
df = df.withColumn('diff_in_quaters',months_between(col('datee'),col('Birthday'))/4)
df.show()

+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+
|   Name|           Birthday|              datee|diff_secs|diff_mins|       diff_hours|diff_in_days|diff_in_months|diff_in_quaters|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|163044642|2717410.7|45290.17833333334|        1887|          62.0|           15.5|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|  2678400|  44640.0|            744.0|          31|           1.0|           0.25|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+



In [19]:
# diff in years between two dates
df = df.withColumn('diff_in_years',datediff(col('datee'),col('Birthday'))/365)
df.show()

+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+-------------------+
|   Name|           Birthday|              datee|diff_secs|diff_mins|       diff_hours|diff_in_days|diff_in_months|diff_in_quaters|      diff_in_years|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+-------------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|163044642|2717410.7|45290.17833333334|        1887|          62.0|           15.5|   5.16986301369863|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|  2678400|  44640.0|            744.0|          31|           1.0|           0.25|0.08493150684931507|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+-------------------+



In [20]:
# get current date
df = df.withColumn('current_Date', current_date())
df.show()

+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+-------------------+------------+
|   Name|           Birthday|              datee|diff_secs|diff_mins|       diff_hours|diff_in_days|diff_in_months|diff_in_quaters|      diff_in_years|current_Date|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+-------------------+------------+
|Micheal|1990-10-08 03:19:58|1995-12-08 05:30:40|163044642|2717410.7|45290.17833333334|        1887|          62.0|           15.5|   5.16986301369863|  2020-04-23|
| Andrew|1996-03-17 05:33:51|1996-04-17 05:33:51|  2678400|  44640.0|            744.0|          31|           1.0|           0.25|0.08493150684931507|  2020-04-23|
+-------+-------------------+-------------------+---------+---------+-----------------+------------+--------------+---------------+-------------------+------------+



In [21]:
# calculate day of week
df1 = df1.withColumn('Day_of_month', dayofmonth('Birthday'))
df1.show()

+-----------+----------+------------+-------------+------------+
|       Name|  Birthday|week_of_year|week_of_month|Day_of_month|
+-----------+----------+------------+-------------+------------+
|Zara Rehman|2020-01-10|           2|            2|          10|
|Asma Rehman|2020-03-17|          12|            3|          17|
+-----------+----------+------------+-------------+------------+



In [22]:
# Calculate day of year
df1 = df1.withColumn('Day_of_year', dayofyear('Birthday'))
df1.show()

+-----------+----------+------------+-------------+------------+-----------+
|       Name|  Birthday|week_of_year|week_of_month|Day_of_month|Day_of_year|
+-----------+----------+------------+-------------+------------+-----------+
|Zara Rehman|2020-01-10|           2|            2|          10|         10|
|Asma Rehman|2020-03-17|          12|            3|          17|         77|
+-----------+----------+------------+-------------+------------+-----------+



In [23]:
# calculate day of week (1-sunday, 2-monday...7-saturday)
df1 = df1.withColumn('Day_of_week', dayofweek('Birthday'))
df1.show()

+-----------+----------+------------+-------------+------------+-----------+-----------+
|       Name|  Birthday|week_of_year|week_of_month|Day_of_month|Day_of_year|Day_of_week|
+-----------+----------+------------+-------------+------------+-----------+-----------+
|Zara Rehman|2020-01-10|           2|            2|          10|         10|          6|
|Asma Rehman|2020-03-17|          12|            3|          17|         77|          3|
+-----------+----------+------------+-------------+------------+-----------+-----------+



In [24]:
df1 = df1.withColumn('week_day_name',date_format('Birthday',"EEEE"))
df1.show()           

+-----------+----------+------------+-------------+------------+-----------+-----------+-------------+
|       Name|  Birthday|week_of_year|week_of_month|Day_of_month|Day_of_year|Day_of_week|week_day_name|
+-----------+----------+------------+-------------+------------+-----------+-----------+-------------+
|Zara Rehman|2020-01-10|           2|            2|          10|         10|          6|       Friday|
|Asma Rehman|2020-03-17|          12|            3|          17|         77|          3|      Tuesday|
+-----------+----------+------------+-------------+------------+-----------+-----------+-------------+



In [25]:
df_date = df.select('birthday')

In [26]:
# Add hours minutes or seconds to timestamp
df_date.withColumn('time',df_date.birthday + expr('INTERVAL 3 HOURS')).show()
df_date.withColumn('time',df_date.birthday + expr('INTERVAL 01 minutes')).show()

+-------------------+-------------------+
|           birthday|               time|
+-------------------+-------------------+
|1990-10-08 03:19:58|1990-10-08 06:19:58|
|1996-03-17 05:33:51|1996-03-17 08:33:51|
+-------------------+-------------------+

+-------------------+-------------------+
|           birthday|               time|
+-------------------+-------------------+
|1990-10-08 03:19:58|1990-10-08 03:20:58|
|1996-03-17 05:33:51|1996-03-17 05:34:51|
+-------------------+-------------------+



In [27]:
df_date.withColumn('hour',hour(df_date.birthday)).show()
df_date.withColumn('minute',minute(df_date.birthday)).show()
df_date.withColumn('sec',second(df_date.birthday)).show()
df_date.withColumn('year',year(df_date.birthday)).show()
df_date.withColumn('month',month(df_date.birthday)).show()
df_date.withColumn('quarter',quarter(df_date.birthday)).show()

+-------------------+----+
|           birthday|hour|
+-------------------+----+
|1990-10-08 03:19:58|   3|
|1996-03-17 05:33:51|   5|
+-------------------+----+

+-------------------+------+
|           birthday|minute|
+-------------------+------+
|1990-10-08 03:19:58|    19|
|1996-03-17 05:33:51|    33|
+-------------------+------+

+-------------------+---+
|           birthday|sec|
+-------------------+---+
|1990-10-08 03:19:58| 58|
|1996-03-17 05:33:51| 51|
+-------------------+---+

+-------------------+----+
|           birthday|year|
+-------------------+----+
|1990-10-08 03:19:58|1990|
|1996-03-17 05:33:51|1996|
+-------------------+----+

+-------------------+-----+
|           birthday|month|
+-------------------+-----+
|1990-10-08 03:19:58|   10|
|1996-03-17 05:33:51|    3|
+-------------------+-----+

+-------------------+-------+
|           birthday|quarter|
+-------------------+-------+
|1990-10-08 03:19:58|      4|
|1996-03-17 05:33:51|      1|
+-------------------+--