In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")


In [0]:
list_data=[
           ["2022/03/31 01:55 AM"],
           ["2022/03/30 01:15 AM"],
           ["2022/03/29 02:15 PM"],
           ["2022/04/01 04:15 PM"],
          ]
list_schema=["inp_col"]

#Create DataFrame from the list 
df1=spark.createDataFrame(list_data,list_schema)

df1.printSchema()

df1.show()
df1.printSchema()

root
 |-- inp_col: string (nullable = true)

+-------------------+
|            inp_col|
+-------------------+
|2022/03/31 01:55 AM|
|2022/03/30 01:15 AM|
|2022/03/29 02:15 PM|
|2022/04/01 04:15 PM|
+-------------------+

root
 |-- inp_col: string (nullable = true)



In [0]:
df2=df1.withColumn("date_p",to_date("inp_col","yyyy/MM/dd"))\
.withColumn("time",to_timestamp("inp_col","yyyy/MM/dd hh:mm a"))
df2.printSchema()

root
 |-- inp_col: string (nullable = true)
 |-- date_p: date (nullable = true)
 |-- time: timestamp (nullable = true)



In [0]:
#Get Month,Year,Day,Hour,Min,Quarter,WeekofYear
df2=df2.withColumn("Year_P",year("time"))\
   .withColumn("Month_P",month("time"))\
    .withColumn("day",dayofmonth("time"))\
    .withColumn("hour_p",hour("time"))\
    .withColumn("Minute_P",minute("time"))\
    .withColumn("Quarter_P",quarter("time"))\
    .withColumn("week-of-year",weekofyear("time"))


In [0]:
#Get Month name ,Dayname in pyspark
df3=df2.withColumn("DAY_OF_WEEK",dayofweek("time"))\
.withColumn("DAYNAME",date_format("time","EEEE"))\
    .withColumn("MONTH_NAME",date_format("time","LLLL"))
df3.show()

+-------------------+----------+-------------------+------+-------+---+------+--------+---------+------------+-----------+---------+----------+
|            inp_col|    date_p|               time|Year_P|Month_P|day|hour_p|Minute_P|Quarter_P|week-of-year|DAY_OF_WEEK|  DAYNAME|MONTH_NAME|
+-------------------+----------+-------------------+------+-------+---+------+--------+---------+------------+-----------+---------+----------+
|2022/03/31 01:55 AM|2022-03-31|2022-03-31 01:55:00|  2022|      3| 31|     1|      55|        1|          13|          5| Thursday|     March|
|2022/03/30 01:15 AM|2022-03-30|2022-03-30 01:15:00|  2022|      3| 30|     1|      15|        1|          13|          4|Wednesday|     March|
|2022/03/29 02:15 PM|2022-03-29|2022-03-29 14:15:00|  2022|      3| 29|    14|      15|        1|          13|          3|  Tuesday|     March|
|2022/04/01 04:15 PM|2022-04-01|2022-04-01 16:15:00|  2022|      4|  1|    16|      15|        2|          13|          6|   Friday|    

In [0]:

#Get Todays date
df4=df3.withColumn("cur_date",current_date()) 

#Get Date difference
df4=df4.withColumn("datedif",datediff("date_p","cur_date"))

#Add N days to date
df4=df3.withColumn("dateadd",date_add("date_p",5))

#Add N month to date
df4=df3.withColumn("monthadd",add_months("date_p",2))

# Add 2 quarters (6 months) to a date
df4=df3.withColumn("quartedadd",add_months("date_p",2*3))

#Subtract N days to date
df4=df3.withColumn("datesub",date_sub("date_p",5))
# Subtract 3 years from a date
df4=df3.withColumn("Substrsct_Year",date_add("date_p",-365*3))

#date truncate
df4=df3.withColumn("datetrnc",date_trunc('mm',"time"))

#date_trunc to year
# Truncate timestamp to year
df4=df3.withColumn("datetrunc_year",date_trunc("year","date_p"))

#truncate to week,quarter
df4=df3.withColumn("datetrunc_quarter",date_trunc("quarter","date_p"))

#truncate to week,quarter
df4=df3.withColumn("datetrunc_week",date_trunc("week","time"))

   
df4.show(truncate=0)

+-------------------+----------+-------------------+------+-------+---+------+--------+---------+------------+-----------+---------+----------+-------------------+
|inp_col            |date_p    |time               |Year_P|Month_P|day|hour_p|Minute_P|Quarter_P|week-of-year|DAY_OF_WEEK|DAYNAME  |MONTH_NAME|datetrunc_week     |
+-------------------+----------+-------------------+------+-------+---+------+--------+---------+------------+-----------+---------+----------+-------------------+
|2022/03/31 01:55 AM|2022-03-31|2022-03-31 01:55:00|2022  |3      |31 |1     |55      |1        |13          |5          |Thursday |March     |2022-03-28 00:00:00|
|2022/03/30 01:15 AM|2022-03-30|2022-03-30 01:15:00|2022  |3      |30 |1     |15      |1        |13          |4          |Wednesday|March     |2022-03-28 00:00:00|
|2022/03/29 02:15 PM|2022-03-29|2022-03-29 14:15:00|2022  |3      |29 |14    |15      |1        |13          |3          |Tuesday  |March     |2022-03-28 00:00:00|
|2022/04/01 04:1

In [0]:
df4.printSchema()

root
 |-- inp_col: string (nullable = true)
 |-- date_p: date (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- Year_P: integer (nullable = true)
 |-- Month_P: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour_p: integer (nullable = true)
 |-- Minute_P: integer (nullable = true)
 |-- Quarter_P: integer (nullable = true)
 |-- week-of-year: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- DAYNAME: string (nullable = true)
 |-- MONTH_NAME: string (nullable = true)
 |-- datetrunc_week: timestamp (nullable = true)



In [0]:
# Import pandas module
import pandas as pd

# Set the maximum number of columns to display
pd.set_option('display.max_columns', None)


In [0]:
data_unix=[
    ("R20165",20230111,1),
    ("R20166",20231012,1),
    ("R20167",20230413,1),
    ("R20168",20230314,1),
    ("R20169",20230215,1)]
schema1=["RechargeId","RechargeDate","Validity"]
df_unix=spark.createDataFrame(data_unix,schema1)
display(df_unix)
display(df_unix.printSchema())

RechargeId,RechargeDate,Validity
R20165,20230111,1
R20166,20231012,1
R20167,20230413,1
R20168,20230314,1
R20169,20230215,1


root
 |-- RechargeId: string (nullable = true)
 |-- RechargeDate: long (nullable = true)
 |-- Validity: long (nullable = true)



In [0]:
var_hold=to_date(col("RechargeDate").cast("string"),"yyyyMMdd")
df_unix=df_unix.withColumn("Recharge_Date_Unix",var_hold)
df_unix=df_unix.select("RechargeId","RechargeDate","Recharge_Date_Unix",col("Validity").cast("integer"))

from pyspark.sql.functions import col

# Assuming 'column_name' is the name of the column you want to convert
#df = df.withColumn("column_name", col("column_name").cast("string"))
## Assuming 'column_name' is the name of the column you want to convert
#df = df.withColumn("column_name", col("column_name").cast("long"))





In [0]:
df_unix=df_unix.withColumn("Validity_end",date_add("Recharge_Date_Unix","Validity"))

In [0]:
df_unix.printSchema()

root
 |-- RechargeId: string (nullable = true)
 |-- RechargeDate: long (nullable = true)
 |-- Recharge_Date_Unix: date (nullable = true)
 |-- Validity: integer (nullable = true)
 |-- Validity_end: date (nullable = true)



In [0]:
df_unix.show()

+----------+------------+------------------+--------+------------+
|RechargeId|RechargeDate|Recharge_Date_Unix|Validity|Validity_end|
+----------+------------+------------------+--------+------------+
|    R20165|    20230111|        2023-01-11|       1|  2023-01-12|
|    R20166|    20231012|        2023-10-12|       1|  2023-10-13|
|    R20167|    20230413|        2023-04-13|       1|  2023-04-14|
|    R20168|    20230314|        2023-03-14|       1|  2023-03-15|
|    R20169|    20230215|        2023-02-15|       1|  2023-02-16|
+----------+------------+------------------+--------+------------+

