In [1]:
import collections

from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_add, lit

In [2]:
# Sample nudge records, one tuple per row, in the column order given by `cols`.
# Test4's AS_OF_DT ('2021-29-01') is not a valid yyyy-MM-dd date; it is kept
# verbatim because the outputs further down depend on it.
df_list = [
    ('2021-01-01', 'Test1', '2021-01-01', 'B'),
    ('2021-04-01', 'Test2', '2021-03-31', 'B'),
    ('2021-03-01', 'Test3', '2021-02-28', 'F'),
    ('2021-02-01', 'Test4', '2021-29-01', 'N'),
    ('2021-01-01', 'Test5', '2021-01-01', 'F'),
    ('2021-02-01', 'Test6', '2021-01-31', 'B'),
]

# Column names applied to the tuples above.
cols = [
    'NUDGE_END',
    'NUDGE_ID',
    'AS_OF_DT',
    'NUDGE_TYPE',
]

In [3]:
# Obtain the Spark session for this notebook under the application name "DF-Ops".
spark = (
    SparkSession.builder
    .appName("DF-Ops")
    .getOrCreate()
)

In [4]:
# Build the base DataFrame from the sample tuples; only column names are
# supplied, so Spark infers the column types from the values.
data = spark.createDataFrame(data=df_list, schema=cols)

In [5]:
# Inspect the inferred schema. All four columns come through as strings, so
# the date columns need an explicit cast before any date comparison.
data.printSchema()

root
 |-- NUDGE_END: string (nullable = true)
 |-- NUDGE_ID: string (nullable = true)
 |-- AS_OF_DT: string (nullable = true)
 |-- NUDGE_TYPE: string (nullable = true)



In [6]:
# Cast both date columns from string to date, then keep only the rows whose
# NUDGE_END is exactly one day after AS_OF_DT.
#
# `date_add` is used instead of `col('AS_OF_DT') + 1` so the one-day offset is
# explicit and does not depend on Spark's implicit date + integer coercion.
# NOTE(review): Test4's AS_OF_DT ('2021-29-01') is not a valid date; it
# presumably casts to null under the default (non-ANSI) settings, which is
# consistent with Test4 being absent from the output -- confirm if ANSI mode
# is ever enabled.
df = (
    data
    .withColumn("NUDGE_END", col("NUDGE_END").cast("date"))
    .withColumn("AS_OF_DT", col("AS_OF_DT").cast("date"))
    .where(col("NUDGE_END") == date_add(col("AS_OF_DT"), 1))
)

In [7]:
# Display the rows that passed the "NUDGE_END is one day after AS_OF_DT" filter.
df.show()

+----------+--------+----------+----------+
| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE|
+----------+--------+----------+----------+
|2021-04-01|   Test2|2021-03-31|         B|
|2021-03-01|   Test3|2021-02-28|         F|
|2021-02-01|   Test6|2021-01-31|         B|
+----------+--------+----------+----------+



In [8]:
# Exclude nudges of type 'F' from the date-filtered rows.
final = df.filter(df['NUDGE_TYPE'] != 'F')
final.show()

+----------+--------+----------+----------+
| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE|
+----------+--------+----------+----------+
|2021-04-01|   Test2|2021-03-31|         B|
|2021-02-01|   Test6|2021-01-31|         B|
+----------+--------+----------+----------+



In [9]:
# Number of rows remaining once the type-'F' nudges have been excluded.
final.count()

2

In [10]:
# Narrow the remaining rows to a single AS_OF_DT value.
final1 = final.filter(final["AS_OF_DT"] == '2021-03-31')
final1.show()

+----------+--------+----------+----------+
| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE|
+----------+--------+----------+----------+
|2021-04-01|   Test2|2021-03-31|         B|
+----------+--------+----------+----------+



In [11]:
# Feedback records: the same four nudge fields plus a last-action timestamp.
# Every sample row carries the same timestamp, so it is written once here.
last_action_ts = '2021-01-01 11:00:00'
df1_list = [
    ('2021-01-01', 'Test1', '2021-01-01', 'B', last_action_ts),
    ('2021-04-01', 'Test3', '2021-03-31', 'B', last_action_ts),
    ('2021-03-01', 'Test4', '2021-02-28', 'F', last_action_ts),
    ('2021-02-01', 'Test1', '2021-29-01', 'N', last_action_ts),
    ('2021-01-01', 'Test4', '2021-01-01', 'F', last_action_ts),
    ('2021-02-01', 'Test6', '2021-01-31', 'B', last_action_ts),
]
cols1 = [
    'NUDGE_END',
    'NUDGE_ID',
    'AS_OF_DT',
    'NUDGE_TYPE',
    'LST_ACT_BY_TM',
]

fdbk = spark.createDataFrame(data=df1_list, schema=cols1)
fdbk.show()

+----------+--------+----------+----------+-------------------+
| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE|      LST_ACT_BY_TM|
+----------+--------+----------+----------+-------------------+
|2021-01-01|   Test1|2021-01-01|         B|2021-01-01 11:00:00|
|2021-04-01|   Test3|2021-03-31|         B|2021-01-01 11:00:00|
|2021-03-01|   Test4|2021-02-28|         F|2021-01-01 11:00:00|
|2021-02-01|   Test1|2021-29-01|         N|2021-01-01 11:00:00|
|2021-01-01|   Test4|2021-01-01|         F|2021-01-01 11:00:00|
|2021-02-01|   Test6|2021-01-31|         B|2021-01-01 11:00:00|
+----------+--------+----------+----------+-------------------+



In [12]:
# Derive a date-only column from the last-action timestamp string.
fdbk = fdbk.withColumn(
    "LST_ACT_BY_DT",
    fdbk["LST_ACT_BY_TM"].cast("date"),
)
fdbk.show()

+----------+--------+----------+----------+-------------------+-------------+
| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE|      LST_ACT_BY_TM|LST_ACT_BY_DT|
+----------+--------+----------+----------+-------------------+-------------+
|2021-01-01|   Test1|2021-01-01|         B|2021-01-01 11:00:00|   2021-01-01|
|2021-04-01|   Test3|2021-03-31|         B|2021-01-01 11:00:00|   2021-01-01|
|2021-03-01|   Test4|2021-02-28|         F|2021-01-01 11:00:00|   2021-01-01|
|2021-02-01|   Test1|2021-29-01|         N|2021-01-01 11:00:00|   2021-01-01|
|2021-01-01|   Test4|2021-01-01|         F|2021-01-01 11:00:00|   2021-01-01|
|2021-02-01|   Test6|2021-01-31|         B|2021-01-01 11:00:00|   2021-01-01|
+----------+--------+----------+----------+-------------------+-------------+



In [13]:
# Left-join the nudges to their feedback rows on NUDGE_ID. Nudges with no
# feedback keep nulls in every feedback column. Both inputs share column
# names, so NUDGE_END / NUDGE_ID / AS_OF_DT / NUDGE_TYPE each appear twice
# in the result.
result = data.join(
    fdbk,
    on=data.NUDGE_ID == fdbk.NUDGE_ID,
    how='left',
)
result.show()

+----------+--------+----------+----------+----------+--------+----------+----------+-------------------+-------------+
| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE|      LST_ACT_BY_TM|LST_ACT_BY_DT|
+----------+--------+----------+----------+----------+--------+----------+----------+-------------------+-------------+
|2021-04-01|   Test2|2021-03-31|         B|      null|    null|      null|      null|               null|         null|
|2021-02-01|   Test6|2021-01-31|         B|2021-02-01|   Test6|2021-01-31|         B|2021-01-01 11:00:00|   2021-01-01|
|2021-01-01|   Test5|2021-01-01|         F|      null|    null|      null|      null|               null|         null|
|2021-01-01|   Test1|2021-01-01|         B|2021-01-01|   Test1|2021-01-01|         B|2021-01-01 11:00:00|   2021-01-01|
|2021-01-01|   Test1|2021-01-01|         B|2021-02-01|   Test1|2021-29-01|         N|2021-01-01 11:00:00|   2021-01-01|
|2021-03-01|   Test3|2021-02-28|        

In [14]:
# Keep only the joined rows that have no LST_ACT_BY_DT value.
result1 = result.filter(result.LST_ACT_BY_DT.isNull())
result1.show()

+----------+--------+----------+----------+---------+--------+--------+----------+-------------+-------------+
| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE|NUDGE_END|NUDGE_ID|AS_OF_DT|NUDGE_TYPE|LST_ACT_BY_TM|LST_ACT_BY_DT|
+----------+--------+----------+----------+---------+--------+--------+----------+-------------+-------------+
|2021-04-01|   Test2|2021-03-31|         B|     null|    null|    null|      null|         null|         null|
|2021-01-01|   Test5|2021-01-01|         F|     null|    null|    null|      null|         null|         null|
+----------+--------+----------+----------+---------+--------+--------+----------+-------------+-------------+



In [15]:
# Append two constant-valued columns to every row of the base data:
# BUSINESSDATE (the string '2021-01-01') and COUNTRY ('SG').
datamodified = (
    data
    .withColumn('BUSINESSDATE', lit('2021-01-01'))
    .withColumn('COUNTRY', lit('SG'))
)
datamodified.show()

+----------+--------+----------+----------+------------+-------+
| NUDGE_END|NUDGE_ID|  AS_OF_DT|NUDGE_TYPE|BUSINESSDATE|COUNTRY|
+----------+--------+----------+----------+------------+-------+
|2021-01-01|   Test1|2021-01-01|         B|  2021-01-01|     SG|
|2021-04-01|   Test2|2021-03-31|         B|  2021-01-01|     SG|
|2021-03-01|   Test3|2021-02-28|         F|  2021-01-01|     SG|
|2021-02-01|   Test4|2021-29-01|         N|  2021-01-01|     SG|
|2021-01-01|   Test5|2021-01-01|         F|  2021-01-01|     SG|
|2021-02-01|   Test6|2021-01-31|         B|  2021-01-01|     SG|
+----------+--------+----------+----------+------------+-------+



In [16]:
# Columns to remove from `datamodified`.
columns = ['NUDGE_ID', 'BUSINESSDATE']

# Drop them all in one call, then report each removed name.
datamodified = datamodified.drop(*columns)
for column in columns:
    print("Dropped column " + column)

Dropped column NUDGE_ID
Dropped column BUSINESSDATE


In [17]:
# Confirm that NUDGE_ID and BUSINESSDATE are no longer present.
datamodified.show()

+----------+----------+----------+-------+
| NUDGE_END|  AS_OF_DT|NUDGE_TYPE|COUNTRY|
+----------+----------+----------+-------+
|2021-01-01|2021-01-01|         B|     SG|
|2021-04-01|2021-03-31|         B|     SG|
|2021-03-01|2021-02-28|         F|     SG|
|2021-02-01|2021-29-01|         N|     SG|
|2021-01-01|2021-01-01|         F|     SG|
|2021-02-01|2021-01-31|         B|     SG|
+----------+----------+----------+-------+

