This notebook generates a report that finds the latest file delivered for each file type in each reporting month.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window as W
import pyspark.sql.functions as F

In [2]:
# Reuse the active Spark session if one exists; otherwise create a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [3]:
# Hard-coded sample rows used as the notebook's input.
# Each tuple follows the field order declared in `columns`.
columns = ["AR1 Date", "filename", "file_type", "date_from_filename"]

data = [
    ("2024-01-15", "ab_borrower_3dfk761.csv", "borrower", "2024-01-31"),
    ("2024-01-31", "ab_borrower_3m12m.csv", "borrower", "2024-01-31"),
    ("2024-01-31", "ab_borrower_45o1ka.csv", "borrower", "2024-02-05"),
    ("2024-01-31", "ab_pipeline_3m12m.csv", "pipeline", "2024-02-03"),
    ("2024-02-29", "ab_borrower_1m12m.csv", "borrower", "2024-03-02"),
    ("2024-02-29", "ab_borrower_ga5o1ka.csv", "borrower", "2024-03-12"),
]

# All four columns are plain strings at this point.
month_end_report = spark.createDataFrame(data, schema=columns)
month_end_report.show(truncate=False)

+----------+-----------------------+---------+------------------+
|AR1 Date  |filename               |file_type|date_from_filename|
+----------+-----------------------+---------+------------------+
|2024-01-15|ab_borrower_3dfk761.csv|borrower |2024-01-31        |
|2024-01-31|ab_borrower_3m12m.csv  |borrower |2024-01-31        |
|2024-01-31|ab_borrower_45o1ka.csv |borrower |2024-02-05        |
|2024-01-31|ab_pipeline_3m12m.csv  |pipeline |2024-02-03        |
|2024-02-29|ab_borrower_1m12m.csv  |borrower |2024-03-02        |
|2024-02-29|ab_borrower_ga5o1ka.csv|borrower |2024-03-12        |
+----------+-----------------------+---------+------------------+



In [13]:
# Parse both date columns from "yyyy-MM-dd" strings into dates so that
# date functions and date ordering can be applied to them.
iso_date_pattern = "yyyy-MM-dd"
month_end_report = (
    month_end_report
    .withColumn("AR1 Date", F.to_date(F.col("AR1 Date"), iso_date_pattern))
    .withColumn(
        "date_from_filename",
        F.to_date(F.col("date_from_filename"), iso_date_pattern),
    )
)

In [5]:
# For each month and file type we only care about rows whose AR1 Date is the
# last calendar day of its month; the most recent of those rows is picked
# in a later step.

# Attach the month-end date that corresponds to each row's AR1 Date.
last_day = month_end_report.withColumn(
    "last_day_of_month",
    F.last_day("AR1 Date"),
)

# Keep only the rows that actually fall on that month-end date.
is_month_end = F.col("AR1 Date") == F.col("last_day_of_month")
rows_last_day = last_day.where(is_month_end)
rows_last_day.show(truncate=False)

+----------+-----------------------+---------+------------------+-----------------+
|AR1 Date  |filename               |file_type|date_from_filename|last_day_of_month|
+----------+-----------------------+---------+------------------+-----------------+
|2024-01-31|ab_borrower_3m12m.csv  |borrower |2024-01-31        |2024-01-31       |
|2024-01-31|ab_borrower_45o1ka.csv |borrower |2024-02-05        |2024-01-31       |
|2024-01-31|ab_pipeline_3m12m.csv  |pipeline |2024-02-03        |2024-01-31       |
|2024-02-29|ab_borrower_1m12m.csv  |borrower |2024-03-02        |2024-02-29       |
|2024-02-29|ab_borrower_ga5o1ka.csv|borrower |2024-03-12        |2024-02-29       |
+----------+-----------------------+---------+------------------+-----------------+



In [6]:
# Window over each (AR1 Date, file_type) group, newest date_from_filename first.
# `filename` is a secondary sort key so that row_number() stays deterministic
# when two files in the same group share the same date_from_filename.
AR1 = (
    W
    .partitionBy("AR1 Date", "file_type")
    .orderBy(
        F.col("date_from_filename").desc(),
        F.col("filename").desc(),
    )
)

# Number the rows inside each group; sequence 1 is the newest file.
sorted_df = rows_last_day.withColumn("filename_arrival_seq", F.row_number().over(AR1))
sorted_df.show(truncate=False)

+----------+-----------------------+---------+------------------+-----------------+--------------------+
|AR1 Date  |filename               |file_type|date_from_filename|last_day_of_month|filename_arrival_seq|
+----------+-----------------------+---------+------------------+-----------------+--------------------+
|2024-01-31|ab_borrower_45o1ka.csv |borrower |2024-02-05        |2024-01-31       |1                   |
|2024-01-31|ab_borrower_3m12m.csv  |borrower |2024-01-31        |2024-01-31       |2                   |
|2024-01-31|ab_pipeline_3m12m.csv  |pipeline |2024-02-03        |2024-01-31       |1                   |
|2024-02-29|ab_borrower_ga5o1ka.csv|borrower |2024-03-12        |2024-02-29       |1                   |
|2024-02-29|ab_borrower_1m12m.csv  |borrower |2024-03-02        |2024-02-29       |2                   |
+----------+-----------------------+---------+------------------+-----------------+--------------------+



In [7]:
# Keep only the row ranked first within each window partition.
is_first_in_group = F.col("filename_arrival_seq") == 1
current_state = sorted_df.filter(is_first_in_group)
current_state.show()

+----------+--------------------+---------+------------------+-----------------+--------------------+
|  AR1 Date|            filename|file_type|date_from_filename|last_day_of_month|filename_arrival_seq|
+----------+--------------------+---------+------------------+-----------------+--------------------+
|2024-01-31|ab_borrower_45o1k...| borrower|        2024-02-05|       2024-01-31|                   1|
|2024-01-31|ab_pipeline_3m12m...| pipeline|        2024-02-03|       2024-01-31|                   1|
|2024-02-29|ab_borrower_ga5o1...| borrower|        2024-03-12|       2024-02-29|                   1|
+----------+--------------------+---------+------------------+-----------------+--------------------+



In [24]:
from datetime import datetime, timedelta
import dateutil.relativedelta

# The reference month is shifted back from the current date by this many months
# (presumably so that it lines up with the sample data — confirm before reuse).
REFERENCE_OFFSET_MONTHS = 5
# Number of consecutive months, newest first, that the report must cover.
MONTHS_TO_REPORT = 3

today = datetime.now() - dateutil.relativedelta.relativedelta(months=REFERENCE_OFFSET_MONTHS)
fmt = "%Y-%m"

# Newest month first: the reference month itself, then each preceding month.
expected_dates = [
    (today - dateutil.relativedelta.relativedelta(months=months_back)).strftime(fmt)
    for months_back in range(MONTHS_TO_REPORT)
]
# File types that are expected to be delivered for every month.
expected_file_types = ["borrower", "pipeline"]

In [25]:

# Every (month, file type) combination that should have a delivery.
# Months vary slowest, so the pairs come out grouped by month.
expected_data = [
    (month, kind)
    for month in expected_dates
    for kind in expected_file_types
]
expected_data

[('2024-02', 'borrower'),
 ('2024-02', 'pipeline'),
 ('2024-01', 'borrower'),
 ('2024-01', 'pipeline'),
 ('2023-12', 'borrower'),
 ('2023-12', 'pipeline')]

In [26]:
# Turn the expected (month, file type) pairs into a two-column DataFrame.
columns = ["Relevant Month", "file_type"]
expected = spark.createDataFrame(expected_data, columns)
expected.show()

+--------------+---------+
|Relevant Month|file_type|
+--------------+---------+
|       2024-02| borrower|
|       2024-02| pipeline|
|       2024-01| borrower|
|       2024-01| pipeline|
|       2023-12| borrower|
|       2023-12| pipeline|
+--------------+---------+



In [27]:
# Rename "AR1 Date" to "Relevant Month" and reduce it to a "yyyy-MM" string.
current_state = (
    current_state
    .withColumnRenamed("AR1 Date", "Relevant Month")
    .withColumn(
        "Relevant Month",
        F.date_format(F.col("Relevant Month"), "yyyy-MM"),
    )
)
current_state.show()

+--------------+--------------------+---------+------------------+-----------------+--------------------+
|Relevant Month|            filename|file_type|date_from_filename|last_day_of_month|filename_arrival_seq|
+--------------+--------------------+---------+------------------+-----------------+--------------------+
|       2024-01|ab_borrower_45o1k...| borrower|        2024-02-05|       2024-01-31|                   1|
|       2024-01|ab_pipeline_3m12m...| pipeline|        2024-02-03|       2024-01-31|                   1|
|       2024-02|ab_borrower_ga5o1...| borrower|        2024-03-12|       2024-02-29|                   1|
+--------------+--------------------+---------+------------------+-----------------+--------------------+



In [28]:
# Left join from the expected combinations so that month/file-type pairs with
# no matching row in current_state still appear, with NULLs in the other columns.
join_keys = ["file_type", "Relevant Month"]
final = expected.join(current_state, on=join_keys, how="left")

# Sort on both keys: sorting by month alone leaves the order of the file types
# within a month undefined.
final = final.sort("Relevant Month", "file_type")
final.show()

+---------+--------------+--------------------+------------------+-----------------+--------------------+
|file_type|Relevant Month|            filename|date_from_filename|last_day_of_month|filename_arrival_seq|
+---------+--------------+--------------------+------------------+-----------------+--------------------+
| borrower|       2023-12|                NULL|              NULL|             NULL|                NULL|
| pipeline|       2023-12|                NULL|              NULL|             NULL|                NULL|
| borrower|       2024-01|ab_borrower_45o1k...|        2024-02-05|       2024-01-31|                   1|
| pipeline|       2024-01|ab_pipeline_3m12m...|        2024-02-03|       2024-01-31|                   1|
| borrower|       2024-02|ab_borrower_ga5o1...|        2024-03-12|       2024-02-29|                   1|
| pipeline|       2024-02|                NULL|              NULL|             NULL|                NULL|
+---------+--------------+--------------------