The goal of this task is to rank the borrowers on each mortgage by age (oldest first, i.e. earliest date of birth) and then build a wide-format dataset in which each mortgage occupies a single row and lists its borrowers in that order.

In [19]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Obtain the Spark session for this notebook (getOrCreate reuses an active one).
spark = SparkSession.builder.appName("Mortgage Analysis").getOrCreate()

# Sample input: one tuple per borrower, in the same order as the column
# names passed to createDataFrame below.
data = [
    (1, 2, "AAA", "1900-01-01"),
    (1, 1, "BBB", "1900-05-01"),
    (2, 3, "DDD", "1950-10-20"),
]

# Column types are inferred from the tuples; the DOB values are plain strings.
borrower_dob = spark.createDataFrame(
    data,
    schema=["Mortgage ID", "Borrower ID", "Borrower Name", "Borrower DOB"],
)
borrower_dob.show()


+-----------+-----------+-------------+------------+
|Mortgage ID|Borrower ID|Borrower Name|Borrower DOB|
+-----------+-----------+-------------+------------+
|          1|          2|          AAA|  1900-01-01|
|          1|          1|          BBB|  1900-05-01|
|          2|          3|          DDD|  1950-10-20|
+-----------+-----------+-------------+------------+



In [20]:
# Number the borrowers of each mortgage from oldest to youngest.
#
# "Borrower DOB" is stored as a zero-padded yyyy-MM-dd string in the sample
# data, so ascending string order is also chronological order and the
# earliest DOB (the oldest borrower) receives sequence 1.
#
# "Borrower ID" is a secondary sort key: row_number() assigns an arbitrary
# order to rows that tie on the window ordering, so without a tie-breaker two
# borrowers sharing a DOB could swap sequence numbers between runs.
borrower_dob_order_window = (
    Window.partitionBy("Mortgage ID")
    .orderBy("Borrower DOB", "Borrower ID")
)
borrower_dob = borrower_dob.withColumn(
    "Borrower sequence",
    F.row_number().over(borrower_dob_order_window),
)
borrower_dob.show()

+-----------+-----------+-------------+------------+-----------------+
|Mortgage ID|Borrower ID|Borrower Name|Borrower DOB|Borrower sequence|
+-----------+-----------+-------------+------------+-----------------+
|          1|          2|          AAA|  1900-01-01|                1|
|          1|          1|          BBB|  1900-05-01|                2|
|          2|          3|          DDD|  1950-10-20|                1|
+-----------+-----------+-------------+------------+-----------------+



In [21]:
# Rows ranked first within their mortgage; the borrower columns get a
# "First" prefix so they stay distinct from the second borrower's columns.
group1 = borrower_dob.filter(F.col("Borrower sequence") == 1)
first_borrower_df = group1.withColumnRenamed(
    "Borrower ID", "First Borrower ID"
).withColumnRenamed(
    "Borrower Name", "First Borrower Name"
)

# Rows ranked second within their mortgage, renamed with a "Second" prefix.
group2 = borrower_dob.filter(F.col("Borrower sequence") == 2)
secnd_borrower_df = group2.withColumnRenamed(
    "Borrower ID", "Second Borrower ID"
).withColumnRenamed(
    "Borrower Name", "Second Borrower Name"
)

first_borrower_df.show()
secnd_borrower_df.show()




+-----------+-----------------+-------------------+------------+-----------------+
|Mortgage ID|First Borrower ID|First Borrower Name|Borrower DOB|Borrower sequence|
+-----------+-----------------+-------------------+------------+-----------------+
|          1|                2|                AAA|  1900-01-01|                1|
|          2|                3|                DDD|  1950-10-20|                1|
+-----------+-----------------+-------------------+------------+-----------------+

+-----------+------------------+--------------------+------------+-----------------+
|Mortgage ID|Second Borrower ID|Second Borrower Name|Borrower DOB|Borrower sequence|
+-----------+------------------+--------------------+------------+-----------------+
|          1|                 1|                 BBB|  1900-05-01|                2|
+-----------+------------------+--------------------+------------+-----------------+



In [22]:
# Build one row per mortgage. The left join keeps every row of
# first_borrower_df; where no second borrower matches, the "Second ..."
# columns are null. The select keeps only the ID/name columns and drops the
# DOB and sequence columns that both inputs carry.
joined = first_borrower_df.join(
    secnd_borrower_df,
    on="Mortgage ID",
    how="left",
).select(
    "Mortgage ID",
    "First Borrower ID",
    "First Borrower Name",
    "Second Borrower ID",
    "Second Borrower Name",
)
joined.show()


+-----------+-----------------+-------------------+------------------+--------------------+
|Mortgage ID|First Borrower ID|First Borrower Name|Second Borrower ID|Second Borrower Name|
+-----------+-----------------+-------------------+------------------+--------------------+
|          1|                2|                AAA|                 1|                 BBB|
|          2|                3|                DDD|              NULL|                NULL|
+-----------+-----------------+-------------------+------------------+--------------------+

