## Cleaning loans_repayments Data

In [0]:
loans_repay_schema = 'loan_id string, total_principal_received float, total_interest_received float, total_late_fee_received float, total_payment_received float, last_payment_amount float, last_payment_date string, next_payment_date string'

In [0]:
loans_repay_raw_df = spark.read \
.format("csv") \
.option("header",True) \
.schema(loans_repay_schema) \
.load("/mnt/Lendingclub/Lendingclub/Lendingclub/raw/loans_repayments_csv")

In [0]:
display(loans_repay_raw_df.limit(10))

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
68407277,3600.0,821.72,0.0,4421.724,122.67,Jan-2019,
68355089,24700.0,979.66,0.0,25679.66,926.35,Jun-2016,
68341763,20000.0,2705.92,0.0,22705.924,15813.3,Jun-2017,
66310712,19102.35,12361.66,0.0,31464.01,829.9,Feb-2019,Apr-2019
68476807,10400.0,1340.5,0.0,11740.5,10128.96,Jul-2016,
68426831,11950.0,1758.95,0.0,13708.948,7653.56,May-2017,
68476668,20000.0,1393.8,0.0,21393.8,15681.05,Nov-2016,
67275481,20000.0,1538.51,0.0,21538.51,14618.23,Jan-2017,
68466926,10000.0,998.97,0.0,10998.972,1814.48,Aug-2018,
68616873,8000.0,939.58,0.0,8939.58,4996.24,Apr-2017,


In [0]:
loans_repay_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)



In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
loans_repay_df_ingestd = loans_repay_raw_df.withColumn("ingest_date", current_timestamp())

In [0]:
display(loans_repay_df_ingestd.limit(10))

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
68407277,3600.0,821.72,0.0,4421.724,122.67,Jan-2019,,2025-08-15T04:58:17.405541Z
68355089,24700.0,979.66,0.0,25679.66,926.35,Jun-2016,,2025-08-15T04:58:17.405541Z
68341763,20000.0,2705.92,0.0,22705.924,15813.3,Jun-2017,,2025-08-15T04:58:17.405541Z
66310712,19102.35,12361.66,0.0,31464.01,829.9,Feb-2019,Apr-2019,2025-08-15T04:58:17.405541Z
68476807,10400.0,1340.5,0.0,11740.5,10128.96,Jul-2016,,2025-08-15T04:58:17.405541Z
68426831,11950.0,1758.95,0.0,13708.948,7653.56,May-2017,,2025-08-15T04:58:17.405541Z
68476668,20000.0,1393.8,0.0,21393.8,15681.05,Nov-2016,,2025-08-15T04:58:17.405541Z
67275481,20000.0,1538.51,0.0,21538.51,14618.23,Jan-2017,,2025-08-15T04:58:17.405541Z
68466926,10000.0,998.97,0.0,10998.972,1814.48,Aug-2018,,2025-08-15T04:58:17.405541Z
68616873,8000.0,939.58,0.0,8939.58,4996.24,Apr-2017,,2025-08-15T04:58:17.405541Z


In [0]:
loans_repay_df_ingestd.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [0]:
loans_repay_df_ingestd.count()

2260701

In [0]:
loans_repay_df_ingestd.createOrReplaceTempView("loan_repayments")

In [0]:
display(spark.sql("select count(*) from loan_repayments where total_principal_received is null"))

count(1)
69


In [0]:
columns_to_check = ["total_principal_received", "total_interest_received", "total_late_fee_received", "total_payment_received", "last_payment_amount"]

In [0]:
loans_repay_filtered_df = loans_repay_df_ingestd.na.drop(subset=columns_to_check)

In [0]:
loans_repay_filtered_df.count()

2260498

In [0]:
loans_repay_filtered_df.createOrReplaceTempView("loan_repayments")

In [0]:
display(spark.sql("select count(*) from loan_repayments where total_payment_received = 0.0"))

count(1)
995


In [0]:
display(spark.sql("select count(*) from loan_repayments where total_payment_received = 0.0 and total_principal_received != 0.0"))

count(1)
46


In [0]:
display(spark.sql("select * from loan_repayments where total_payment_received = 0.0 and total_principal_received != 0.0").limit(10))

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
1064185,11600.98,11600.98,10000.0,0.0,0.0,0.0,Dec-2014,2025-08-15T04:59:31.958732Z
516382,21890.229,21856.03,16000.0,0.0,0.0,0.0,Mar-2014,2025-08-15T04:59:31.958732Z
528899,3045.0364,3019.64,2500.0,0.0,0.0,0.0,Jan-2013,2025-08-15T04:59:31.958732Z
527598,2398.9092,2220.51,2200.0,0.0,0.0,0.0,Jul-2011,2025-08-15T04:59:31.958732Z
525697,21797.86,19894.9,15750.0,0.0,0.0,0.0,Jun-2015,2025-08-15T04:59:31.958732Z
522641,3146.8193,3146.82,3000.0,0.0,0.0,0.0,Sep-2011,2025-08-15T04:59:31.958732Z
515655,29938.576,29905.75,22800.0,0.0,0.0,0.0,May-2013,2025-08-15T04:59:31.958732Z
501234,15219.313,15155.9,12000.0,0.0,0.0,0.0,May-2013,2025-08-15T04:59:31.958732Z
498194,11642.714,11031.47,10000.0,0.0,0.0,0.0,Jan-2013,2025-08-15T04:59:31.958732Z
495171,11138.843,10024.96,10000.0,0.0,0.0,0.0,Apr-2013,2025-08-15T04:59:31.958732Z


In [0]:
from pyspark.sql.functions import when, col

In [0]:
loans_payments_fixed_df = loans_repay_filtered_df.withColumn(
   "total_payment_received",
    when(
        (col("total_principal_received") != 0.0) &
        (col("total_payment_received") == 0.0),
        col("total_principal_received") + col("total_interest_received") + col("total_late_fee_received")
    ).otherwise(col("total_payment_received"))
)

In [0]:
display(loans_payments_fixed_df.limit(10))

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
68407277,3600.0,821.72,0.0,4421.724,122.67,Jan-2019,,2025-08-15T04:59:57.659324Z
68355089,24700.0,979.66,0.0,25679.66,926.35,Jun-2016,,2025-08-15T04:59:57.659324Z
68341763,20000.0,2705.92,0.0,22705.924,15813.3,Jun-2017,,2025-08-15T04:59:57.659324Z
66310712,19102.35,12361.66,0.0,31464.01,829.9,Feb-2019,Apr-2019,2025-08-15T04:59:57.659324Z
68476807,10400.0,1340.5,0.0,11740.5,10128.96,Jul-2016,,2025-08-15T04:59:57.659324Z
68426831,11950.0,1758.95,0.0,13708.948,7653.56,May-2017,,2025-08-15T04:59:57.659324Z
68476668,20000.0,1393.8,0.0,21393.8,15681.05,Nov-2016,,2025-08-15T04:59:57.659324Z
67275481,20000.0,1538.51,0.0,21538.51,14618.23,Jan-2017,,2025-08-15T04:59:57.659324Z
68466926,10000.0,998.97,0.0,10998.972,1814.48,Aug-2018,,2025-08-15T04:59:57.659324Z
68616873,8000.0,939.58,0.0,8939.58,4996.24,Apr-2017,,2025-08-15T04:59:57.659324Z


In [0]:
display(loans_payments_fixed_df.filter("loan_id == '1064185'"))

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
1064185,11600.98,11600.98,10000.0,33201.96,0.0,0.0,Dec-2014,2025-08-15T05:00:07.319741Z


In [0]:
loans_payments_fixed2_df = loans_payments_fixed_df.filter("total_payment_received != 0.0")

In [0]:
display(loans_payments_fixed2_df.limit(10))

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
68407277,3600.0,821.72,0.0,4421.724,122.67,Jan-2019,,2025-08-15T05:00:24.600762Z
68355089,24700.0,979.66,0.0,25679.66,926.35,Jun-2016,,2025-08-15T05:00:24.600762Z
68341763,20000.0,2705.92,0.0,22705.924,15813.3,Jun-2017,,2025-08-15T05:00:24.600762Z
66310712,19102.35,12361.66,0.0,31464.01,829.9,Feb-2019,Apr-2019,2025-08-15T05:00:24.600762Z
68476807,10400.0,1340.5,0.0,11740.5,10128.96,Jul-2016,,2025-08-15T05:00:24.600762Z
68426831,11950.0,1758.95,0.0,13708.948,7653.56,May-2017,,2025-08-15T05:00:24.600762Z
68476668,20000.0,1393.8,0.0,21393.8,15681.05,Nov-2016,,2025-08-15T05:00:24.600762Z
67275481,20000.0,1538.51,0.0,21538.51,14618.23,Jan-2017,,2025-08-15T05:00:24.600762Z
68466926,10000.0,998.97,0.0,10998.972,1814.48,Aug-2018,,2025-08-15T05:00:24.600762Z
68616873,8000.0,939.58,0.0,8939.58,4996.24,Apr-2017,,2025-08-15T05:00:24.600762Z


In [0]:
loans_payments_fixed2_df.filter("last_payment_date = '0.0'").count()

48

In [0]:
loans_payments_fixed2_df.filter("next_payment_date = '0.0'").count()

24

In [0]:
loans_payments_fixed2_df.filter("last_payment_date is null").count()

1477

In [0]:
loans_payments_fixed2_df.filter("next_payment_date is null").count()

1344240

In [0]:
loans_payments_ldate_fixed_df = loans_payments_fixed2_df.withColumn(
  "last_payment_date",
   when(
       (col("last_payment_date") == '0.0'),
       None
       ).otherwise(col("last_payment_date"))
)

In [0]:
loans_payments_ndate_fixed_df = loans_payments_ldate_fixed_df.withColumn(
  "last_payment_date",
   when(
       (col("next_payment_date") == '0.0'),
       None
       ).otherwise(col("next_payment_date"))
)

In [0]:
loans_payments_ndate_fixed_df.filter("last_payment_date = '0.0'").count()

0

In [0]:
loans_payments_ndate_fixed_df.filter("next_payment_date = '0.0'").count()

24

In [0]:
loans_payments_ndate_fixed_df.write \
.format("parquet") \
.mode("overwrite") \
.option("path", "/mnt/Lendingclub/Lendingclub/Lendingclub/cleaned/loans_repayments_parquet") \
.save()