#### 1. Create a DataFrame with proper data types and column names

In [0]:
# DDL-style schema string for the raw loans CSV: one "name type" pair per
# column, listed in file order. Adjacent literals are concatenated by Python
# into a single comma-separated string.
loans_schema = (
    'loan_id string, '
    'member_id string, '
    'loan_amount float, '
    'funded_amount float, '
    'loan_term_months string, '
    'interest_rate float, '
    'monthly_installment float, '
    'issue_date string, '
    'loan_status string, '
    'loan_purpose string, '
    'loan_title string'
)

In [0]:
# Read the raw loans CSV, treating its first line as a header row and applying
# the explicit loans_schema to the columns.
loans_raw_df = (
    spark.read
    .format("csv")
    .schema(loans_schema)
    .option("header", True)
    .load("/mnt/Lendingclub/Lendingclub/Lendingclub/raw/loans_data_csv")
)

In [0]:
# Preview the first 10 rows of the raw loans DataFrame.
display(loans_raw_df.limit(10))

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
68407277,6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,3600.0,3600.0,36 months,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation
68355089,b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,24700.0,24700.0,36 months,11.99,820.28,Dec-2015,Fully Paid,small_business,Business
68341763,91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,20000.0,20000.0,60 months,10.78,432.66,Dec-2015,Fully Paid,home_improvement,
66310712,cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,35000.0,35000.0,60 months,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation
68476807,f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,10400.0,10400.0,60 months,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase
68426831,8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,11950.0,11950.0,36 months,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation
68476668,538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,20000.0,20000.0,36 months,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation
67275481,b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,20000.0,20000.0,36 months,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase
68466926,1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,10000.0,10000.0,36 months,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refinancing
68616873,cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,8000.0,8000.0,36 months,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refinancing


In [0]:
# Print the column names, types and nullability of the raw DataFrame to confirm
# that the explicit schema was applied.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [0]:
from pyspark.sql.functions import current_timestamp

#### 2. Insert a new column `ingest_date` holding the ingestion timestamp (current time)

In [0]:
# Append an "ingest_date" column populated with current_timestamp() to the raw
# loans data.
loans_df_ingestd = loans_raw_df.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [0]:
# Preview the first 10 rows, now including the ingest_date column.
display(loans_df_ingestd.limit(10))

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
68407277,6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,3600.0,3600.0,36 months,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:47:40.089807Z
68355089,b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,24700.0,24700.0,36 months,11.99,820.28,Dec-2015,Fully Paid,small_business,Business,2025-08-15T04:47:40.089807Z
68341763,91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,20000.0,20000.0,60 months,10.78,432.66,Dec-2015,Fully Paid,home_improvement,,2025-08-15T04:47:40.089807Z
66310712,cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,35000.0,35000.0,60 months,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation,2025-08-15T04:47:40.089807Z
68476807,f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,10400.0,10400.0,60 months,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase,2025-08-15T04:47:40.089807Z
68426831,8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,11950.0,11950.0,36 months,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:47:40.089807Z
68476668,538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,20000.0,20000.0,36 months,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:47:40.089807Z
67275481,b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,20000.0,20000.0,36 months,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase,2025-08-15T04:47:40.089807Z
68466926,1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,10000.0,10000.0,36 months,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refinancing,2025-08-15T04:47:40.089807Z
68616873,cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,8000.0,8000.0,36 months,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refinancing,2025-08-15T04:47:40.089807Z


In [0]:
# Register the ingested DataFrame as the temp view "loans" so it can be queried
# with spark.sql. NOTE: later cells re-register other DataFrames under the same
# view name, so SQL results depend on which registration ran most recently.
loans_df_ingestd.createOrReplaceTempView("loans")

In [0]:
# Total number of rows behind the current "loans" temp view.
display(
    spark.sql("select count(*) from loans")
)

count(1)
2260701


In [0]:
# Inspect the rows whose loan_amount is null in the current "loans" temp view.
display(
    spark.sql("select * from loans where loan_amount is null")
)

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Total amount funded in policy code 1: 6417608175,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 2: 1944088810,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 1: 1741781700,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 2: 564202131,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 1: 1791201400,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 2: 651669342,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 1: 1443412975,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 2: 511988838,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 1: 2063142975,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z
Total amount funded in policy code 2: 823319310,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,,,,,,,,,,2025-08-14T22:09:33.890283Z


#### 3. Drop the rows that have null values in the listed columns

In [0]:
# Columns that must all be non-null for a loan row to be kept.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "interest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

In [0]:
# Discard every row that has a null in any of the columns_to_check columns.
loans_filtered_df = loans_df_ingestd.dropna(
    subset=columns_to_check,
)

In [0]:
# Row count after the null filter; as the last expression in the cell, the
# value is shown as the cell output.
loans_filtered_df.count()

2260667

In [0]:
# Re-register the "loans" temp view so it now points at the null-filtered
# DataFrame (this replaces the earlier registration of loans_df_ingestd).
loans_filtered_df.createOrReplaceTempView("loans")

In [0]:
# Preview the first 10 rows of the null-filtered DataFrame.
display(loans_filtered_df.limit(10))

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
68407277,6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,3600.0,3600.0,36 months,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:48:24.371149Z
68355089,b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,24700.0,24700.0,36 months,11.99,820.28,Dec-2015,Fully Paid,small_business,Business,2025-08-15T04:48:24.371149Z
68341763,91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,20000.0,20000.0,60 months,10.78,432.66,Dec-2015,Fully Paid,home_improvement,,2025-08-15T04:48:24.371149Z
66310712,cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,35000.0,35000.0,60 months,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation,2025-08-15T04:48:24.371149Z
68476807,f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,10400.0,10400.0,60 months,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase,2025-08-15T04:48:24.371149Z
68426831,8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,11950.0,11950.0,36 months,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:48:24.371149Z
68476668,538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,20000.0,20000.0,36 months,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:48:24.371149Z
67275481,b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,20000.0,20000.0,36 months,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase,2025-08-15T04:48:24.371149Z
68466926,1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,10000.0,10000.0,36 months,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refinancing,2025-08-15T04:48:24.371149Z
68616873,cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,8000.0,8000.0,36 months,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refinancing,2025-08-15T04:48:24.371149Z


#### 4. Convert loan_term_months to an integer number of years (renamed to loan_term_years)

In [0]:
from pyspark.sql.functions import regexp_replace, col

In [0]:
# Turn the textual term (e.g. "36 months") into an integer number of years:
#   1. strip the literal " months" suffix,
#   2. try_cast the remainder to int (try_cast yields null instead of raising
#      when the value cannot be converted),
#   3. divide by 12 and try_cast the quotient back to int,
#   4. rename the column to loan_term_years to reflect the new unit.
loans_term_modified_df = (
    loans_filtered_df
    .withColumn(
        "loan_term_months",
        (
            regexp_replace(col("loan_term_months"), " months", "").try_cast("int")
            / 12
        ).try_cast("int"),
    )
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [0]:
# Preview the first 10 rows to check the new loan_term_years column.
display(loans_term_modified_df.limit(10))

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
68407277,6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,3600.0,3600.0,3,13.99,123.03,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:48:38.403742Z
68355089,b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,24700.0,24700.0,3,11.99,820.28,Dec-2015,Fully Paid,small_business,Business,2025-08-15T04:48:38.403742Z
68341763,91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,20000.0,20000.0,5,10.78,432.66,Dec-2015,Fully Paid,home_improvement,,2025-08-15T04:48:38.403742Z
66310712,cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,35000.0,35000.0,5,14.85,829.9,Dec-2015,Current,debt_consolidation,Debt consolidation,2025-08-15T04:48:38.403742Z
68476807,f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,10400.0,10400.0,5,22.45,289.91,Dec-2015,Fully Paid,major_purchase,Major purchase,2025-08-15T04:48:38.403742Z
68426831,8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,11950.0,11950.0,3,13.44,405.18,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:48:38.403742Z
68476668,538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,20000.0,20000.0,3,9.17,637.58,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,2025-08-15T04:48:38.403742Z
67275481,b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,20000.0,20000.0,3,8.49,631.26,Dec-2015,Fully Paid,major_purchase,Major purchase,2025-08-15T04:48:38.403742Z
68466926,1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,10000.0,10000.0,3,6.49,306.45,Dec-2015,Fully Paid,credit_card,Credit card refinancing,2025-08-15T04:48:38.403742Z
68616873,cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,8000.0,8000.0,3,11.48,263.74,Dec-2015,Fully Paid,credit_card,Credit card refinancing,2025-08-15T04:48:38.403742Z


In [0]:
# Print the schema to confirm loan_term_years replaced loan_term_months and to
# check its resulting type.
loans_term_modified_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



#### 5. Clean the loan_purpose column

In [0]:
# Re-register the "loans" temp view so it now points at the DataFrame carrying
# loan_term_years (this replaces the earlier registration under the same name).
loans_term_modified_df.createOrReplaceTempView("loans")

In [0]:
# List the distinct loan_purpose values in the "loans" view (at most 100 rows).
display(
    spark.sql("select distinct(loan_purpose) from loans")
    .limit(100)
)

loan_purpose
wedding
educational
other
small_business
debt_consolidation
credit_card
moving
vacation
renewable_energy
house


In [0]:
# Row count per loan_purpose, most frequent first (at most 100 rows), used to
# decide which purposes are worth keeping as their own category.
display(
    spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")
    .limit(100)
)

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [0]:
# Loan purposes kept as-is; any value outside this list is bucketed into
# "other" by the cleaning step that consumes this lookup.
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [0]:
from pyspark.sql.functions import when

In [0]:
# Normalise loan_purpose: keep the value when it appears in
# loan_purpose_lookup, otherwise replace it with the literal "other".
loans_purpose_modified = loans_term_modified_df.withColumn(
    "loan_purpose",
    when(
        col("loan_purpose").isin(*loan_purpose_lookup),
        col("loan_purpose"),
    ).otherwise("other"),
)

In [0]:
# Re-register the "loans" temp view so it now points at the DataFrame with the
# cleaned loan_purpose column (this replaces the earlier registration).
loans_purpose_modified.createOrReplaceTempView("loans")

In [0]:
# Row count per loan_purpose after cleaning, most frequent first (SQL version).
display(
    spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")
)

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [0]:
from pyspark.sql.functions import count

In [0]:
# Same per-purpose row counts as the SQL query, expressed with the DataFrame
# API: group by loan_purpose, count rows as "total", sort by total descending.
display(
    loans_purpose_modified
    .groupBy("loan_purpose")
    .agg(count("*").alias("total"))
    .orderBy("total", ascending=False)
)

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [0]:
# Persist the cleaned loans DataFrame as Parquet, overwriting any existing
# output at the target path.
(
    loans_purpose_modified.write
    .mode("overwrite")
    .format("parquet")
    .save("/mnt/Lendingclub/Lendingclub/Lendingclub/cleaned/loans_parquet")
)