In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the current OS user so the Hive warehouse path follows whoever runs
# the notebook instead of being tied to one hard-coded account.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# First pass over the raw CSV: treat the first line as a header and let Spark
# infer the column types so the raw layout can be inspected.
loans_raw_df = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("lending_club_project/raw_data/loans_data_csv")
)

In [3]:
# Preview the DataFrame that was loaded with an inferred schema.
loans_raw_df

loan_id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,issue_d,Loan_status,purpose,title
109653473,bd7db5222de2a6ce4...,25475.0,25475.0,36 months,14.08,871.67,May-2017,Late (31-120 days),debt_consolidation,Debt consolidation
109677516,c0428bbd04ba8ba06...,14400.0,14400.0,36 months,12.62,482.57,May-2017,Current,debt_consolidation,Debt consolidation
109910042,028e91969c534dca2...,4000.0,4000.0,36 months,7.07,123.64,May-2017,Fully Paid,credit_card,Credit card refin...
109767343,fbab3a4ab0e73d30f...,30000.0,30000.0,36 months,17.09,1070.93,May-2017,Current,other,Other
109858215,342558ad61cfea940...,11500.0,11500.0,36 months,15.05,398.94,May-2017,Current,other,Other
109740125,372109bc6ae1c2d16...,25100.0,25100.0,60 months,16.02,610.65,May-2017,Fully Paid,home_improvement,Home improvement
108257437,54e9694e0b82951ee...,4800.0,4800.0,36 months,7.21,148.68,May-2017,Fully Paid,credit_card,Credit card refin...
109898089,1802301a402b207ca...,11925.0,11925.0,60 months,17.09,296.95,May-2017,Current,debt_consolidation,Debt consolidation
109253226,ec80b7f0e7bc69119...,15000.0,15000.0,60 months,15.05,357.25,May-2017,Current,debt_consolidation,Debt consolidation
109848461,f3ee70b6e652c7c27...,6400.0,6400.0,36 months,10.42,207.78,May-2017,Fully Paid,debt_consolidation,Debt consolidation


In [4]:
# Print the column names and the types currently attached to loans_raw_df.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: double (nullable = true)
 |-- funded_amnt: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- Loan_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)



### Changing the schema: explicit column names and types

In [5]:
# Explicit DDL schema for the loans CSV: one "<column name> <type>" entry per
# column, joined into the comma-separated string that DataFrameReader.schema accepts.
schema = ", ".join([
    "loan_id string",
    "member_id string",
    "loan_amount float",
    "funded_amount float",
    "term string",
    "interest_rate float",
    "installment float",
    "issue_date string",
    "loan_status string",
    "loan_purpose string",
    "loan_title string",
])

In [6]:
# Re-read the raw CSV, this time enforcing the explicit schema (renamed columns,
# fixed types) instead of relying on type inference.
loans_raw_df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("lending_club_project/raw_data/loans_data_csv")
)

In [7]:
# Print the column names and types currently attached to loans_raw_df.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- term: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



### Add column ingest_date

In [8]:
from pyspark.sql.functions import *

In [9]:
# Stamp every row with the current timestamp in a new "ingest_date" column.
ingest_ts = current_timestamp()
loans_df_ingested = loans_raw_df.withColumn("ingest_date", ingest_ts)

In [10]:
# Expose the ingested DataFrame to Spark SQL under the temp view name "loans"
# (replaces any existing view with that name).
loans_df_ingested.createOrReplaceTempView("loans")

In [11]:
# Total number of rows currently visible through the "loans" view.
spark.sql("SELECT COUNT(*) FROM loans")

count(1)
2260701


### Drop null values

In [12]:
# List the rows of the "loans" view whose loan_amount is null.
spark.sql("SELECT * FROM loans WHERE loan_amount is null")

loan_id,member_id,loan_amount,funded_amount,term,interest_rate,installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Loans that do not...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-03-20 15:11:...


In [13]:
# Columns that must all be non-null for a loan row to be kept.
required_columns = [
    "loan_amount",
    "funded_amount",
    "term",
    "interest_rate",
    "installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

# Discard every row that has a null in at least one of the required columns.
loans_na_dropped = loans_df_ingested.dropna(how="any", subset=required_columns)

In [14]:
# Re-point the "loans" temp view at the DataFrame with the null rows removed.
loans_na_dropped.createOrReplaceTempView("loans")

In [15]:
# Query the "loans" view for rows whose loan_amount is null.
spark.sql("SELECT * FROM loans WHERE loan_amount is null")

loan_id,member_id,loan_amount,funded_amount,term,interest_rate,installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date


### Convert term (loan term in months) to yearly term and convert it to integer

##### regexp_replace(column to operate on, regex pattern to match, replacement string)

In [16]:
# Strip the literal " months" text from term and cast what remains to an integer.
term_in_months = regexp_replace(col("term"), " months", "")
x = loans_na_dropped.withColumn("term", term_in_months.cast("int"))

In [17]:
# Divide the term by 12 and cast the quotient back to an integer.
term_in_years = col("term") / 12
y = x.withColumn("term", term_in_years.cast("int"))

In [18]:
# Rename the "term" column to "loan_term_years".
loans_term_year_changed = y.withColumnRenamed("term", "loan_term_years")

In [19]:
# Preview the DataFrame after the term column was renamed to loan_term_years.
loans_term_year_changed

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
109653473,bd7db5222de2a6ce4...,25475.0,25475.0,3,14.08,871.67,May-2017,Late (31-120 days),debt_consolidation,Debt consolidation,2024-03-20 15:11:...
109677516,c0428bbd04ba8ba06...,14400.0,14400.0,3,12.62,482.57,May-2017,Current,debt_consolidation,Debt consolidation,2024-03-20 15:11:...
109910042,028e91969c534dca2...,4000.0,4000.0,3,7.07,123.64,May-2017,Fully Paid,credit_card,Credit card refin...,2024-03-20 15:11:...
109767343,fbab3a4ab0e73d30f...,30000.0,30000.0,3,17.09,1070.93,May-2017,Current,other,Other,2024-03-20 15:11:...
109858215,342558ad61cfea940...,11500.0,11500.0,3,15.05,398.94,May-2017,Current,other,Other,2024-03-20 15:11:...
109740125,372109bc6ae1c2d16...,25100.0,25100.0,5,16.02,610.65,May-2017,Fully Paid,home_improvement,Home improvement,2024-03-20 15:11:...
108257437,54e9694e0b82951ee...,4800.0,4800.0,3,7.21,148.68,May-2017,Fully Paid,credit_card,Credit card refin...,2024-03-20 15:11:...
109898089,1802301a402b207ca...,11925.0,11925.0,5,17.09,296.95,May-2017,Current,debt_consolidation,Debt consolidation,2024-03-20 15:11:...
109253226,ec80b7f0e7bc69119...,15000.0,15000.0,5,15.05,357.25,May-2017,Current,debt_consolidation,Debt consolidation,2024-03-20 15:11:...
109848461,f3ee70b6e652c7c27...,6400.0,6400.0,3,10.42,207.78,May-2017,Fully Paid,debt_consolidation,Debt consolidation,2024-03-20 15:11:...


In [20]:
# Re-point the "loans" temp view at the DataFrame that carries loan_term_years.
loans_term_year_changed.createOrReplaceTempView("loans")

### loan_purpose must be one of the values in this list; every other value is mapped to the "other" category
loan_purpose_lookup = ["debt_consolidation", "credit_card",
"home_improvement", "other", "major_purchase", "medical", "small_business",
"car", "vacation", "moving", "house", "wedding", "renewable_energy",
"educational"]

In [21]:
# Number of rows per loan_purpose in the "loans" view, most frequent first.
spark.sql("SELECT loan_purpose, COUNT(*) as count FROM loans GROUP BY loan_purpose order by count desc")

loan_purpose,count
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [22]:
# Allowed loan_purpose values, one per line.
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [23]:
# Keep loan_purpose when it is one of the allowed values; otherwise replace it
# with the literal "other".
purpose = col("loan_purpose")
is_known_purpose = purpose.isin(loan_purpose_lookup)
loans_purpose_changed = loans_term_year_changed.withColumn(
    "loan_purpose",
    when(is_known_purpose, purpose).otherwise("other"),
)

In [24]:
# Re-point the "loans" temp view at the DataFrame with the normalised loan_purpose.
loans_purpose_changed.createOrReplaceTempView("loans")

In [25]:
# Number of rows per loan_purpose in the "loans" view, most frequent first.
spark.sql("SELECT loan_purpose, COUNT(*) as count FROM loans GROUP BY loan_purpose order by count desc")

loan_purpose,count
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


### Write the cleaned DataFrame to the cleaned-data folder

In [28]:
# Persist the cleaned loans as CSV with a header row, overwriting any existing
# output at the target path.
(
    loans_purpose_changed.write
    .mode("overwrite")
    .option("header", True)
    .csv("lending_club_project/cleaned_data/loans_cleaned_csv")
)

In [None]:
# Persist the cleaned loans as Parquet, overwriting any existing output at the
# target path.
(
    loans_purpose_changed.write
    .mode("overwrite")
    .parquet("lending_club_project/cleaned_data/loans_cleaned_parquet")
)