In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the current OS user so the warehouse directory is derived from the
# account running the notebook instead of being hard-coded to one user.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# First pass: read the raw CSV with schema inference so the actual contents
# of each column can be inspected before a schema is enforced.
loans_defaulters_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("lending_club_project/raw_data/loan_defaulters_csv")
)

In [3]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
loans_defaulters_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
743530141063dc33d...,0.0,0.0,0.0,0.0,2.0,0.0,,
2ac41c2d9f4966188...,0.0,0.0,0.0,0.0,1.0,0.0,,
9f0145c753ad6b157...,0.0,0.0,0.0,0.0,0.0,0.0,33.0,
f180e481ba7755b25...,0.0,0.0,0.0,0.0,0.0,0.0,,
0f12ba052807101e7...,0.0,0.0,0.0,0.0,2.0,0.0,,
69e61c82e93037ace...,0.0,0.0,0.0,0.0,0.0,0.0,,
f72dbbdb8851007aa...,0.0,0.0,0.0,0.0,0.0,0.0,,
eca8525ceb6912b95...,0.0,0.0,0.0,0.0,1.0,0.0,77.0,
508dfaa28e2bbb74d...,0.0,0.0,0.0,0.0,0.0,0.0,,
b44aca52f973c7090...,0.0,0.0,0.0,0.0,1.0,0.0,,


In [4]:
# Print the column names and the types Spark inferred for them.
loans_defaulters_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- mths_since_last_delinq: string (nullable = true)
 |-- mths_since_last_record: string (nullable = true)



In [7]:
# Register the DataFrame as temp view `loan_defaulters` so it can be queried with spark.sql.
loans_defaulters_df.createOrReplaceTempView("loan_defaulters")

In [8]:
# List the distinct raw values of delinq_2yrs to see what the string-typed column contains.
spark.sql("select distinct(delinq_2yrs) from loan_defaulters")

delinq_2yrs
1.0
I bike to work on...
271 monthly payme...
VISA and AMEX cre...
etc. and I feel t...
183xx
AZ
017xx
923xx
446xx


In [9]:
# Frequency of each delinq_2yrs value, most common first; print up to 40 rows.
spark.sql("select delinq_2yrs, count(*) as total from loan_defaulters group by delinq_2yrs order by total desc").show(40)

+------------------+-------+
|       delinq_2yrs|  total|
+------------------+-------+
|               0.0|1838878|
|               1.0| 281335|
|               2.0|  81285|
|               3.0|  29539|
|               4.0|  13179|
|               5.0|   6599|
|               6.0|   3717|
|               7.0|   2062|
|               8.0|   1223|
|               9.0|    818|
|              10.0|    556|
|              11.0|    363|
|              12.0|    264|
|              13.0|    165|
|              14.0|    120|
|              15.0|     87|
|              null|     63|
|              16.0|     55|
|              18.0|     30|
|              17.0|     30|
|              19.0|     23|
|              20.0|     17|
|              21.0|     12|
|                CA|      8|
|                TX|      6|
|    small_business|      5|
|                IL|      5|
|debt_consolidation|      5|
|              22.0|      5|
|                FL|      4|
|              24.0|      4|
|             

### delinq_2yrs is the number of times the person has been delinquent on a loan in the last 2 years, so it cannot be a string. Once the column is read as float, the non-numeric string values are cast to null.

In [10]:
# Explicit DDL schema for the defaulters CSV: member_id stays a string and
# every other column is declared as float. Column names here are more
# descriptive than the names in the file header (e.g. pub_rec ->
# public_records). NOTE(review): presumably the schema is applied to the
# file's columns by position, so the order must match the header - confirm.
schema = """member_id string, delinq_times_last_2yrs float, delinq_amnt float,
public_records float, public_records_bankruptcies float, enquiry_last_6mths float,
total_records_late_fee float, mths_since_last_delinq float, mths_since_last_public_record
float"""

In [11]:
# Second pass: re-read the same CSV, this time enforcing the explicit schema
# instead of relying on inference. This rebinds loans_defaulters_df.
loans_defaulters_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load("lending_club_project/raw_data/loan_defaulters_csv")
)

In [12]:
# Confirm that the explicit schema (renamed columns, float types) was applied.
loans_defaulters_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_times_last_2yrs: float (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- public_records: float (nullable = true)
 |-- public_records_bankruptcies: float (nullable = true)
 |-- enquiry_last_6mths: float (nullable = true)
 |-- total_records_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_public_record: float (nullable = true)



In [15]:
# Replace the `loan_defaulters` temp view so it now points at the explicitly-typed DataFrame.
loans_defaulters_df.createOrReplaceTempView("loan_defaulters")

In [16]:
# Frequency of each delinq_times_last_2yrs value under the float schema, most common first; print up to 40 rows.
spark.sql("select delinq_times_last_2yrs, count(*) as total from loan_defaulters group by delinq_times_last_2yrs order by total desc").show(40)

+----------------------+-------+
|delinq_times_last_2yrs|  total|
+----------------------+-------+
|                   0.0|1838878|
|                   1.0| 281335|
|                   2.0|  81285|
|                   3.0|  29539|
|                   4.0|  13179|
|                   5.0|   6599|
|                   6.0|   3717|
|                   7.0|   2062|
|                   8.0|   1223|
|                   9.0|    818|
|                  10.0|    556|
|                  11.0|    363|
|                  12.0|    264|
|                  null|    261|
|                  13.0|    165|
|                  14.0|    120|
|                  15.0|     87|
|                  16.0|     55|
|                  18.0|     30|
|                  17.0|     30|
|                  19.0|     23|
|                  20.0|     17|
|                  21.0|     12|
|                  22.0|      5|
|                  24.0|      4|
|                  26.0|      3|
|                  29.0|      2|
|         

### delinq_2yrs (now delinq_times_last_2yrs) should be an integer; also convert nulls to 0

In [18]:
from pyspark.sql.functions import *

In [19]:
# Cast the delinquency count to integer, then replace nulls in that column with 0.
loans_defaulters_delinq_cleaned = (
    loans_defaulters_df
    .withColumn(
        "delinq_times_last_2yrs",
        col("delinq_times_last_2yrs").cast("integer"),
    )
    .fillna(0, subset=["delinq_times_last_2yrs"])
)

In [20]:
# Register the cleaned DataFrame as temp view `loans_def` for the SQL queries that follow.
loans_defaulters_delinq_cleaned.createOrReplaceTempView("loans_def")

In [21]:
# Sanity check: count rows where delinq_times_last_2yrs is still null after the fill.
spark.sql("select count(*) FROM loans_def WHERE delinq_times_last_2yrs is null")

count(1)
0


In [22]:
# Frequency of each delinq_times_last_2yrs value after the integer cast and null fill; print up to 40 rows.
spark.sql("select delinq_times_last_2yrs, count(*) as total from loans_def group by delinq_times_last_2yrs order by total desc").show(40)

+----------------------+-------+
|delinq_times_last_2yrs|  total|
+----------------------+-------+
|                     0|1839141|
|                     1| 281337|
|                     2|  81285|
|                     3|  29545|
|                     4|  13180|
|                     5|   6601|
|                     6|   3719|
|                     7|   2063|
|                     8|   1226|
|                     9|    821|
|                    10|    558|
|                    11|    363|
|                    12|    266|
|                    13|    167|
|                    14|    123|
|                    15|     90|
|                    16|     56|
|                    17|     33|
|                    18|     32|
|                    19|     24|
|                    20|     19|
|                    21|     16|
|                    22|      7|
|                    24|      6|
|                    23|      5|
|                    26|      4|
|                    29|      2|
|         

### Make a separate table for storing delinquency (defaulter) information. The table will contain only people who have defaulted earlier.

In [23]:
# Keep only members with some delinquency history: a positive
# mths_since_last_delinq or at least one delinquency in the last 2 years.
# mths_since_last_delinq is cast to int in the projection.
loans_defaulters_delinq = spark.sql("SELECT member_id, delinq_times_last_2yrs, delinq_amnt, int(mths_since_last_delinq) FROM loans_def WHERE mths_since_last_delinq > 0 or delinq_times_last_2yrs > 0")

In [24]:
# Preview the first rows of the delinquency subset.
loans_defaulters_delinq.show()

+--------------------+----------------------+-----------+----------------------+
|           member_id|delinq_times_last_2yrs|delinq_amnt|mths_since_last_delinq|
+--------------------+----------------------+-----------+----------------------+
|9f0145c753ad6b157...|                     0|        0.0|                    33|
|eca8525ceb6912b95...|                     0|        0.0|                    77|
|004d932e272711fc3...|                     0|        0.0|                    80|
|3c90fb90165b043bc...|                     0|        0.0|                    50|
|b16807389deb10499...|                     0|        0.0|                    38|
|d1a222d79b8ef6344...|                     0|       null|                   755|
|b89659eabd6a0e437...|                     0|        0.0|                    45|
|0491632a6b1ff27b4...|                    18|        0.0|                     0|
|c926cdfbd9366199f...|                     0|        0.0|                    44|
|bc447e99e706ff96a...|      

In [25]:
# Number of rows in the delinquency subset.
loans_defaulters_delinq.count()

1106163

### Get the member_id of people who have any public records, bankruptcies, or enquiries in the last 6 months

In [26]:
# Collect the member_id of every row with at least one public record,
# public-record bankruptcy, or enquiry in the last 6 months.
loans_def_records_df = spark.sql("SELECT member_id FROM loans_def WHERE public_records>0.0 or public_records_bankruptcies> 0.0 or enquiry_last_6mths > 0.0")

In [27]:
# Bare expression: the notebook renders the selected member_ids as this cell's output.
loans_def_records_df

member_id
743530141063dc33d...
2ac41c2d9f4966188...
0f12ba052807101e7...
eca8525ceb6912b95...
b44aca52f973c7090...
d832fb44028eeb73d...
004d932e272711fc3...
b16807389deb10499...
d1a222d79b8ef6344...
1b6414f12deb57148...


### Write these two tables back (delinq_info and public_records member_ids)

In [28]:
# Persist the delinquency subset as CSV with a header row, replacing any previous output.
(
    loans_defaulters_delinq.write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .option("path", "lending_club_project/cleaned_data/loan_defaulters_delinq_csv")
    .save()
)

In [29]:
# Persist the delinquency subset as Parquet, replacing any previous output.
(
    loans_defaulters_delinq.write
    .format("parquet")
    .option("path", "lending_club_project/cleaned_data/loan_defaulters_delinq_parquet")
    .mode("overwrite")
    .save()
)

In [33]:
# Persist the public-record / enquiry member_ids as CSV with a header row,
# replacing any previous output.
(
    loans_def_records_df.write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .option("path", "lending_club_project/cleaned_data/loan_defaulters_records_enq_csv")
    .save()
)

In [None]:
# Persist the public-record / enquiry member_ids as Parquet, replacing any previous output.
(
    loans_def_records_df.write
    .format("parquet")
    .option("path", "lending_club_project/cleaned_data/loan_defaulters_records_enq_parquet")
    .mode("overwrite")
    .save()
)

### New requirement: for the loan_defaulters_records_enq_csv table we need not only member_id but also the 3 related columns, i.e.:
1. member_id
2. public_records
3. public_records_bankruptcies
4. enquiry_last_6mths

#### We haven't cleaned these 3 columns yet, so let's clean them.
#### Cleaning here just means casting these columns to integer and replacing all nulls with 0.

In [13]:
# Cast public_records to integer, then replace nulls in that column with 0.
loans_defaulters_pub_rec_cleaned = (
    loans_defaulters_delinq_cleaned
    .withColumn("public_records", col("public_records").cast("integer"))
    .fillna(0, subset=["public_records"])
)

In [14]:
# Cast public_records_bankruptcies to integer, then replace nulls in that column with 0.
loans_defaulters_pub_rec_bankrupt_cleaned = (
    loans_defaulters_pub_rec_cleaned
    .withColumn(
        "public_records_bankruptcies",
        col("public_records_bankruptcies").cast("integer"),
    )
    .fillna(0, subset=["public_records_bankruptcies"])
)

In [15]:
# Cast enquiry_last_6mths to integer, then replace nulls in that column with 0.
loans_defaulters_enq_cleaned = (
    loans_defaulters_pub_rec_bankrupt_cleaned
    .withColumn("enquiry_last_6mths", col("enquiry_last_6mths").cast("integer"))
    .fillna(0, subset=["enquiry_last_6mths"])
)

### Make a table which has member_id, public_records, public_records_bankruptcies, enquiry_last_6mths
This table may also contain members for whom all three of these values are 0.

In [16]:
# Register the DataFrame with all four cleaned columns as a temp view.
# NOTE(review): the view name ends in `_csv` although it is an in-memory view,
# and the same identifier is reused as a Python variable name for the query result.
loans_defaulters_enq_cleaned.createOrReplaceTempView("loan_defaulters_detail_records_enq_csv")

In [18]:
# Project member_id plus the three cleaned record/enquiry columns; no row filter is applied.
loan_defaulters_detail_records_enq_csv = spark.sql("SELECT member_id, public_records, public_records_bankruptcies, enquiry_last_6mths FROM loan_defaulters_detail_records_enq_csv")

In [19]:
# Preview the first rows of the detail table.
loan_defaulters_detail_records_enq_csv.show()

+--------------------+--------------+---------------------------+------------------+
|           member_id|public_records|public_records_bankruptcies|enquiry_last_6mths|
+--------------------+--------------+---------------------------+------------------+
|743530141063dc33d...|             0|                          0|                 2|
|2ac41c2d9f4966188...|             0|                          0|                 1|
|9f0145c753ad6b157...|             0|                          0|                 0|
|f180e481ba7755b25...|             0|                          0|                 0|
|0f12ba052807101e7...|             0|                          0|                 2|
|69e61c82e93037ace...|             0|                          0|                 0|
|f72dbbdb8851007aa...|             0|                          0|                 0|
|eca8525ceb6912b95...|             0|                          0|                 1|
|508dfaa28e2bbb74d...|             0|                          0|

### Write this table back

In [20]:
# Persist the detail table as CSV with a header row, replacing any previous output.
(
    loan_defaulters_detail_records_enq_csv.write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .option("path", "lending_club_project/cleaned_data/loan_defaulters_detail_records_enq_csv")
    .save()
)

In [21]:
# Persist the detail table as Parquet, replacing any previous output.
(
    loan_defaulters_detail_records_enq_csv.write
    .format("parquet")
    .option("path", "lending_club_project/cleaned_data/loan_defaulters_detail_records_enq_parquet")
    .mode("overwrite")
    .save()
)