In [42]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp,current_date,when,col

In [3]:
# Build (or reuse) the Hive-enabled Spark session that runs on YARN.
# config() takes the key and the value as two separate arguments; passing them
# as one quoted string registers a meaningless key and leaves
# spark.shuffle.useOldFetchProtocol unset.
spark = (
    SparkSession.builder
    .appName("Project_Pyspark")
    .master("yarn")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Display the session object as this cell's output.
spark

In [5]:
# First pass over the raw defaulters CSV: use the header row for column names
# and let Spark infer the column types so we can inspect what it comes up with.
loans_defaulters_raw_df = (
    spark.read
    .format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .load("/user/itv012010/lendingclubproject/raw/loans_defaulters_csv")
)

In [6]:
# Preview the raw rows as this cell's output.
# NOTE(review): a bare DataFrame only renders as a table when eager evaluation
# is enabled in the notebook environment — confirm.
loans_defaulters_raw_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
2209556c19f75a204...,0.0,0.0,1.0,1.0,0.0,0.0,,65.0
9d0d53b8de898b446...,0.0,0.0,1.0,1.0,2.0,0.0,25.0,83.0
1a978870f4a9b4b43...,0.0,0.0,0.0,0.0,0.0,0.0,,
c66ccc99ff4372809...,1.0,0.0,0.0,0.0,1.0,0.0,23.0,
91d00a422248120a4...,0.0,0.0,0.0,0.0,0.0,0.0,,
b7539a5579e723b21...,0.0,0.0,0.0,0.0,0.0,0.0,,
2fca945a60ef9246c...,0.0,0.0,0.0,0.0,1.0,47.8,,
8a6dababd11fd592f...,0.0,0.0,0.0,0.0,2.0,0.0,,
3af72ce29e878edd7...,0.0,0.0,0.0,0.0,1.0,0.0,,
4395baca2ea2d58a8...,0.0,0.0,0.0,0.0,0.0,0.0,37.0,


In [7]:
# Inspect the inferred types: several numeric-looking columns come back as
# string, which is why an explicit schema is applied on the next read.
loans_defaulters_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- mths_since_last_delinq: string (nullable = true)
 |-- mths_since_last_record: string (nullable = true)



In [163]:
# Apart from member_id, every other column can be read as float.
# delinq_2yrs indicates whether the person has made delayed payments in the last 2 years.

In [33]:
# Explicit DDL schema for the defaulters CSV: member_id stays a string and every
# other column is read as float. Three columns also get more readable names
# (inq_last_6mths, mths_since_last_delinq, mths_since_last_record).
# The string is assembled from adjacent literals so that no source indentation
# ends up embedded in the DDL text.
loans_defaulters_schema = (
    "member_id string, "
    "delinq_2yrs float, "
    "delinq_amnt float, "
    "pub_rec float, "
    "pub_rec_bankruptcies float, "
    "enquiry_last_6mths float, "
    "total_rec_late_fee float, "
    "months_since_last_delinq float, "
    "months_since_last_record float"
)

In [34]:
# Re-read the raw CSV, this time applying the explicit schema instead of
# relying on type inference. The header row is still consumed as a header.
loans_defaulters_raw_df = (
    spark.read
    .schema(loans_defaulters_schema)
    .option("header", True)
    .format("csv")
    .load("/user/itv012010/lendingclubproject/raw/loans_defaulters_csv")
)

In [35]:
# Confirm the explicit schema took effect: three columns carry their new names
# and every column except member_id is now float.
loans_defaulters_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- enquiry_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- months_since_last_delinq: float (nullable = true)
 |-- months_since_last_record: float (nullable = true)



In [36]:
# Expose the typed raw DataFrame to Spark SQL under the name "loans_defaulters".
loans_defaulters_raw_df.createOrReplaceTempView("loans_defaulters")

In [37]:
# List the distinct delinq_2yrs values; there are many, including fractional ones.
spark.sql("select distinct(delinq_2yrs) from loans_defaulters")

delinq_2yrs
20.04
18.53
18.0
26.24
6.52
9.0
21.72
58.0
17.17
5.0


In [38]:
# Frequency of each delinq_2yrs value, most common first (top 30 rows).
# Most members have 0 delinquencies, so 0.0 is the first row; the output also
# contains nulls and a few fractional values that need cleaning.
spark.sql("""select delinq_2yrs,count(*) as total from loans_defaulters
          group by delinq_2yrs order by total desc""").show(30)

+-----------+-------+
|delinq_2yrs|  total|
+-----------+-------+
|        0.0|1838878|
|        1.0| 281335|
|        2.0|  81285|
|        3.0|  29539|
|        4.0|  13179|
|        5.0|   6599|
|        6.0|   3717|
|        7.0|   2062|
|        8.0|   1223|
|        9.0|    818|
|       10.0|    556|
|       11.0|    363|
|       12.0|    264|
|       null|    261|
|       13.0|    165|
|       14.0|    120|
|       15.0|     87|
|       16.0|     55|
|       18.0|     30|
|       17.0|     30|
|       19.0|     23|
|       20.0|     17|
|       21.0|     12|
|       22.0|      5|
|       24.0|      4|
|       26.0|      3|
|       29.0|      2|
|       23.0|      2|
|       3.44|      2|
|       30.0|      2|
+-----------+-------+
only showing top 30 rows



In [14]:
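# delinq_2yrs should only hold whole numbers, so the fractional values need to be cleaned up.
# Declaring the column as an integer in the read schema would not work: we would not get
# any data back, because every value in the file is written as a float.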

In [39]:
# Cast delinq_2yrs from float to integer, then replace the nulls in that column
# with 0. The cast truncates, so fractional values are folded into the integer
# bucket below them rather than being dropped.
loans_def_processed_df = (
    loans_defaulters_raw_df
    .withColumn("delinq_2yrs", col("delinq_2yrs").cast("integer"))
    .fillna(value=0, subset=["delinq_2yrs"])
)

In [16]:
# Re-point the "loans_defaulters" view at the processed DataFrame, so every SQL
# query from here on sees the integer, null-free delinq_2yrs column.
loans_def_processed_df.createOrReplaceTempView("loans_defaulters")

In [17]:
# Re-run the frequency query against the processed view: only integer values
# of delinq_2yrs should remain.
spark.sql("""select delinq_2yrs,count(*) as total from loans_defaulters
          group by delinq_2yrs order by total desc""")

delinq_2yrs,total
0,1839141
1,281337
2,81285
3,29545
4,13180
5,6601
6,3719
7,2063
8,1226
9,821


In [18]:
# Sanity check: an empty result means no nulls are left in delinq_2yrs.
spark.sql("""select delinq_2yrs from loans_defaulters
          where delinq_2yrs is null""").show()

+-----------+
|delinq_2yrs|
+-----------+
+-----------+



In [19]:
#We will be dividing this data into two parts based on our requirements

In [20]:
# Members with a delinquency history: at least one delinquency in the last two
# years, or a positive months-since-last-delinquency value.
# The integer cast is given an explicit alias so the output column keeps the
# name months_since_last_delinq instead of whatever name Spark auto-generates
# for the expression (which may not be a valid Parquet column name).
loans_def_delinq_df = spark.sql("""
    select member_id,
           delinq_2yrs,
           delinq_amnt,
           int(months_since_last_delinq) as months_since_last_delinq
    from loans_defaulters
    where delinq_2yrs > 0 or months_since_last_delinq > 0
""")

In [21]:
# Number of members with a delinquency history.
loans_def_delinq_df.count()

1106163

In [22]:
# Member ids that have at least one public record or public-record bankruptcy
# (pub_rec > 0 or pub_rec_bankruptcies > 0). Only member_id is selected.
loans_def_records_df=spark.sql("""select  member_id from loans_defaulters where
                                  pub_rec> 0.0 or pub_rec_bankruptcies >0.0
 """)

In [23]:
# Number of rows where pub_rec > 0 or pub_rec_bankruptcies > 0.
loans_def_records_df.count()

357989

In [24]:
# Persist the delinquency subset as CSV (with a header row), replacing any
# previous output at the target path.
(
    loans_def_delinq_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save("/user/itv012010/lendingclubproject/cleaned/loans_defaulters_delinq_csv")
)

In [25]:
# Persist the delinquency subset as Parquet, replacing any previous output at
# the target path. No "header" option here: it is a CSV option and has no
# meaning for Parquet, which stores column names in its own metadata.
(
    loans_def_delinq_df.write
    .format("parquet")
    .mode("overwrite")
    .save("/user/itv012010/lendingclubproject/cleaned/loans_defaulters_delinq_parquet")
)

In [26]:
# Persist the public-record member ids as CSV (with a header row), replacing
# any previous output at the target path.
(
    loans_def_records_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save("/user/itv012010/lendingclubproject/cleaned/loans_def_records_csv")
)

In [27]:
# Persist the public-record member ids as Parquet, replacing any previous
# output at the target path. No "header" option here: it is a CSV option and
# has no meaning for Parquet, which stores column names in its own metadata.
(
    loans_def_records_df.write
    .format("parquet")
    .mode("overwrite")
    .save("/user/itv012010/lendingclubproject/cleaned/loans_def_records_parquet")
)

Now we will clean the 3 main columns from the loan defaulters data that will be used to calculate the loan score,
i.e. pub_rec, pub_rec_bankruptcies and enquiry_last_6mths:
replace NA with zero in each of these columns, then create a table and store it in CSV and Parquet formats.

In [46]:
# Step 1 of 3: cast pub_rec to integer and default its missing values to 0.
loans_def_p_pub_rec_df = (
    loans_def_processed_df
    .withColumn("pub_rec", col("pub_rec").cast("integer"))
    .fillna(value=0, subset=["pub_rec"])
)

In [47]:
# Step 2 of 3: cast pub_rec_bankruptcies to integer and default its missing
# values to 0, building on the pub_rec-cleaned DataFrame.
loans_def_p_pub_rec_bankruptcies_df = (
    loans_def_p_pub_rec_df
    .withColumn("pub_rec_bankruptcies", col("pub_rec_bankruptcies").cast("integer"))
    .fillna(value=0, subset=["pub_rec_bankruptcies"])
)

In [49]:
# Step 3 of 3: cast enquiry_last_6mths to integer and default its missing
# values to 0, building on the bankruptcies-cleaned DataFrame.
loans_def_p_inq_last_6mths_df = (
    loans_def_p_pub_rec_bankruptcies_df
    .withColumn("enquiry_last_6mths", col("enquiry_last_6mths").cast("integer"))
    .fillna(value=0, subset=["enquiry_last_6mths"])
)

In [50]:
# Register the fully cleaned DataFrame for SQL access.
# NOTE(review): this view is named "loan_defaulters" (singular), which is a
# different view from the "loans_defaulters" one registered earlier — confirm
# the separate name is intentional.
loans_def_p_inq_last_6mths_df.createOrReplaceTempView("loan_defaulters")

In [52]:
# Keep only member_id plus the three cleaned columns used for the loan score.
loans_def_detail_records_enq_df = spark.sql("select member_id, pub_rec, pub_rec_bankruptcies, enquiry_last_6mths from loan_defaulters")

In [53]:
# Preview the result: in all three columns other than member_id, NA values
# have been replaced with zero.
loans_def_detail_records_enq_df

member_id,pub_rec,pub_rec_bankruptcies,enquiry_last_6mths
2209556c19f75a204...,1,1,0
9d0d53b8de898b446...,1,1,2
1a978870f4a9b4b43...,0,0,0
c66ccc99ff4372809...,0,0,1
91d00a422248120a4...,0,0,0
b7539a5579e723b21...,0,0,0
2fca945a60ef9246c...,0,0,1
8a6dababd11fd592f...,0,0,2
3af72ce29e878edd7...,0,0,1
4395baca2ea2d58a8...,0,0,0


In [54]:
# Persist the loan-score input columns as CSV (with a header row), replacing
# any previous output at the target path.
(
    loans_def_detail_records_enq_df.write
    .mode("overwrite")
    .format("csv")
    .option("header", True)
    .save("/user/itv012010/lendingclubproject/cleaned/loans_def_detail_records_enq_df_csv")
)

In [55]:
# Persist the loan-score input columns as Parquet, replacing any previous
# output at the target path.
(
    loans_def_detail_records_enq_df.write
    .mode("overwrite")
    .format("parquet")
    .save("/user/itv012010/lendingclubproject/cleaned/loans_def_detail_records_enq_df_parquet")
)