# **Mount Drive**

In [1]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')
# os.chdir('/content/drive/My Drive/Colab Notebooks/Big Data Project')

Mounted at /content/drive


In [None]:
# !pip install pyspark

In [81]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, countDistinct, isnan, when, count, round, substring_index, split, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType


In [2]:
import pandas as pd
import datetime
import csv
from tabulate import tabulate



## Since we write local [*] in the master, it will use all cores in our machine. If we said local [4] it will work with 4 cores.

## getOrCreate is used to create a SparkSession if not present.

In [3]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("LoanApproval")\
    .getOrCreate()

In [4]:
sc=spark.sparkContext

## Read Data - SBAnational.csv

In [5]:
data_path="SBAnational.csv"

In [50]:
loan_df =  spark.read.csv(data_path, header=True, inferSchema=True, quote='"', escape='"', multiLine=True)

In [7]:
loan_df.show(5)
print('=====================')
print("Number of rows in the dataframe:")
loan_df_count = loan_df.count()
print(loan_df_count)
print('=====================')
print("Schema of the dataframe:")
loan_df.printSchema() #prints the dataframe schema
print('=====================')
print("Columns in the dataframe:")
print(loan_df.columns) 

+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|LoanNr_ChkDgt|                Name|        City|State|  Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|   1000014003|      ABC HOBBYCRAFT|  EVANSVILLE|

# Preprocessing and cleaning

### Report

In [20]:
# =========================================================================
# =========================================================================
# ============================= DF REPORT =================================
# =========================================================================
# =========================================================================
def report_df(df, header):
    # Calculate the total number of rows
    rdd_count = df.count()

    # Initialize lists to store column statistics
    col_names = []
    data_types = []
    unique_samples = []
    num_uniques = []
    nan_percentages = []
    report_data = []

    # Iterate over each column
    for col_name in header:
        print(col_name)
        # Append column name
        col_names.append(col_name)
        selected_col = col(col_name)
        selected_col_df = df.select(selected_col)

        # Determine data type
        dtype = selected_col_df.dtypes[0][1]
        data_types.append(dtype)
        distinct_df = selected_col_df.distinct()
        # Collect unique values
        unique_sample = [row[col_name] for row in distinct_df.limit(2).collect()]
        unique_samples.append(unique_sample)

        # Count number of unique values
        n_unique = distinct_df.count()
        num_uniques.append(n_unique)

        # Calculate percentage of NaN values
        none_percentage_val = df.filter(selected_col.isNull()).count() / rdd_count * 100
        nan_percentages.append(none_percentage_val)
        report_data.append([col_name, dtype, unique_sample, n_unique, none_percentage_val])

    return report_data

In [21]:
report_res = report_df(loan_df, loan_df.columns)


LoanNr_ChkDgt
Name
City
State
Zip
Bank
BankState
NAICS
ApprovalDate
ApprovalFY
Term
NoEmp
NewExist
CreateJob
RetainedJob
FranchiseCode
UrbanRural
RevLineCr
LowDoc
ChgOffDate
DisbursementDate
DisbursementGross
BalanceGross
MIS_Status
ChgOffPrinGr
GrAppv
SBA_Appv
CPU times: total: 0 ns
Wall time: 2min 29s


In [22]:

# Display the result
column_names = ['Column', 'Type', 'Unique Sample', 'N Unique', '%None']
print(tabulate(report_res, headers=column_names, tablefmt='grid'))

+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| Column            | Type   | Unique Sample                                                       |   N Unique |        %None |
| LoanNr_ChkDgt     | bigint | [1000895005, 1001055002]                                            |     899164 |  0           |
+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| Name              | string | ['TURTLE BEACH INN', 'URBAN BEAST-SEATTLE LLC']                     |     779587 |  0.000333643 |
+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| City              | string | ['Worcester', 'West Sand Lake']                                     |      32582 |  0.00333643  |
+-------------------+--------+-------------------------------------------------------------------

In [65]:
def show_percentage_of_each_value_in_column(df, df_count,col_name):
    # Calculate percentage of 0s and 1s
    percentage_df = df.groupBy(col_name).agg((count("*") / df_count).alias("Percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("Percentage", round(col("Percentage") * 100, 2))

    # sort the dataframe by percentage descending
    percentage_df = percentage_df.sort(col("Percentage").desc())

    # Show result
    percentage_df.show()

### 1. LoanNr_ChkDgt - ID
Drop the column as it is an ID column and does not provide any information for the analysis.

In [None]:
loan_df = loan_df.drop('LoanNr_ChkDgt')

### 2. Name - Name of Borrower
Drop the column as it is a name column and does not provide any information for the analysis.

In [None]:
loan_df = loan_df.drop('Name')

### 3. City - City of Borrower


In [36]:
# Count the occurrences of each value in city column
col_name = 'City'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------------+----------+
|          City|percentage|
+--------------+----------+
|     Worcester|      0.02|
|West Sand Lake|       0.0|
|     Fairbanks|       0.0|
|        NOVATO|      0.13|
|     NESCONSET|      0.01|
|   MCMINNVILLE|      0.06|
|     WATERTOWN|      0.36|
|      MAYVILLE|      0.05|
|WEST FRANKFORT|      0.03|
|       JOHNSON|      0.03|
|    MCELHATTAN|       0.0|
|       MARLTON|       0.1|
|   WALLED LAKE|      0.04|
|    BRANDYWINE|      0.01|
|       EDMONDS|      0.14|
|     SOUTHLAKE|      0.11|
|         Tyler|       0.0|
|         GALVA|      0.01|
|     PRINCETON|      0.23|
|     SAN PABLO|      0.03|
+--------------+----------+
only showing top 20 rows



In [27]:
# percentage of unique values in the city
city_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in City: {city_unique_count}")
city_percentage = city_unique_count / loan_df_count * 100
print(f"Percentage of unique values in City: {city_percentage:.2f}%")

Number of unique values in City: 32567
Percentage of unique values in City: 3.62%


In [38]:
unique_city_df = loan_df.select(col_name).groupBy(col_name).agg((count("*")).alias("Count")).sort(col("Count").desc())
unique_city_df.show()

+--------------+------------+
|          City|Unique Count|
+--------------+------------+
|   LOS ANGELES|       11558|
|       HOUSTON|       10247|
|      NEW YORK|        7846|
|       CHICAGO|        6036|
|         MIAMI|        5594|
|     SAN DIEGO|        5363|
|        DALLAS|        5085|
|       PHOENIX|        4493|
|     LAS VEGAS|        4390|
|   SPRINGFIELD|        3738|
|      BROOKLYN|        3728|
|        DENVER|        3550|
|   SAN ANTONIO|        3515|
|SALT LAKE CITY|        3511|
|        AUSTIN|        3499|
|       SEATTLE|        3470|
| SAN FRANCISCO|        3365|
|      PORTLAND|        3193|
|      COLUMBUS|        3186|
|  PHILADELPHIA|        3178|
+--------------+------------+
only showing top 20 rows



### 4. State - State of Borrower

In [30]:
col_name = 'State'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|State|percentage|
+-----+----------+
|   SC|      0.62|
|   AZ|      1.96|
|   LA|      1.06|
|   MN|      2.71|
|   NJ|      2.67|
|   DC|      0.18|
|   OR|      1.23|
|   VA|      1.47|
| NULL|       0.0|
|   RI|       1.0|
|   KY|      0.86|
|   WY|      0.32|
|   NH|      1.34|
|   MI|      2.28|
|   NV|      0.89|
|   WI|      2.34|
|   ID|      1.06|
|   CA|     14.51|
|   CT|      1.35|
|   NE|      0.71|
+-----+----------+
only showing top 20 rows



In [31]:
# percentage of unique values in the city
state_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in State: {state_unique_count}")
state_percentage = state_unique_count / loan_df_count * 100
print(f"Percentage of unique values in State: {state_percentage:.2f}%")

Number of unique values in City: 52
Percentage of unique values in City: 0.01%


### 5. Zip - Zip code of Borrower

In [41]:
col_name = 'Zip'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|  Zip|percentage|
+-----+----------+
|47711|      0.04|
|92644|      0.02|
| 4101|      0.14|
|14450|      0.12|
|43302|      0.05|
|57201|      0.08|
|80033|      0.11|
|18944|      0.04|
|33412|      0.02|
|33602|      0.03|
|75149|      0.06|
|44906|      0.03|
|90019|      0.19|
|29054|       0.0|
|75039|      0.04|
|33569|      0.04|
|81501|       0.1|
|34234|      0.04|
|57039|      0.01|
|20735|      0.06|
+-----+----------+
only showing top 20 rows



In [42]:
# percentage of unique values in the city
zip_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in Zip: {zip_unique_count}")
zip_percentage = zip_unique_count / loan_df_count * 100
print(f"Percentage of unique values in Zip: {zip_percentage:.2f}%")

Number of unique values in City: 33611
Percentage of unique values in City: 20.89%


In [43]:
unique_zip_df = loan_df.select(col_name).groupBy(col_name).agg((count("*")).alias("Count")).sort(col("Count").desc())
unique_zip_df.show()

+-----+------------+
|  Zip|Unique Count|
+-----+------------+
|10001|         933|
|90015|         926|
|93401|         806|
|90010|         733|
|33166|         671|
|90021|         666|
|59601|         640|
|65804|         599|
| 3801|         581|
|59101|         578|
|84115|         577|
|92121|         567|
|92101|         565|
|90670|         531|
|11354|         530|
|59102|         519|
|85260|         518|
|84107|         513|
|89102|         500|
|92069|         499|
+-----+------------+
only showing top 20 rows



### 6. Bank - Name of the bank that gave the loan

### 7. BankState - State of Bank

### 8. NAICS - North American Industry Classification System code for the industry where the business is located

### 9. ApprovalDate - Date SBA commitment issued

In [52]:
col_name = 'ApprovalDate'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|ApprovalDate|percentage|
+------------+----------+
|   13-May-98|      0.02|
|    5-Sep-03|      0.04|
|    9-Feb-05|      0.05|
|   18-Sep-81|       0.0|
|   23-Jan-79|       0.0|
|   11-May-85|       0.0|
|   15-May-85|       0.0|
|    2-May-80|       0.0|
|   30-Aug-07|      0.03|
|   20-Jan-99|      0.01|
|   26-Jan-99|      0.02|
|   22-Apr-99|      0.02|
|    3-Aug-87|       0.0|
|   17-Jun-99|      0.02|
|   18-Apr-09|       0.0|
|   23-Feb-89|      0.01|
|    2-Jun-89|      0.01|
|   14-Jul-09|      0.01|
|   10-Mar-00|      0.02|
|   16-Jun-00|      0.02|
+------------+----------+
only showing top 20 rows



In [62]:
# the full date has too much detail, so we will extract the month only
# first, remove the nulls
col_name = 'ApprovalDate'
print(f"Number of rows before removing nulls: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isNotNull())
loan_df_count = loan_df.count()
print(f"Number of rows after removing nulls: {loan_df_count}")
# split on '-', get the second element
df = loan_df.withColumn("ApprovalMonth", split(col(col_name), "-")[1])
# loan_df = loan_df.drop(col_name)
col_name = 'ApprovalMonth'
show_percentage_of_each_value_in_column(df, loan_df_count,"ApprovalMonth")

Number of rows before removing nulls: 899164
Number of rows after removing nulls: 899164
+-------------+----------+
|ApprovalMonth|percentage|
+-------------+----------+
|          Oct|      7.76|
|          Sep|      9.24|
|          Dec|      7.78|
|          Aug|      8.76|
|          May|      8.59|
|          Jun|      8.71|
|          Feb|      7.38|
|          Nov|      7.61|
|          Mar|       9.3|
|          Jan|      7.46|
|          Apr|      8.92|
|          Jul|      8.51|
+-------------+----------+



### 10. ApprovalFY - Fiscal Year of commitment
Drop the column as it is a date column and does not provide any information for the analysis.

In [None]:
loan_df = loan_df.drop('ApprovalFY')

### 11. Term - Loan term in months

In [63]:
col_name = 'Term'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+----+----------+
|Term|percentage|
+----+----------+
| 148|      0.02|
| 243|      0.06|
|  31|      0.22|
| 137|      0.02|
|  85|       0.2|
| 251|      0.01|
|  65|      0.27|
|  53|      0.27|
| 255|      0.02|
| 481|       0.0|
| 133|      0.02|
| 296|      0.01|
|  78|       0.3|
| 322|       0.0|
| 321|       0.0|
| 362|       0.0|
| 375|       0.0|
| 108|      0.44|
| 155|      0.02|
|  34|      0.23|
+----+----------+
only showing top 20 rows



### 12. NoEmp - Number of Business Employees

In [66]:
col_name = 'NoEmp'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|NoEmp|Percentage|
+-----+----------+
|    1|     17.16|
|    2|     15.38|
|    3|     10.08|
|    4|      8.19|
|    5|      6.71|
|    6|      5.09|
|   10|      3.51|
|    7|       3.5|
|    8|      3.49|
|   12|      2.32|
|   15|      2.04|
|    9|      2.02|
|   20|      1.59|
|   11|      1.31|
|   14|      1.19|
|   25|      1.11|
|   13|      1.04|
|   30|      0.96|
|   16|      0.87|
|   18|      0.87|
+-----+----------+
only showing top 20 rows



### 13. NewExist - 1 = Existing business, 2 = New business 

In [67]:
col_name = 'NewExist'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       1|     71.72|
|       2|     28.15|
|       0|      0.11|
|    NULL|      0.02|
+--------+----------+



In [68]:
# drop rows with 0 or Null
col_name = 'NewExist'
print(f"Number of rows before removing 0s and nulls: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name] != 0)
loan_df = loan_df.filter(loan_df[col_name].isNotNull())
loan_df_count = loan_df.count()
print(f"Number of rows after removing 0s and nulls: {loan_df_count}")

Number of rows before removing 0s and nulls: 899164
Number of rows after removing 0s and nulls: 897994


In [69]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       1|     71.81|
|       2|     28.19|
+--------+----------+



### 14. CreateJob - Number of jobs created

### 15. RetainedJob - Number of jobs retained

### 16. FranchiseCode - Franchise code, (00000 or 00001) = No franchise

### 17. UrbanRural - 1 = Urban, 2 = rural, 0 = undefined

In [70]:
col_name = 'UrbanRural'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+----------+----------+
|UrbanRural|Percentage|
+----------+----------+
|         1|     52.36|
|         0|     35.93|
|         2|     11.71|
+----------+----------+



### 18. RevLineCr - Revolving line of credit: Y = Yes, N = No

In [71]:
col_name = 'RevLineCr'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+---------+----------+
|RevLineCr|Percentage|
+---------+----------+
|        N|     46.69|
|        0|     28.68|
|        Y|     22.42|
|        T|       1.7|
|     NULL|       0.5|
|        7|       0.0|
|        3|       0.0|
|        Q|       0.0|
|        5|       0.0|
|        .|       0.0|
|        C|       0.0|
|        -|       0.0|
|        A|       0.0|
|        R|       0.0|
|        1|       0.0|
|        `|       0.0|
|        ,|       0.0|
|        4|       0.0|
|        2|       0.0|
+---------+----------+



In [72]:
# filter only N and Y
col_name = 'RevLineCr'
print(f"Number of rows before filtering: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isin('N', 'Y'))
loan_df_count = loan_df.count()
print(f"Number of rows after filtering: {loan_df_count}")
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

Number of rows before filtering: 897994
Number of rows after filtering: 620582
+---------+----------+
|RevLineCr|Percentage|
+---------+----------+
|        N|     67.56|
|        Y|     32.44|
+---------+----------+



### 19. LowDoc - LowDoc Loan Program: Y = Yes, N = No

In [73]:
col_name = "LowDoc"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------+----------+
|LowDoc|Percentage|
+------+----------+
|     N|     89.94|
|     Y|      9.24|
|  NULL|      0.41|
|     0|      0.19|
|     C|      0.11|
|     S|      0.09|
|     A|      0.02|
|     R|      0.01|
|     1|       0.0|
+------+----------+



In [74]:
# filter only N and Y
col_name = 'LowDoc'
print(f"Number of rows before filtering: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isin('N', 'Y'))
loan_df_count = loan_df.count()
print(f"Number of rows after filtering: {loan_df_count}")
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

Number of rows before filtering: 620582
Number of rows after filtering: 615482
+------+----------+
|LowDoc|Percentage|
+------+----------+
|     N|     90.69|
|     Y|      9.31|
+------+----------+



### 20. ChgOffDate - The date when a loan is declared to be in default
Drop the column due to the high number of missing values.

In [10]:
loan_df = loan_df.drop('ChgOffDate')

### 21. DisbursementDate - Date when loan was disbursed

In [None]:
loan_df = loan_df.drop('DisbursementDate')

### 22. DisbursementGross - Amount disbursed

In [None]:
loan_df = loan_df.drop('DisbursementGross')

### 23. BalanceGross - Gross amount outstanding

In [75]:
col_name = 'BalanceGross'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|BalanceGross|Percentage|
+------------+----------+
|      $0.00 |     100.0|
| $25,000.00 |       0.0|
|  $1,760.00 |       0.0|
| $84,617.00 |       0.0|
| $37,100.00 |       0.0|
|$827,875.00 |       0.0|
| $43,127.00 |       0.0|
|$996,262.00 |       0.0|
+------------+----------+



Drop as most of the values are 0

In [76]:
loan_df = loan_df.drop('BalanceGross')

### 24. MIS_Status - Target variable

Delete rows that have null target value (MIS_Status)

In [11]:
col_name ="MIS_Status"
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)

+----------+----------+
|MIS_Status|percentage|
+----------+----------+
|      NULL|      0.22|
|     P I F|     82.26|
|    CHGOFF|     17.52|
+----------+----------+



In [12]:
# drop rows with null values in MIS_Status column
loan_df = loan_df.dropna(subset=[col_name])
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)


+----------+----------+
|MIS_Status|percentage|
+----------+----------+
|     P I F|     82.26|
|    CHGOFF|     17.52|
+----------+----------+



### Replace target values with 0 and 1
Target value column is: MIS_Status
"P I F" = 1
"CHGOFF" = 0

In [13]:
loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "P I F", 1)
                   .otherwise(0)
                   .cast("int"))

Show the percentage of:
- Paid in full loans (approved loans), MIS_Status = 1
- Charged off loans (rejected loans), MIS_Status = 0

In [None]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)

### 25. ChgOffPrinGr - Charged-off amount

In [79]:
col_name = 'ChgOffPrinGr'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `ChgOffPrinGr` cannot be resolved. Did you mean one of the following? [`ChgOffDate`, `Bank`, `City`, `NewExist`, `RevLineCr`].;
'Aggregate ['ChgOffPrinGr], ['ChgOffPrinGr, (cast(count(1) as double) / cast(615482 as double)) AS Percentage#3572]
+- Project [LoanNr_ChkDgt#2254L, Name#2255, City#2256, State#2257, Zip#2258, Bank#2259, BankState#2260, NAICS#2261, ApprovalDate#2262, ApprovalFY#2263, Term#2264, NoEmp#2265, NewExist#2266, CreateJob#2267, RetainedJob#2268, FranchiseCode#2269, UrbanRural#2270, RevLineCr#2271, LowDoc#2272, ChgOffDate#2273, DisbursementDate#2274, DisbursementGross#2275, MIS_Status#2277, GrAppv#2279, SBA_Appv#2280]
   +- Project [LoanNr_ChkDgt#2254L, Name#2255, City#2256, State#2257, Zip#2258, Bank#2259, BankState#2260, NAICS#2261, ApprovalDate#2262, ApprovalFY#2263, Term#2264, NoEmp#2265, NewExist#2266, CreateJob#2267, RetainedJob#2268, FranchiseCode#2269, UrbanRural#2270, RevLineCr#2271, LowDoc#2272, ChgOffDate#2273, DisbursementDate#2274, DisbursementGross#2275, MIS_Status#2277, ChgOffPrinGr#2278, ... 2 more fields]
      +- Filter LowDoc#2272 IN (N,Y)
         +- Filter RevLineCr#2271 IN (N,Y)
            +- Filter isnotnull(NewExist#2266)
               +- Filter NOT (NewExist#2266 = 0)
                  +- Filter isnotnull(ApprovalDate#2262)
                     +- Filter isnotnull(ApprovalDate#2262)
                        +- Filter isnotnull(ApprovalDate#2262)
                           +- Filter isnotnull(ApprovalDate#2262)
                              +- Filter isnotnull(ApprovalDate#2262)
                                 +- Filter isnotnull(ApprovalDate#2262)
                                    +- Filter isnotnull(ApprovalDate#2262)
                                       +- Relation [LoanNr_ChkDgt#2254L,Name#2255,City#2256,State#2257,Zip#2258,Bank#2259,BankState#2260,NAICS#2261,ApprovalDate#2262,ApprovalFY#2263,Term#2264,NoEmp#2265,NewExist#2266,CreateJob#2267,RetainedJob#2268,FranchiseCode#2269,UrbanRural#2270,RevLineCr#2271,LowDoc#2272,ChgOffDate#2273,DisbursementDate#2274,DisbursementGross#2275,BalanceGross#2276,MIS_Status#2277,... 3 more fields] csv


Drop this column as it will leak info to the column, because if the value is 0, this means that the loan is charged off

In [None]:
loan_df = loan_df.drop('ChgOffPrinGr')

### 26. GrAppv - Gross amount of loan approved by bank

In [80]:
col_name = "GrAppv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|      GrAppv|Percentage|
+------------+----------+
| $50,000.00 |      8.78|
| $25,000.00 |      6.94|
|$100,000.00 |      6.09|
| $10,000.00 |      4.94|
| $20,000.00 |      3.15|
| $35,000.00 |      3.01|
| $30,000.00 |      2.65|
| $15,000.00 |      2.37|
|$150,000.00 |      2.32|
| $75,000.00 |      2.17|
| $40,000.00 |      1.75|
|  $5,000.00 |      1.65|
|$200,000.00 |      1.49|
|$250,000.00 |      1.33|
| $60,000.00 |       1.2|
|$300,000.00 |      0.91|
| $80,000.00 |      0.88|
|$500,000.00 |      0.86|
| $45,000.00 |      0.85|
| $70,000.00 |      0.78|
+------------+----------+
only showing top 20 rows



In [82]:
# remove $, replace , by ''
loan_df = loan_df.withColumn("clean_GrAppv", regexp_replace("GrAppv", "\$", ""))  # Remove $
loan_df = loan_df.withColumn("clean_GrAppv", regexp_replace("clean_GrAppv", ",", ""))  # Remove comma
loan_df = loan_df.withColumn("clean_GrAppv", col("clean_GrAppv").cast("float"))
col_name = "clean_GrAppv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|clean_GrAppv|Percentage|
+------------+----------+
|     50000.0|      8.78|
|     25000.0|      6.94|
|    100000.0|      6.09|
|     10000.0|      4.94|
|     20000.0|      3.15|
|     35000.0|      3.01|
|     30000.0|      2.65|
|     15000.0|      2.37|
|    150000.0|      2.32|
|     75000.0|      2.17|
|     40000.0|      1.75|
|      5000.0|      1.65|
|    200000.0|      1.49|
|    250000.0|      1.33|
|     60000.0|       1.2|
|    300000.0|      0.91|
|     80000.0|      0.88|
|    500000.0|      0.86|
|     45000.0|      0.85|
|     70000.0|      0.78|
+------------+----------+
only showing top 20 rows



### 27. SBA_Appv - SBA's guaranteed amount of approved loan
Drop as we don't know this amount in the future

In [83]:
loan_df = loan_df.drop('SBA_Appv')

In [84]:
loan_df.printSchema()

root
 |-- LoanNr_ChkDgt: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Bank: string (nullable = true)
 |-- BankState: string (nullable = true)
 |-- NAICS: integer (nullable = true)
 |-- ApprovalDate: string (nullable = true)
 |-- ApprovalFY: string (nullable = true)
 |-- Term: integer (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = true)
 |-- CreateJob: integer (nullable = true)
 |-- RetainedJob: integer (nullable = true)
 |-- FranchiseCode: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: string (nullable = true)
 |-- LowDoc: string (nullable = true)
 |-- ChgOffDate: string (nullable = true)
 |-- DisbursementDate: string (nullable = true)
 |-- DisbursementGross: string (nullable = true)
 |-- MIS_Status: string (nullable = true)
 |-- GrAppv: string (nullable = true)
 |-- clean_GrAppv

In [15]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count, 'MIS_Status')

+----------+----------+
|MIS_Status|percentage|
+----------+----------+
|         1|     82.26|
|         0|     17.52|
+----------+----------+



### Check duplicated rows based on all columns


In [16]:
# sum of duplicate rows
print("Number of duplicate rows in the dataframe:")
loan_df_duplicates = loan_df_count - loan_df.dropDuplicates().count()
print(loan_df_duplicates)

Number of duplicate rows in the dataframe:
0


# Exploratory Data Analysis (EDA)

In [23]:
def feature_vs_target(df, feature):
    target = "MIS_Status"
    # Calculate the percentage of points for each unique value of each feature compared to MIS_Status
    percentage_df = df.groupBy(feature, target).agg((count("*") / df.count()).alias("percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("percentage", round(col("percentage") * 100, 2))

    # Show result
    percentage_df.show()

In [24]:
feature_vs_target(loan_df, "NewExist")

+--------+----------+----------+
|NewExist|MIS_Status|percentage|
+--------+----------+----------+
|       1|     P I F|     59.31|
|       0|     P I F|      0.11|
|    NULL|      NULL|       0.0|
|       1|      NULL|      0.16|
|    NULL|     P I F|      0.01|
|       2|    CHGOFF|      5.27|
|    NULL|    CHGOFF|       0.0|
|       0|    CHGOFF|      0.01|
|       2|     P I F|     22.82|
|       2|      NULL|      0.06|
|       0|      NULL|       0.0|
|       1|    CHGOFF|     12.25|
+--------+----------+----------+



## Apply Map

## Apply ReduceByKey

## Save