# **Mount Drive**

In [1]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')
# os.chdir('/content/drive/My Drive/Colab Notebooks/Big Data Project')

In [2]:
# !pip install pyspark

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, countDistinct, isnan, when, count, round, substring_index, split, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType
import plotly.graph_objects as go


In [2]:
import pandas as pd
import datetime
import csv
from tabulate import tabulate



## Since we write local [*] in the master, it will use all cores in our machine. If we said local [4] it will work with 4 cores.

## getOrCreate is used to create a SparkSession if not present.

In [3]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("LoanApproval")\
    .getOrCreate()

In [4]:
sc=spark.sparkContext

## Read Data - SBAnational.csv

In [5]:
data_path="SBAnational.csv"

In [6]:
loan_df =  spark.read.csv(data_path, header=True, inferSchema=True, quote='"', escape='"', multiLine=True)

In [7]:
loan_df.show(5)
print('=====================')
print("Number of rows in the dataframe:")
loan_df_count = loan_df.count()
print(loan_df_count)
print('=====================')
print("Schema of the dataframe:")
loan_df.printSchema() #prints the dataframe schema
print('=====================')
print("Columns in the dataframe:")
print(loan_df.columns) 

+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|LoanNr_ChkDgt|                Name|        City|State|  Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|   1000014003|      ABC HOBBYCRAFT|  EVANSVILLE|

# Preprocessing and cleaning

### Report

In [10]:
# =========================================================================
# =========================================================================
# ============================= DF REPORT =================================
# =========================================================================
# =========================================================================
def report_df(df, header):
    # Calculate the total number of rows
    rdd_count = df.count()

    # Initialize lists to store column statistics
    col_names = []
    data_types = []
    unique_samples = []
    num_uniques = []
    nan_percentages = []
    report_data = []

    # Iterate over each column
    for col_name in header:
        print(col_name)
        # Append column name
        col_names.append(col_name)
        selected_col = col(col_name)
        selected_col_df = df.select(selected_col)

        # Determine data type
        dtype = selected_col_df.dtypes[0][1]
        data_types.append(dtype)
        distinct_df = selected_col_df.distinct()
        # Collect unique values
        unique_sample = [row[col_name] for row in distinct_df.limit(2).collect()]
        unique_samples.append(unique_sample)

        # Count number of unique values
        n_unique = distinct_df.count()
        num_uniques.append(n_unique)

        # Calculate percentage of NaN values
        none_percentage_val = df.filter(selected_col.isNull()).count() / rdd_count * 100
        nan_percentages.append(none_percentage_val)
        report_data.append([col_name, dtype, unique_sample, n_unique, none_percentage_val])

    return report_data

In [11]:
report_res = report_df(loan_df, loan_df.columns)


LoanNr_ChkDgt
Name


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\bemoi\miniconda3\envs\v38_env\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "c:\Users\bemoi\miniconda3\envs\v38_env\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "c:\Users\bemoi\miniconda3\envs\v38_env\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [12]:

# Display the result
column_names = ['Column', 'Type', 'Unique Sample', 'N Unique', '%None']
print(tabulate(report_res, headers=column_names, tablefmt='grid'))

NameError: name 'report_res' is not defined

In [8]:
def show_percentage_of_each_value_in_column(df, df_count,col_name):
    # Calculate percentage of 0s and 1s
    percentage_df = df.groupBy(col_name).agg((count("*") / df_count).alias("Percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("Percentage", round(col("Percentage") * 100, 2))

    # sort the dataframe by percentage descending
    percentage_df = percentage_df.sort(col("Percentage").desc())

    # Show result
    percentage_df.show()

### 1. LoanNr_ChkDgt - ID
Drop the column as it is an ID column and does not provide any information for the analysis.

In [9]:
loan_df = loan_df.drop('LoanNr_ChkDgt')

### 2. Name - Name of Borrower
Drop the column as it is a name column and does not provide any information for the analysis.

In [10]:
loan_df = loan_df.drop('Name')

### 3. City - City of Borrower


In [11]:
# Count the occurrences of each value in city column
col_name = 'City'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------------+----------+
|          City|Percentage|
+--------------+----------+
|   LOS ANGELES|      1.29|
|       HOUSTON|      1.14|
|      NEW YORK|      0.87|
|       CHICAGO|      0.67|
|         MIAMI|      0.62|
|     SAN DIEGO|       0.6|
|        DALLAS|      0.57|
|       PHOENIX|       0.5|
|     LAS VEGAS|      0.49|
|   SPRINGFIELD|      0.42|
|      BROOKLYN|      0.41|
|   SAN ANTONIO|      0.39|
|        AUSTIN|      0.39|
|        DENVER|      0.39|
|       SEATTLE|      0.39|
|SALT LAKE CITY|      0.39|
| SAN FRANCISCO|      0.37|
|      PORTLAND|      0.36|
|       ATLANTA|      0.35|
|      COLUMBUS|      0.35|
+--------------+----------+
only showing top 20 rows



In [12]:
# percentage of unique values in the city
city_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in City: {city_unique_count}")
city_percentage = city_unique_count / loan_df_count * 100
print(f"Percentage of unique values in City: {city_percentage:.2f}%")

Number of unique values in City: 32582
Percentage of unique values in City: 3.62%


In [13]:
unique_city_df = loan_df.select(col_name).groupBy(col_name).agg((count("*")).alias("Count")).sort(col("Count").desc())
unique_city_df.show()

+--------------+-----+
|          City|Count|
+--------------+-----+
|   LOS ANGELES|11558|
|       HOUSTON|10247|
|      NEW YORK| 7846|
|       CHICAGO| 6036|
|         MIAMI| 5594|
|     SAN DIEGO| 5363|
|        DALLAS| 5085|
|       PHOENIX| 4493|
|     LAS VEGAS| 4390|
|   SPRINGFIELD| 3738|
|      BROOKLYN| 3728|
|        DENVER| 3550|
|   SAN ANTONIO| 3515|
|SALT LAKE CITY| 3511|
|        AUSTIN| 3499|
|       SEATTLE| 3470|
| SAN FRANCISCO| 3365|
|      PORTLAND| 3193|
|      COLUMBUS| 3186|
|  PHILADELPHIA| 3178|
+--------------+-----+
only showing top 20 rows



### 4. State - State of Borrower

In [14]:
col_name = 'State'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|State|Percentage|
+-----+----------+
|   CA|     14.53|
|   TX|      7.84|
|   NY|      6.42|
|   FL|      4.58|
|   PA|      3.91|
|   OH|      3.63|
|   IL|       3.3|
|   MA|      2.81|
|   MN|      2.71|
|   NJ|      2.67|
|   WA|      2.59|
|   GA|      2.48|
|   WI|      2.34|
|   MO|       2.3|
|   CO|      2.29|
|   MI|      2.28|
|   UT|      2.09|
|   AZ|      1.96|
|   NC|      1.59|
|   IN|      1.57|
+-----+----------+
only showing top 20 rows



In [15]:
# percentage of unique values in the city
state_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in State: {state_unique_count}")
state_percentage = state_unique_count / loan_df_count * 100
print(f"Percentage of unique values in State: {state_percentage:.2f}%")

Number of unique values in State: 52
Percentage of unique values in State: 0.01%


### 5. Zip - Zip code of Borrower

In [16]:
col_name = 'Zip'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|  Zip|Percentage|
+-----+----------+
|10001|       0.1|
|90015|       0.1|
|93401|      0.09|
|90010|      0.08|
|33166|      0.07|
|90021|      0.07|
|59601|      0.07|
|65804|      0.07|
|84107|      0.06|
|85260|      0.06|
|92069|      0.06|
|84115|      0.06|
|90670|      0.06|
|11354|      0.06|
|92121|      0.06|
|89102|      0.06|
|92101|      0.06|
| 3801|      0.06|
|59101|      0.06|
|59102|      0.06|
+-----+----------+
only showing top 20 rows



In [17]:
# percentage of unique values in the city
zip_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in Zip: {zip_unique_count}")
zip_percentage = zip_unique_count / loan_df_count * 100
print(f"Percentage of unique values in Zip: {zip_percentage:.2f}%")

Number of unique values in Zip: 33611
Percentage of unique values in Zip: 3.74%


In [18]:
unique_zip_df = loan_df.select(col_name).groupBy(col_name).agg((count("*")).alias("Count")).sort(col("Count").desc())
unique_zip_df.show()

+-----+-----+
|  Zip|Count|
+-----+-----+
|10001|  933|
|90015|  926|
|93401|  806|
|90010|  733|
|33166|  671|
|90021|  666|
|59601|  640|
|65804|  599|
| 3801|  581|
|59101|  578|
|84115|  577|
|92121|  567|
|92101|  565|
|90670|  531|
|11354|  530|
|59102|  519|
|85260|  518|
|84107|  513|
|89102|  500|
|92069|  499|
+-----+-----+
only showing top 20 rows



### 6. Bank - Name of the bank that gave the loan

### 7. BankState - State of Bank

### 8. NAICS - North American Industry Classification System code for the industry where the business is located

### 9. ApprovalDate - Date SBA commitment issued

In [19]:
col_name = 'ApprovalDate'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|ApprovalDate|Percentage|
+------------+----------+
|    7-Jul-93|      0.13|
|   30-Jan-04|      0.11|
|    8-Jul-93|      0.09|
|   30-Sep-03|      0.07|
|    4-Oct-04|      0.07|
|   21-Jan-05|      0.06|
|   30-Jun-05|      0.06|
|   18-Apr-05|      0.06|
|    6-Jul-93|      0.06|
|   27-Sep-02|      0.06|
|    7-Nov-03|      0.05|
|   24-Mar-06|      0.05|
|   22-Aug-03|      0.05|
|    6-Dec-04|      0.05|
|   21-Feb-03|      0.05|
|    9-Feb-05|      0.05|
|   14-Nov-03|      0.05|
|   18-Jun-04|      0.05|
|   30-Sep-02|      0.05|
|   23-Mar-95|      0.05|
+------------+----------+
only showing top 20 rows



In [62]:
# the full date has too much detail, so we will extract the month only
# first, remove the nulls
col_name = 'ApprovalDate'
print(f"Number of rows before removing nulls: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isNotNull())
loan_df_count = loan_df.count()
print(f"Number of rows after removing nulls: {loan_df_count}")
# split on '-', get the second element
loan_df = loan_df.withColumn("ApprovalMonth", split(col(col_name), "-")[1])
# loan_df = loan_df.drop(col_name)
col_name = 'ApprovalMonth'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,"ApprovalMonth")

Number of rows before removing nulls: 613723


Number of rows after removing nulls: 613723
+-------------+----------+
|ApprovalMonth|Percentage|
+-------------+----------+
|          Sep|      9.27|
|          Mar|      9.18|
|          Apr|       9.0|
|          Aug|       8.8|
|          Jul|      8.69|
|          Jun|      8.55|
|          May|      8.54|
|          Oct|      7.84|
|          Dec|      7.68|
|          Nov|      7.68|
|          Jan|      7.59|
|          Feb|      7.18|
+-------------+----------+



### 10. ApprovalFY - Fiscal Year of commitment
Drop the column as it is a date column and does not provide any information for the analysis.

In [21]:
loan_df = loan_df.drop('ApprovalFY')

### 11. Term - Loan term in months

In [22]:
col_name = 'Term'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+----+----------+
|Term|Percentage|
+----+----------+
|  84|      25.6|
|  60|      10.0|
| 240|      9.56|
| 120|      8.64|
| 300|      4.97|
| 180|      3.13|
|  36|       2.2|
|  12|       1.9|
|  48|      1.74|
|  72|      1.05|
|  24|      0.83|
|  96|       0.8|
|  90|      0.76|
|  66|      0.58|
|  87|      0.55|
|  63|      0.51|
|  83|      0.46|
| 108|      0.44|
| 144|      0.36|
|   6|      0.34|
+----+----------+
only showing top 20 rows



### 12. NoEmp - Number of Business Employees

In [23]:
col_name = 'NoEmp'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|NoEmp|Percentage|
+-----+----------+
|    1|     17.16|
|    2|     15.38|
|    3|     10.08|
|    4|      8.19|
|    5|      6.71|
|    6|      5.09|
|   10|      3.51|
|    7|       3.5|
|    8|      3.49|
|   12|      2.32|
|   15|      2.04|
|    9|      2.02|
|   20|      1.59|
|   11|      1.31|
|   14|      1.19|
|   25|      1.11|
|   13|      1.04|
|   30|      0.96|
|   16|      0.87|
|   18|      0.87|
+-----+----------+
only showing top 20 rows



### 13. NewExist - 1 = Existing business, 2 = New business 

In [24]:
col_name = 'NewExist'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       1|     71.72|
|       2|     28.15|
|       0|      0.11|
|    NULL|      0.02|
+--------+----------+



Drop rows with 0 or Null

In [25]:
col_name = 'NewExist'
print(f"Number of rows before removing 0s and nulls: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name] != 0)
loan_df = loan_df.filter(loan_df[col_name].isNotNull())
loan_df_count = loan_df.count()
print(f"Number of rows after removing 0s and nulls: {loan_df_count}")

Number of rows before removing 0s and nulls: 899164


Number of rows after removing 0s and nulls: 897994


In [26]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       1|     71.81|
|       2|     28.19|
+--------+----------+



### 14. CreateJob - Number of jobs created

### 15. RetainedJob - Number of jobs retained

### 16. FranchiseCode - Franchise code, (00000 or 00001) = No franchise

In [64]:
col_name='FranchiseCode'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-------------+----------+
|FranchiseCode|Percentage|
+-------------+----------+
|            1|     63.41|
|            0|     32.38|
|        78760|      0.26|
|        21780|       0.1|
|        50564|      0.07|
|        22470|      0.06|
|        79140|      0.06|
|        68020|      0.06|
|        24850|      0.05|
|        67750|      0.05|
|        25650|      0.05|
|        10465|      0.04|
|        49952|      0.03|
|        68250|      0.03|
|        72590|      0.03|
|        26650|      0.03|
|         1560|      0.03|
|        10528|      0.03|
|        52875|      0.03|
|        38605|      0.03|
+-------------+----------+
only showing top 20 rows



We don't care about the franchise code, we only care if there is a franchise or not

In [65]:
# make 0 or 1 = 0, anything else = 1
loan_df = loan_df.withColumn("IsFranchise", when((col("FranchiseCode") == 0) | (col("FranchiseCode") == 1), 0).otherwise(1))

In [67]:
col_name = 'IsFranchise'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----------+----------+
|IsFranchise|Percentage|
+-----------+----------+
|          0|     95.79|
|          1|      4.21|
+-----------+----------+



### 17. UrbanRural - 1 = Urban, 2 = rural, 0 = undefined

In [27]:
col_name = 'UrbanRural'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+----------+----------+
|UrbanRural|Percentage|
+----------+----------+
|         1|     52.36|
|         0|     35.93|
|         2|     11.71|
+----------+----------+



### 18. RevLineCr - Revolving line of credit: Y = Yes, N = No

In [28]:
col_name = 'RevLineCr'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+---------+----------+
|RevLineCr|Percentage|
+---------+----------+
|        N|     46.69|
|        0|     28.68|
|        Y|     22.42|
|        T|       1.7|
|     NULL|       0.5|
|        7|       0.0|
|        3|       0.0|
|        Q|       0.0|
|        5|       0.0|
|        .|       0.0|
|        C|       0.0|
|        -|       0.0|
|        A|       0.0|
|        R|       0.0|
|        1|       0.0|
|        `|       0.0|
|        ,|       0.0|
|        4|       0.0|
|        2|       0.0|
+---------+----------+



Filter only N and Y

In [29]:
col_name = 'RevLineCr'
print(f"Number of rows before filtering: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isin('N', 'Y'))
loan_df_count = loan_df.count()
print(f"Number of rows after filtering: {loan_df_count}")
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

Number of rows before filtering: 897994
Number of rows after filtering: 620582
+---------+----------+
|RevLineCr|Percentage|
+---------+----------+
|        N|     67.56|
|        Y|     32.44|
+---------+----------+



### 19. LowDoc - LowDoc Loan Program: Y = Yes, N = No

In [30]:
col_name = "LowDoc"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------+----------+
|LowDoc|Percentage|
+------+----------+
|     N|     89.94|
|     Y|      9.24|
|  NULL|      0.41|
|     0|      0.19|
|     C|      0.11|
|     S|      0.09|
|     A|      0.02|
|     R|      0.01|
|     1|       0.0|
+------+----------+



Filter only N and Y

In [31]:
col_name = 'LowDoc'
print(f"Number of rows before filtering: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isin('N', 'Y'))
loan_df_count = loan_df.count()
print(f"Number of rows after filtering: {loan_df_count}")
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

Number of rows before filtering: 620582


Number of rows after filtering: 615482
+------+----------+
|LowDoc|Percentage|
+------+----------+
|     N|     90.69|
|     Y|      9.31|
+------+----------+



### 20. ChgOffDate - The date when a loan is declared to be in default
Drop the column due to the high number of missing values.

In [32]:
loan_df = loan_df.drop('ChgOffDate')

### 21. DisbursementDate - Date when loan was disbursed

In [33]:
loan_df = loan_df.drop('DisbursementDate')

### 22. DisbursementGross - Amount disbursed

In [34]:
loan_df = loan_df.drop('DisbursementGross')

### 23. BalanceGross - Gross amount outstanding

In [35]:
col_name = 'BalanceGross'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|BalanceGross|Percentage|
+------------+----------+
|      $0.00 |     100.0|
| $25,000.00 |       0.0|
|  $1,760.00 |       0.0|
| $84,617.00 |       0.0|
| $37,100.00 |       0.0|
|$827,875.00 |       0.0|
| $43,127.00 |       0.0|
|$996,262.00 |       0.0|
+------------+----------+



Drop as most of the values are 0

In [36]:
loan_df = loan_df.drop('BalanceGross')

### 24. MIS_Status - Target variable

Delete rows that have null target value (MIS_Status)

In [37]:
col_name ="MIS_Status"
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)

+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|     P I F|     81.69|
|    CHGOFF|     18.02|
|      NULL|      0.29|
+----------+----------+



In [38]:
# drop rows with null values in MIS_Status column
loan_df = loan_df.dropna(subset=[col_name])
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)


+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|     P I F|     81.69|
|    CHGOFF|     18.02|
+----------+----------+



### Replace target values with 0 and 1
Target value column is: MIS_Status
"P I F" = 1
"CHGOFF" = 0

In [39]:
loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "P I F", 1)
                   .otherwise(0)
                   .cast("int"))

Show the percentage of:
- Paid in full loans (approved loans), MIS_Status = 1
- Charged off loans (rejected loans), MIS_Status = 0

In [40]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)

+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|         1|     81.69|
|         0|     18.02|
+----------+----------+



### 25. ChgOffPrinGr - Charged-off amount

In [41]:
col_name = 'ChgOffPrinGr'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|ChgOffPrinGr|Percentage|
+------------+----------+
|      $0.00 |     81.28|
| $50,000.00 |      0.29|
| $10,000.00 |      0.22|
| $25,000.00 |      0.21|
| $35,000.00 |      0.18|
|$100,000.00 |      0.14|
| $20,000.00 |      0.08|
| $30,000.00 |      0.07|
| $15,000.00 |      0.06|
|  $5,000.00 |      0.05|
| $75,000.00 |      0.04|
| $40,000.00 |      0.03|
| $45,000.00 |      0.02|
| $24,999.00 |      0.02|
| $28,000.00 |      0.01|
| $12,500.00 |      0.01|
| $95,000.00 |      0.01|
| $24,000.00 |      0.01|
| $49,999.00 |      0.01|
|$250,000.00 |      0.01|
+------------+----------+
only showing top 20 rows



Drop this column as it will leak info to the column, because if the value is 0, this means that the loan is charged off

In [42]:
loan_df = loan_df.drop('ChgOffPrinGr')

### 26. GrAppv - Gross amount of loan approved by bank

In [43]:
col_name = "GrAppv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|      GrAppv|Percentage|
+------------+----------+
| $50,000.00 |      8.78|
| $25,000.00 |      6.93|
|$100,000.00 |      6.09|
| $10,000.00 |      4.93|
| $20,000.00 |      3.14|
| $35,000.00 |       3.0|
| $30,000.00 |      2.65|
| $15,000.00 |      2.37|
|$150,000.00 |      2.32|
| $75,000.00 |      2.16|
| $40,000.00 |      1.75|
|  $5,000.00 |      1.61|
|$200,000.00 |      1.49|
|$250,000.00 |      1.33|
| $60,000.00 |       1.2|
|$300,000.00 |      0.91|
| $80,000.00 |      0.88|
|$500,000.00 |      0.86|
| $45,000.00 |      0.85|
| $70,000.00 |      0.78|
+------------+----------+
only showing top 20 rows



#### Clean this column
- Remove $
- Remove ,
- Convert to float

In [44]:
loan_df = loan_df.withColumn("clean_GrAppv", regexp_replace("GrAppv", "\$", ""))  # Remove $
loan_df = loan_df.withColumn("clean_GrAppv", regexp_replace("clean_GrAppv", ",", ""))  # Remove comma
loan_df = loan_df.withColumn("clean_GrAppv", col("clean_GrAppv").cast("float"))
col_name = "clean_GrAppv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|clean_GrAppv|Percentage|
+------------+----------+
|     50000.0|      8.78|
|     25000.0|      6.93|
|    100000.0|      6.09|
|     10000.0|      4.93|
|     20000.0|      3.14|
|     35000.0|       3.0|
|     30000.0|      2.65|
|     15000.0|      2.37|
|    150000.0|      2.32|
|     75000.0|      2.16|
|     40000.0|      1.75|
|      5000.0|      1.61|
|    200000.0|      1.49|
|    250000.0|      1.33|
|     60000.0|       1.2|
|    300000.0|      0.91|
|     80000.0|      0.88|
|    500000.0|      0.86|
|     45000.0|      0.85|
|     70000.0|      0.78|
+------------+----------+
only showing top 20 rows



### 27. SBA_Appv - SBA's guaranteed amount of approved loan
Drop as we don't know this amount in the future

In [45]:
loan_df = loan_df.drop('SBA_Appv')

### Final schema

In [46]:
loan_df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Bank: string (nullable = true)
 |-- BankState: string (nullable = true)
 |-- NAICS: integer (nullable = true)
 |-- ApprovalDate: string (nullable = true)
 |-- Term: integer (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = true)
 |-- CreateJob: integer (nullable = true)
 |-- RetainedJob: integer (nullable = true)
 |-- FranchiseCode: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: string (nullable = true)
 |-- LowDoc: string (nullable = true)
 |-- MIS_Status: integer (nullable = false)
 |-- GrAppv: string (nullable = true)
 |-- clean_GrAppv: float (nullable = true)



### Check duplicated rows based on all columns


In [47]:
print("Number of duplicate rows in the dataframe:")
loan_df_duplicates = loan_df_count - loan_df.dropDuplicates().count()
print(loan_df_duplicates)

Number of duplicate rows in the dataframe:


1991


### Final DF Count

In [48]:
loan_df_count = loan_df.count()
print(f"Final DF count: {loan_df_count}")

Final DF count: 613723


# Exploratory Data Analysis (EDA)

In [56]:
def feature_vs_target(df, feature):
    target = "MIS_Status"
    # Calculate the percentage of points for each unique value of each feature compared to MIS_Status
    percentage_df = df.groupBy(feature, target).agg((count("*") / df.count()).alias("Percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("Percentage", round(col("Percentage") * 100, 2))

    # Show result
    percentage_df.show()

    # Convert DataFrame to list
    data = percentage_df.collect()

    # Separate data by target
    data_0 = [row['Percentage'] for row in data if row[target] == 0]
    data_1 = [row['Percentage'] for row in data if row[target] == 1]

    # Create traces
    trace0 = go.Bar(x=[row[feature] for row in data if row[target] == 0], y=data_0, name='0', marker_color='red')
    trace1 = go.Bar(x=[row[feature] for row in data if row[target] == 1], y=data_1, name='1', marker_color='blue')

    # Create layout
    layout = go.Layout(barmode='stack', title='Percentage Distribution of ' + feature + ' vs ' + target)

    # Create figure and add traces
    fig = go.Figure(data=[trace0, trace1], layout=layout)

    # Plot
    fig.show()

### 9. ApprovalMonth

In [63]:
feature_vs_target(loan_df,"ApprovalMonth")

+-------------+----------+----------+
|ApprovalMonth|MIS_Status|Percentage|
+-------------+----------+----------+
|          Feb|         1|      5.76|
|          May|         1|      7.01|
|          Oct|         1|      6.39|
|          Dec|         1|      6.28|
|          Sep|         1|      7.73|
|          Dec|         0|       1.4|
|          Nov|         1|      6.23|
|          Mar|         0|       1.7|
|          Nov|         0|      1.45|
|          Feb|         0|      1.42|
|          Oct|         0|      1.45|
|          Apr|         0|      1.56|
|          Jun|         0|      1.54|
|          Jan|         0|       1.4|
|          May|         0|      1.52|
|          Mar|         1|      7.48|
|          Sep|         0|      1.54|
|          Jul|         0|      1.48|
|          Aug|         0|      1.61|
|          Apr|         1|      7.44|
+-------------+----------+----------+
only showing top 20 rows



### 13. NewExist

In [57]:
feature_vs_target(loan_df, "NewExist")

+--------+----------+----------+
|NewExist|MIS_Status|Percentage|
+--------+----------+----------+
|       1|         0|     13.11|
|       1|         1|     59.83|
|       2|         1|     22.09|
|       2|         0|      4.96|
+--------+----------+----------+



### 16. IsFranchise

In [68]:
feature_vs_target(loan_df, "IsFranchise")

+-----------+----------+----------+
|IsFranchise|MIS_Status|Percentage|
+-----------+----------+----------+
|          1|         0|      0.49|
|          1|         1|      3.71|
|          0|         0|     17.58|
|          0|         1|     78.21|
+-----------+----------+----------+



### 17. UrbanRural

In [58]:
feature_vs_target(loan_df, "UrbanRural")

+----------+----------+----------+
|UrbanRural|MIS_Status|Percentage|
+----------+----------+----------+
|         1|         0|     13.26|
|         1|         1|     34.68|
|         0|         0|      2.65|
|         2|         1|      8.44|
|         0|         1|      38.8|
|         2|         0|      2.17|
+----------+----------+----------+



### 18. RevLineCr

In [59]:
feature_vs_target(loan_df, "RevLineCr")

+---------+----------+----------+
|RevLineCr|MIS_Status|Percentage|
+---------+----------+----------+
|        N|         1|     57.66|
|        Y|         1|     24.26|
|        Y|         0|      8.27|
|        N|         0|      9.81|
+---------+----------+----------+



### 19. LowDoc

In [60]:
feature_vs_target(loan_df, "LowDoc")

+------+----------+----------+
|LowDoc|MIS_Status|Percentage|
+------+----------+----------+
|     N|         1|     73.04|
|     Y|         1|      8.88|
|     Y|         0|      0.44|
|     N|         0|     17.64|
+------+----------+----------+



## Apply Map

## Apply ReduceByKey

## Save