In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col,first, countDistinct, isnan, when, count, round, substring_index,substring, split, regexp_replace, udf
from tabulate import tabulate
import pandas as pd



 ## Since we write local [*] in the master, it will use all cores in our machine. If we said local [4] it will work with 4 cores.



 ## getOrCreate is used to create a SparkSession if not present.

In [2]:
print_reports = False

In [3]:

spark=SparkSession.builder\
    .master("local[*]")\
    .appName("LoanApproval")\
    .getOrCreate()


In [4]:

sc=spark.sparkContext


 ## Read Data - SBAnational.csv

In [5]:

data_path="../data/SBAnational.csv"


In [6]:

loan_df =  spark.read.csv(data_path, header=True, inferSchema=True, quote='"', escape='"', multiLine=True)


In [7]:

loan_df.show(5)
print('=====================')
print("Number of rows in the dataframe:")
print('=====================')
loan_df_count = loan_df.count()
print(loan_df_count)
print('=====================')
print("Number of columns in the dataframe:")
print('=====================')
print(len(loan_df.columns))
print('=====================')
print("Schema of the dataframe:")
print('=====================')
loan_df.printSchema() #prints the dataframe schema
print('=====================')
print("Columns in the dataframe:")
print('=====================')
print(loan_df.columns) 


+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|LoanNr_ChkDgt|                Name|        City|State|  Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|   1000014003|      ABC HOBBYCRAFT|  EVANSVILLE|

In [8]:
loan_df.describe().show()

+-------+--------------------+---------------+------------------+------+------------------+--------------------+---------+------------------+------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+----------+----------------+-----------------+------------+----------+------------+--------------+--------------+
|summary|       LoanNr_ChkDgt|           Name|              City| State|               Zip|                Bank|BankState|             NAICS|ApprovalDate|        ApprovalFY|              Term|             NoEmp|          NewExist|         CreateJob|       RetainedJob|     FranchiseCode|        UrbanRural|           RevLineCr|              LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|        GrAppv|      SBA_Appv|
+-------+--------------------+---------------+------------------+------+----

 # Preprocessing and cleaning

 ### Report

In [9]:
# =========================================================================
# =========================================================================
# ============================= DF REPORT =================================
# =========================================================================
# =========================================================================
def report_df(df, header):
    # Calculate the total number of rows
    rdd_count = df.count()

    # Initialize lists to store column statistics
    col_names = []
    data_types = []
    unique_samples = []
    num_uniques = []
    nan_percentages = []
    report_data = []

    # Iterate over each column
    for col_name in header:
        print(col_name)
        # Append column name
        col_names.append(col_name)
        selected_col = col(col_name)
        selected_col_df = df.select(selected_col)

        # Determine data type
        dtype = selected_col_df.dtypes[0][1]
        data_types.append(dtype)
        distinct_df = selected_col_df.distinct()
        # Collect unique values
        unique_sample = [row[col_name] for row in distinct_df.limit(2).collect()]
        unique_samples.append(unique_sample)

        # Count number of unique values
        n_unique = distinct_df.count()
        num_uniques.append(n_unique)

        # Calculate percentage of NaN values
        none_percentage_val = df.filter(selected_col.isNull()).count() / rdd_count * 100
        nan_percentages.append(none_percentage_val)
        report_data.append([col_name, dtype, unique_sample, n_unique, none_percentage_val])

    return report_data


In [10]:
if print_reports:
    report_res = report_df(loan_df, loan_df.columns)
    # Display the result
    column_names = ['Column', 'Type', 'Unique Sample', 'N Unique', '%None']
    print(tabulate(report_res, headers=column_names, tablefmt='grid'))




In [11]:

def show_percentage_of_each_value_in_column(df, df_count,col_name,show_num=10):
    # Calculate percentage of 0s and 1s
    percentage_df = df.groupBy(col_name).agg((count("*") / df_count).alias("Percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("Percentage", round(col("Percentage") * 100, 2))

    # sort the dataframe by percentage descending
    percentage_df = percentage_df.sort(col("Percentage").desc())

    # Show result
    percentage_df.show(show_num)

def show_df_where_col_isnull(df, col_name, show_num):
    # Filter rows where 'Name' column is null
    filtered_df = df.filter(col(col_name).isNull())

    # Show the resulting DataFrame
    filtered_df.show(show_num)
    null_count = filtered_df.count()
    print(f"Null Count: {null_count}")

def print_unique_val_num_in_col(df, col_name):
    # percentage of unique values in the city
    unique_count = loan_df.select(col_name).distinct().count()
    print(f"Number of unique values in {col_name}: {unique_count}")
    percentage = unique_count / loan_df_count * 100
    print(f"Percentage of unique values in {col_name}: {percentage:.2f}%")

 ### 1. LoanNr_ChkDgt - ID

 Drop the column as it is an ID column and does not provide any information for the analysis.

In [12]:

loan_df = loan_df.drop('LoanNr_ChkDgt')


 ### 2. Name - Name of Borrower

 Drop the column as it is a name column and does not provide any information for the analysis.

In [13]:

col_name = 'Name'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name, show_num=5)

+-------------------+----------+
|               Name|Percentage|
+-------------------+----------+
|             SUBWAY|      0.14|
|      QUIZNO'S SUBS|      0.05|
|     DOMINO'S PIZZA|      0.04|
|COLD STONE CREAMERY|      0.04|
|        DAIRY QUEEN|      0.04|
+-------------------+----------+
only showing top 5 rows



In [14]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+-----------+-----------+
|Name|      City|State|  Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|     GrAppv|   SBA_Appv|
+----+----------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+-----------+-----------+
|NULL|      TROY|   NY|12180|MANUFACTURERS & T...|       NY|     0|   16-Oct-98|      1999|  60|    2|       1|        0|          0|            1|    

In [15]:
# Fill null values in the 'Name' column with 'Unknown Company'
loan_df = loan_df.fillna({col_name: 'Unknown Name'})
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
|Name|City|State|Zip|Bank|BankState|NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+


In [16]:

print_unique_val_num_in_col(loan_df, col_name)


Number of unique values in Name: 779587
Percentage of unique values in Name: 86.70%


 Drop as most of the names are unique

In [17]:

# loan_df = loan_df.drop('Name')


 ### 3. City - City of Borrower



In [18]:

# Count the occurrences of each value in city column
col_name = 'City'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name, show_num=5)


+-----------+----------+
|       City|Percentage|
+-----------+----------+
|LOS ANGELES|      1.29|
|    HOUSTON|      1.14|
|   NEW YORK|      0.87|
|    CHICAGO|      0.67|
|      MIAMI|      0.62|
+-----------+----------+
only showing top 5 rows



In [19]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+--------------------+----+-----+-----+--------------------+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|                Name|City|State|  Zip|                Bank|BankState|NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+--------------------+----+-----+-----+--------------------+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|  BUSATH PHOTOGRAPHY|NULL|   UT|84109|MOUNTAIN W. SMALL...|       UT|    0|   15-Dec-81|      1982| 300|    2|       

In [20]:
# Fill null values in the 'Name' column with 'Unknown Company'
loan_df = loan_df.fillna({col_name: 'Unknown City'})
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
|Name|City|State|Zip|Bank|BankState|NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+


In [21]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in City: 32582
Percentage of unique values in City: 3.62%


In [22]:

unique_city_df = loan_df.select(col_name).groupBy(col_name).agg((count("*")).alias("Count")).sort(col("Count").desc())
unique_city_df.show()


+--------------+-----+
|          City|Count|
+--------------+-----+
|   LOS ANGELES|11558|
|       HOUSTON|10247|
|      NEW YORK| 7846|
|       CHICAGO| 6036|
|         MIAMI| 5594|
|     SAN DIEGO| 5363|
|        DALLAS| 5085|
|       PHOENIX| 4493|
|     LAS VEGAS| 4390|
|   SPRINGFIELD| 3738|
|      BROOKLYN| 3728|
|        DENVER| 3550|
|   SAN ANTONIO| 3515|
|SALT LAKE CITY| 3511|
|        AUSTIN| 3499|
|       SEATTLE| 3470|
| SAN FRANCISCO| 3365|
|      PORTLAND| 3193|
|      COLUMBUS| 3186|
|  PHILADELPHIA| 3178|
+--------------+-----+
only showing top 20 rows



 ### 4. State - State of Borrower

In [23]:

col_name = 'State'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+-----+----------+
|State|Percentage|
+-----+----------+
|   CA|     14.53|
|   TX|      7.84|
|   NY|      6.42|
|   FL|      4.58|
|   PA|      3.91|
|   OH|      3.63|
|   IL|       3.3|
|   MA|      2.81|
|   MN|      2.71|
|   NJ|      2.67|
+-----+----------+
only showing top 10 rows



In [24]:
print_unique_val_num_in_col(loan_df, col_name)


Number of unique values in State: 52
Percentage of unique values in State: 0.01%


In [25]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+--------------------+------------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|                Name|              City|State|  Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+--------------------+------------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|SO. JERSEY DANCE/...|        PENNSVILLE| NULL| 8070|                NULL

In [26]:
# Filter rows where 'State' column is null
filtered_df = loan_df.filter(col(col_name).isNull())

# Extract unique values from 'Zip' column
unique_zips = filtered_df.select('Zip').distinct().collect()

# Extract unique zip codes as a list
unique_zip_list = [row['Zip'] for row in unique_zips]

# Print unique zip codes
print(unique_zip_list)

[54205, 76052, 76645, 67219, 33410, 75236, 84124, 79925, 8070, 95682, 65049, 96205, 54025, 0]


In [27]:
# Sort the DataFrame by 'Zip Code' in ascending order
df_sorted = loan_df.orderBy('Zip')

# Define a window specification for the group
window_spec = Window.partitionBy('Zip').orderBy('Zip')

# Fill the null 'State' values with the corresponding non-null 'State' value within each group
loan_df = df_sorted.withColumn(col_name, first(col_name, ignorenulls=True).over(window_spec))

# Show the resulting DataFrame
loan_df.show()

+--------------------+------------------+-----+---+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+--------------+------------+
|                Name|              City|State|Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|        GrAppv|    SBA_Appv|
+--------------------+------------------+-----+---+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+--------------+------------+
| Grass Valley Market|      Grass Valley|   OR|  0|WELLS FARGO BANK ...| 

In [28]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+-----------------+----------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+-----------+-----------+
|             Name|            City|State|  Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|     GrAppv|   SBA_Appv|
+-----------------+----------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+-----------+-----------+
|THE COMPUTER EDGE|BOX 267, APO  AP| NULL|96205|RTC/WESTPORT SAVI...|       CA|541511|   21-Ja

Only 1 state with null values exist, so it is easy to look it up using its zip code.
Zip code 96205 exists in AP.

In [29]:
loan_df = loan_df.fillna({col_name: 'AP'})
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
|Name|City|State|Zip|Bank|BankState|NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+


 ### 5. Zip - Zip code of Borrower

In [30]:

col_name = 'Zip'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+-----+----------+
|  Zip|Percentage|
+-----+----------+
|10001|       0.1|
|90015|       0.1|
|93401|      0.09|
|90010|      0.08|
|33166|      0.07|
|90021|      0.07|
|65804|      0.07|
|59601|      0.07|
|92069|      0.06|
|84107|      0.06|
+-----+----------+
only showing top 10 rows



In [31]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in Zip: 33611
Percentage of unique values in Zip: 3.74%


Least zip code in the US starts with 502, so any value less than this is unvalid.

In [32]:
# Filter rows where 'Zip' column is less than 501
filtered_df = loan_df.filter(col('Zip') < 501)

# Extract unique values from the filtered 'Zip' column
unique_zips = filtered_df.select('Zip').distinct().collect()
all_zips = filtered_df.select('Zip').collect()

# Extract unique zip codes as a list
unique_zip_list = [row['Zip'] for row in unique_zips]
all_zip_list = [row['Zip'] for row in all_zips]

# Print unique zip codes
print("Unique Zip codes less than 501:")
print(unique_zip_list)
print(f"Count: {len(all_zip_list)}")

Unique Zip codes less than 501:
[211, 417, 128, 1, 182, 6, 3, 432, 5, 92, 207, 9, 290, 301, 4, 8, 7, 465, 438, 459, 38, 477, 98, 345, 204, 260, 2, 0]
Count: 400


In [33]:
# Filter rows where 'Zip' column is less than 501
loan_df = loan_df.filter(loan_df[col_name] >= 501)

In [34]:

unique_zip_df = loan_df.select(col_name).groupBy(col_name).agg((count("*")).alias("Count")).sort(col("Count").desc())
unique_zip_df.show()


+-----+-----+
|  Zip|Count|
+-----+-----+
|10001|  933|
|90015|  926|
|93401|  806|
|90010|  733|
|33166|  671|
|90021|  666|
|59601|  640|
|65804|  599|
| 3801|  581|
|59101|  578|
|84115|  577|
|92121|  567|
|92101|  565|
|90670|  531|
|11354|  530|
|59102|  519|
|85260|  518|
|84107|  513|
|89102|  500|
|92069|  499|
+-----+-----+
only showing top 20 rows



Cast Zip to string to treat is as a categorical feature.

In [35]:
# Cast the 'Zip' column to string
loan_df = loan_df.withColumn('Zip', col('Zip').cast('string'))

In [36]:

# loan_df = loan_df.drop(col_name)


 ### 6. Bank - Name of the bank that gave the loan

In [37]:

col_name = 'Bank'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------------------+----------+
|                Bank|Percentage|
+--------------------+----------+
|BANK OF AMERICA N...|      9.66|
|WELLS FARGO BANK ...|      7.06|
|JPMORGAN CHASE BA...|      5.35|
|U.S. BANK NATIONA...|      3.91|
|CITIZENS BANK NAT...|       3.9|
|PNC BANK, NATIONA...|      3.04|
|           BBCN BANK|      2.55|
|CAPITAL ONE NATL ...|      2.47|
|MANUFACTURERS & T...|      1.25|
|READYCAP LENDING,...|      1.19|
+--------------------+----------+
only showing top 10 rows



In [38]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+--------------------+-------------+-----+----+----+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|                Name|         City|State| Zip|Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+--------------------+-------------+-----+----+----+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|BERKSHIRE HARDWOO...| CHESTERFIELD|   MA|1012|NULL|     NULL|     0|    5-Dec-89|      1990| 180|   14|       1|        0|          0|    

In [39]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in Bank: 5803
Percentage of unique values in Bank: 0.65%


In [40]:
# Fill null values in the 'Bank' column with 'Unknown Bank'
loan_df = loan_df.fillna({col_name: 'Unknown Bank'})
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
|Name|City|State|Zip|Bank|BankState|NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+


 ### 7. BankState - State of Bank

In [41]:

col_name = 'BankState'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+---------+----------+
|BankState|Percentage|
+---------+----------+
|       CA|     13.13|
|       NC|      8.84|
|       IL|      7.33|
|       OH|       6.5|
|       SD|      5.68|
|       TX|      5.31|
|       RI|      5.04|
|       NY|       4.4|
|       VA|      3.22|
|       DE|      2.73|
+---------+----------+
only showing top 10 rows



In [42]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+--------------------+-------------+-----+----+------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|                Name|         City|State| Zip|        Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+--------------------+-------------+-----+----+------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|BERKSHIRE HARDWOO...| CHESTERFIELD|   MA|1012|Unknown Bank|     NULL|     0|    5-Dec-89|      1990| 180|   14|   

In [43]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in BankState: 57
Percentage of unique values in BankState: 0.01%


Drop nulls as we cant populate them.

In [44]:
loan_df = loan_df.dropna(subset=[col_name])

In [45]:
show_df_where_col_isnull(loan_df, "BankState", show_num=5)

+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
|Name|City|State|Zip|Bank|BankState|NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+


 ### 8. NAICS - North American Industry Classification System code for the industry where the business is located

In [46]:

col_name='NAICS'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------+----------+
| NAICS|Percentage|
+------+----------+
|     0|     22.34|
|722110|      3.11|
|722211|      2.16|
|811111|      1.62|
|621210|      1.56|
|624410|      1.12|
|812112|      1.03|
|561730|      0.99|
|621310|      0.97|
|812320|      0.88|
+------+----------+
only showing top 10 rows



In [47]:

print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in NAICS: 1312
Percentage of unique values in NAICS: 0.15%


In [48]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
|Name|City|State|Zip|Bank|BankState|NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+
+----+----+-----+---+----+---------+-----+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+


In [49]:

# Convert NAICS code into related sector

# Extract first two characters of NAICS code
first_two_chars = substring(loan_df["NAICS"], 1, 2)
# print(first_two_chars)[0]

# Apply mapping using when and otherwise
loan_df = loan_df.withColumn("Sector",
    first_two_chars
)
loan_df = loan_df.drop("NAICS")
col_name='Sector'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+------+----------+
|Sector|Percentage|
+------+----------+
|     0|     22.34|
|    44|      9.41|
|    81|      8.07|
|    54|      7.57|
|    72|      7.52|
|    23|       7.4|
|    62|      6.15|
|    42|      5.41|
|    45|      4.72|
|    33|      4.25|
+------+----------+
only showing top 10 rows



In [50]:

naics_to_sector = {
    11: 'Agriculture, Forestry, Fishing and Hunting',
    21: 'Mining, Quarrying, and Oil and Gas Extraction',
    22: 'Utilities',
    23: 'Construction',
    31: 'Manufacturing',
    32: 'Manufacturing',
    33: 'Manufacturing',
    42: 'Wholesale Trade',
    44: 'Retail Trade',
    45: 'Retail Trade',
    48: 'Transportation and Warehousing',
    49: 'Transportation and Warehousing',
    51: 'Information',
    52: 'Finance and Insurance',
    53: 'Real Estate and Rental and Leasing',
    54: 'Professional, Scientific, and Technical Services',
    55: 'Management of Companies and Enterprises',
    56: 'Administrative and Support and Waste Management and Remediation Services',
    61: 'Educational Services',
    62: 'Health Care and Social Assistance',
    71: 'Arts, Entertainment, and Recreation',
    72: 'Accommodation and Food Services',
    81: 'Other Services (except Public Administration)',
    92: 'Public Administration'
}

loan_df = loan_df.withColumn(col_name, 
                             when(col(col_name) == 32, 31)
                             .when(col(col_name) == 33, 31)
                             .when(col(col_name) == 45, 44)
                             .when(col(col_name) == 49, 48)
                             .otherwise(col(col_name)))
# Cast the 'Zip' column to string
# loan_df = loan_df.withColumn('Sector', col('Sector').cast('string'))
# # Convert NAICS codes to their corresponding sectors
# loan_df_temp = loan_df.withColumn(col_name, 
#                    when(col(col_name).isin(list(naics_to_sector.keys())), 
#                         naics_to_sector[col(col_name).cast("int")])  # Cast to int before accessing dictionary
#                    .otherwise("Unknown"))

# loan_df_temp.show(5)


In [51]:
loan_df =loan_df.dropna(subset=["Sector"]) 

In [52]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+------+----------+
|Sector|Percentage|
+------+----------+
|     0|     22.34|
|    44|     14.13|
|    81|      8.07|
|    54|      7.57|
|    31|      7.55|
|    72|      7.52|
|    23|       7.4|
|    62|      6.15|
|    42|      5.41|
|    56|      3.63|
+------+----------+
only showing top 10 rows



 ### 9. ApprovalDate - Date SBA commitment issued

In [53]:

col_name = 'ApprovalDate'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+------------+----------+
|ApprovalDate|Percentage|
+------------+----------+
|    7-Jul-93|      0.12|
|   30-Jan-04|      0.11|
|    8-Jul-93|      0.09|
|   30-Sep-03|      0.07|
|    4-Oct-04|      0.07|
|   21-Jan-05|      0.06|
|   30-Jun-05|      0.06|
|   18-Apr-05|      0.06|
|    6-Jul-93|      0.06|
|   27-Sep-02|      0.06|
+------------+----------+
only showing top 10 rows



In [54]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+
|Name|City|State|Zip|Bank|BankState|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|Sector|
+----+----+-----+---+----+---------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+
+----+----+-----+---+----+---------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+----

In [55]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in ApprovalDate: 9835
Percentage of unique values in ApprovalDate: 1.09%


In [56]:

# the full date has too much detail, so we will extract the month only

loan_df = loan_df.withColumn("ApprovalMonth", split(col(col_name), "-")[1])
loan_df = loan_df.drop(col_name)
col_name = 'ApprovalMonth'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,"ApprovalMonth")


+-------------+----------+
|ApprovalMonth|Percentage|
+-------------+----------+
|          Mar|      9.28|
|          Sep|      9.22|
|          Apr|       8.9|
|          Aug|      8.75|
|          Jun|      8.69|
|          May|      8.57|
|          Jul|      8.48|
|          Dec|      7.76|
|          Oct|      7.74|
|          Nov|      7.59|
+-------------+----------+
only showing top 10 rows



 ### 10. ApprovalFY - Fiscal Year of commitment

 Drop the column as it is a date column and does not provide any information for the analysis.

In [57]:

loan_df = loan_df.drop('ApprovalFY')


 ### 11. Term - Loan term in months

In [58]:

col_name = 'Term'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+----+----------+
|Term|Percentage|
+----+----------+
|  84|     25.56|
|  60|      9.97|
| 240|      9.54|
| 120|      8.62|
| 300|      4.97|
| 180|      3.12|
|  36|       2.2|
|  12|      1.89|
|  48|      1.73|
|  72|      1.04|
+----+----------+
only showing top 10 rows



In [59]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|Sector|ApprovalMonth|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+

Null Count: 0


In [60]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in Term: 412
Percentage of unique values in Term: 0.05%


In [61]:

# loan_df = loan_df.withColumn("Term_category", 
#                              when((col(col_name) <=90),'Below 3 months')
#                              .when(((col(col_name)>90) & (col(col_name)<=180)), '3-6 months')
#                              .when(((col(col_name)>180) & (col(col_name)<=365)),  '6-12 months')
#                              .otherwise('More Than a Year'))
# loan_df = loan_df.drop(col_name)
# col_name = "Term_category"
# show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

 ### 12. NoEmp - Number of Business Employees

In [62]:

col_name = 'NoEmp'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+-----+----------+
|NoEmp|Percentage|
+-----+----------+
|    1|     17.13|
|    2|     15.36|
|    3|     10.06|
|    4|      8.17|
|    5|       6.7|
|    6|      5.08|
|   10|       3.5|
|    7|      3.49|
|    8|      3.48|
|   12|      2.31|
+-----+----------+
only showing top 10 rows



In [63]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|Sector|ApprovalMonth|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+

Null Count: 0


In [64]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in NoEmp: 599
Percentage of unique values in NoEmp: 0.07%


 ### 13. NewExist - 1 = Existing business, 2 = New business

In [65]:

col_name = 'NewExist'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       1|     71.56|
|       2|     28.09|
|       0|      0.11|
|    NULL|      0.02|
+--------+----------+



In [66]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+--------------------+---------------+-----+----+--------------------+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+-----------+------+-------------+
|                Name|           City|State| Zip|                Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|   SBA_Appv|Sector|ApprovalMonth|
+--------------------+---------------+-----+----+--------------------+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+-----------+------+-------------+
|     Eurostoves, Inc|        BEVERLY|   MA|1915|        BEVERLY BANK|       MA|  84|   25|    NULL|        0|        

 Drop rows with 0 or Null

In [67]:

col_name = 'NewExist'
loan_df = loan_df.filter(loan_df[col_name] != 0)
loan_df = loan_df.filter(loan_df[col_name].isNotNull())
loan_df_count = loan_df.count()

In [68]:

show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       1|     71.81|
|       2|     28.19|
+--------+----------+



Convert it to boolean, '2' is true, '1' is false.

In [69]:

loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "2", 1)
                   .otherwise(0)
                   .cast("int"))


In [70]:

show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       0|     71.81|
|       1|     28.19|
+--------+----------+



 ### 14. CreateJob - Number of jobs created

In [71]:

col_name='CreateJob'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+---------+----------+
|CreateJob|Percentage|
+---------+----------+
|        0|     69.92|
|        1|      7.05|
|        2|      6.45|
|        3|      3.21|
|        4|      2.29|
|        5|      2.08|
|       10|      1.29|
|        6|      1.23|
|        8|      0.82|
|        7|      0.71|
+---------+----------+
only showing top 10 rows



In [72]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|Sector|ApprovalMonth|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+

Null Count: 0


In [73]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in CreateJob: 245
Percentage of unique values in CreateJob: 0.03%


In [74]:

# loan_df = loan_df.drop(col_name)


 ### 15. RetainedJob - Number of jobs retained

In [75]:

col_name='RetainedJob'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+-----------+----------+
|RetainedJob|Percentage|
+-----------+----------+
|          0|     48.85|
|          1|       9.9|
|          2|      8.57|
|          3|      5.57|
|          4|      4.42|
|          5|      3.64|
|          6|      2.65|
|          7|      1.84|
|          8|      1.75|
|         10|      1.72|
+-----------+----------+
only showing top 10 rows



In [76]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|Sector|ApprovalMonth|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+

Null Count: 0


In [77]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in RetainedJob: 358
Percentage of unique values in RetainedJob: 0.04%


In [78]:

# loan_df = loan_df.drop(col_name)


 ### 16. FranchiseCode - Franchise code, (00000 or 00001) = No franchise

In [79]:

col_name='FranchiseCode'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+-------------+----------+
|FranchiseCode|Percentage|
+-------------+----------+
|            1|     71.01|
|            0|     23.23|
|        78760|      0.38|
|        68020|      0.21|
|        50564|      0.11|
|        21780|      0.11|
|        25650|      0.08|
|        79140|      0.07|
|        22470|      0.07|
|        17998|      0.07|
+-------------+----------+
only showing top 10 rows



In [80]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|Sector|ApprovalMonth|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+

Null Count: 0


In [81]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in FranchiseCode: 2763
Percentage of unique values in FranchiseCode: 0.31%


 We don't care about the franchise code, we only care if there is a franchise or not

In [82]:

# make 0 or 1 = 0, anything else = 1
loan_df = loan_df.withColumn("IsFranchise", when((col(col_name) == 0) | (col(col_name) == 1), 0).otherwise(1))


In [83]:

col_name = 'IsFranchise'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+-----------+----------+
|IsFranchise|Percentage|
+-----------+----------+
|          0|     94.24|
|          1|      5.76|
+-----------+----------+



In [84]:

loan_df = loan_df.drop('FranchiseCode')


 ### 17. UrbanRural - 1 = Urban, 2 = rural, 0 = undefined

In [85]:

col_name = 'UrbanRural'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+----------+----------+
|UrbanRural|Percentage|
+----------+----------+
|         1|     52.46|
|         0|     35.81|
|         2|     11.73|
+----------+----------+



In [86]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+-----------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|Sector|ApprovalMonth|IsFranchise|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+-----------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------+--------+------+-------------+-----------+

Null Count: 0


In [87]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in UrbanRural: 3
Percentage of unique values in UrbanRural: 0.00%


 ### 18. RevLineCr - Revolving line of credit: Y = Yes, N = No

In [88]:

col_name = 'RevLineCr'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+---------+----------+
|RevLineCr|Percentage|
+---------+----------+
|        N|      46.6|
|        0|     28.72|
|        Y|     22.46|
|        T|       1.7|
|     NULL|       0.5|
|        7|       0.0|
|        3|       0.0|
|        Q|       0.0|
|        5|       0.0|
|        .|       0.0|
+---------+----------+
only showing top 10 rows



In [89]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+--------------------+-------------+-----+----+--------------------+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+------+-------------+-----------+
|                Name|         City|State| Zip|                Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|Sector|ApprovalMonth|IsFranchise|
+--------------------+-------------+-----+----+--------------------+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+------+-------------+-----------+
|PASKO TRANSPORT C...|   CUMMINGTON|   MA|1026|TD BANK, NATIONAL...|       MA|  60|    1|       1|        0|          0|       

In [90]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in RevLineCr: 19
Percentage of unique values in RevLineCr: 0.00%


 Filter only N and Y

In [91]:

col_name = 'RevLineCr'
print(f"Number of rows before filtering: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isin('N', 'Y'))
loan_df_count = loan_df.count()
print(f"Number of rows after filtering: {loan_df_count}")
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


Number of rows before filtering: 896032
Number of rows after filtering: 618822
+---------+----------+
|RevLineCr|Percentage|
+---------+----------+
|        N|     67.48|
|        Y|     32.52|
+---------+----------+



 Transform N and Y to 0 and 1

In [92]:

loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "Y", 1)
                   .otherwise(0)
                   .cast("int"))


 ### 19. LowDoc - LowDoc Loan Program: Y = Yes, N = No

In [93]:

col_name = "LowDoc"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+------+----------+
|LowDoc|Percentage|
+------+----------+
|     N|     89.93|
|     Y|      9.24|
|  NULL|      0.41|
|     0|      0.19|
|     C|      0.11|
|     S|      0.09|
|     A|      0.02|
|     R|      0.01|
|     1|       0.0|
+------+----------+



In [94]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+--------------------+--------------+-----+----+--------------------+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+-----------+-----------+------+-------------+-----------+
|                Name|          City|State| Zip|                Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|     GrAppv|   SBA_Appv|Sector|ApprovalMonth|IsFranchise|
+--------------------+--------------+-----+----+--------------------+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+-----------+-----------+------+-------------+-----------+
| Meadow Brook Realty|       AMHERST|   MA|1002|FLORENCE SAVINGS ...|       MA|  84|    2|       0|        0|          2|         

In [95]:
print_unique_val_num_in_col(loan_df, col_name)

Number of unique values in LowDoc: 9
Percentage of unique values in LowDoc: 0.00%


 Filter only N and Y

In [96]:

col_name = 'LowDoc'
print(f"Number of rows before filtering: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isin('N', 'Y'))
loan_df_count = loan_df.count()
print(f"Number of rows after filtering: {loan_df_count}")
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


Number of rows before filtering: 618822
Number of rows after filtering: 613725
+------+----------+
|LowDoc|Percentage|
+------+----------+
|     N|     90.68|
|     Y|      9.32|
+------+----------+



 Transform N and Y to 0 and 1

In [97]:

loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "Y", 1)
                   .otherwise(0)
                   .cast("int"))


 ### 20. ChgOffDate - The date when a loan is declared to be in default

 Drop the column due to the high number of missing values.

In [98]:

loan_df = loan_df.drop('ChgOffDate')


 ### 21. DisbursementDate - Date when loan was disbursed

In [99]:

loan_df = loan_df.drop('DisbursementDate')


 ### 22. DisbursementGross - Amount disbursed

In [100]:
col_name = "DisbursementGross"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+-----------------+----------+
|DisbursementGross|Percentage|
+-----------------+----------+
|      $50,000.00 |      4.93|
|     $100,000.00 |       4.0|
|      $25,000.00 |      3.31|
|      $10,000.00 |       2.4|
|     $150,000.00 |      1.78|
|      $35,000.00 |      1.75|
|      $20,000.00 |      1.62|
|      $75,000.00 |      1.47|
|      $15,000.00 |      1.41|
|      $30,000.00 |      1.38|
+-----------------+----------+
only showing top 10 rows



In [101]:
loan_df = loan_df.withColumn("clean_DisbursementGross", regexp_replace("DisbursementGross", "\$", ""))  # Remove $
loan_df = loan_df.withColumn("clean_DisbursementGross", regexp_replace("clean_DisbursementGross", ",", ""))  # Remove comma
loan_df = loan_df.withColumn("clean_DisbursementGross", col("clean_DisbursementGross").cast("float"))
col_name = "clean_DisbursementGross"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----------------------+----------+
|clean_DisbursementGross|Percentage|
+-----------------------+----------+
|                50000.0|      4.93|
|               100000.0|       4.0|
|                25000.0|      3.31|
|                10000.0|       2.4|
|               150000.0|      1.78|
|                35000.0|      1.75|
|                20000.0|      1.62|
|                75000.0|      1.47|
|                15000.0|      1.41|
|                30000.0|      1.38|
+-----------------------+----------+
only showing top 10 rows



In [102]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+-----------------+------------+----------+------------+------+--------+------+-------------+-----------+-----------------------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|UrbanRural|RevLineCr|LowDoc|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|GrAppv|SBA_Appv|Sector|ApprovalMonth|IsFranchise|clean_DisbursementGross|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+-----------------+------------+----------+------------+------+--------+------+-------------+-----------+-----------------------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+-----------------+------------+----------+------------+------+--------+------+-------------+-----------+-----------------------+

Null Count: 0


In [103]:

loan_df = loan_df.drop('DisbursementGross')


 ### 23. BalanceGross - Gross amount outstanding

In [104]:

col_name = 'BalanceGross'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+------------+----------+
|BalanceGross|Percentage|
+------------+----------+
|      $0.00 |     100.0|
| $25,000.00 |       0.0|
|  $1,760.00 |       0.0|
| $84,617.00 |       0.0|
| $37,100.00 |       0.0|
|$827,875.00 |       0.0|
| $43,127.00 |       0.0|
|$996,262.00 |       0.0|
+------------+----------+



 Drop as most of the values are 0

In [105]:

loan_df = loan_df.drop('BalanceGross')


 ### 24. MIS_Status - Target variable

 Delete rows that have null target value (MIS_Status)

In [106]:

col_name ="MIS_Status"
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)


+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|     P I F|     81.68|
|    CHGOFF|     18.04|
|      NULL|      0.28|
+----------+----------+



In [107]:

# drop rows with null values in MIS_Status column
loan_df = loan_df.dropna(subset=[col_name])
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)



+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|     P I F|     81.68|
|    CHGOFF|     18.04|
+----------+----------+



 ### Replace target values with 0 and 1

 Target value column is: MIS_Status

 "P I F" = 1

 "CHGOFF" = 0

In [108]:

loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "P I F", 1)
                   .otherwise(0)
                   .cast("int"))


 Show the percentage of:

 - Paid in full loans (approved loans), MIS_Status = 1

 - Charged off loans (rejected loans), MIS_Status = 0

In [109]:

show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)


+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|         1|     81.68|
|         0|     18.04|
+----------+----------+



Place target column at the end

In [110]:
# Assuming df is your DataFrame and column_name is the name of the column you want to move to the end
column_name = "MIS_Status"

# Get the current column names
current_columns = loan_df.columns

# Select columns excluding the column to be moved to the end
new_columns = [col for col in current_columns if col != column_name]

# Add the column to be moved to the end
new_columns.append(column_name)

# Reorder the DataFrame with the new column order
loan_df = loan_df.select(*new_columns)


 ### 25. ChgOffPrinGr - Charged-off amount

In [111]:

col_name = 'ChgOffPrinGr'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+------------+----------+
|ChgOffPrinGr|Percentage|
+------------+----------+
|      $0.00 |     81.27|
| $50,000.00 |      0.29|
| $10,000.00 |      0.22|
| $25,000.00 |      0.21|
| $35,000.00 |      0.18|
|$100,000.00 |      0.14|
| $20,000.00 |      0.08|
| $30,000.00 |      0.07|
| $15,000.00 |      0.06|
|  $5,000.00 |      0.05|
+------------+----------+
only showing top 10 rows



In [112]:
loan_df = loan_df.withColumn("clean_ChgOffPrinGr", regexp_replace("ChgOffPrinGr", "\$", ""))  # Remove $
loan_df = loan_df.withColumn("clean_ChgOffPrinGr", regexp_replace("clean_ChgOffPrinGr", ",", ""))  # Remove comma
loan_df = loan_df.withColumn("clean_ChgOffPrinGr", col("clean_ChgOffPrinGr").cast("float"))
col_name = "clean_ChgOffPrinGr"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------------+----------+
|clean_ChgOffPrinGr|Percentage|
+------------------+----------+
|               0.0|     81.27|
|           50000.0|      0.29|
|           10000.0|      0.22|
|           25000.0|      0.21|
|           35000.0|      0.18|
|          100000.0|      0.14|
|           20000.0|      0.08|
|           30000.0|      0.07|
|           15000.0|      0.06|
|            5000.0|      0.05|
+------------------+----------+
only showing top 10 rows



 Drop this column as it will leak info to the column, because if the value is 0, this means that the loan is charged off

In [113]:

loan_df = loan_df.drop('ChgOffPrinGr')


 ### 26. GrAppv - Gross amount of loan approved by bank

In [114]:

col_name = "GrAppv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+------------+----------+
|      GrAppv|Percentage|
+------------+----------+
| $50,000.00 |      8.78|
| $25,000.00 |      6.95|
|$100,000.00 |      6.09|
| $10,000.00 |      4.94|
| $20,000.00 |      3.15|
| $35,000.00 |      3.01|
| $30,000.00 |      2.65|
| $15,000.00 |      2.37|
|$150,000.00 |      2.32|
| $75,000.00 |      2.16|
+------------+----------+
only showing top 10 rows



 #### Clean this column

 - Remove $

 - Remove ,

 - Convert to float

In [115]:

loan_df = loan_df.withColumn("clean_GrAppv", regexp_replace("GrAppv", "\$", ""))  # Remove $
loan_df = loan_df.withColumn("clean_GrAppv", regexp_replace("clean_GrAppv", ",", ""))  # Remove comma
loan_df = loan_df.withColumn("clean_GrAppv", col("clean_GrAppv").cast("float"))
col_name = "clean_GrAppv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+------------+----------+
|clean_GrAppv|Percentage|
+------------+----------+
|     50000.0|      8.78|
|     25000.0|      6.95|
|    100000.0|      6.09|
|     10000.0|      4.94|
|     20000.0|      3.15|
|     35000.0|      3.01|
|     30000.0|      2.65|
|     15000.0|      2.37|
|    150000.0|      2.32|
|     75000.0|      2.16|
+------------+----------+
only showing top 10 rows



In [116]:
show_df_where_col_isnull(loan_df, "clean_GrAppv", show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+------+--------+------+-------------+-----------+-----------------------+----------+------------------+------------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|UrbanRural|RevLineCr|LowDoc|GrAppv|SBA_Appv|Sector|ApprovalMonth|IsFranchise|clean_DisbursementGross|MIS_Status|clean_ChgOffPrinGr|clean_GrAppv|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+------+--------+------+-------------+-----------+-----------------------+----------+------------------+------------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+------+--------+------+-------------+-----------+-----------------------+----------+------------------+------------+

Null Count: 0


In [117]:

loan_df = loan_df.drop('GrAppv')


 ### 27. SBA_Appv - SBA's guaranteed amount of approved loan

 Drop as we don't know this amount in the future

In [118]:
col_name = 'SBA_Appv'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----------+----------+
|   SBA_Appv|Percentage|
+-----------+----------+
|$25,000.00 |      6.59|
|$12,500.00 |      5.59|
| $5,000.00 |      4.39|
|$50,000.00 |      3.35|
|$10,000.00 |      2.36|
|$17,500.00 |       2.1|
|$15,000.00 |      1.93|
| $7,500.00 |      1.77|
|$37,500.00 |      1.33|
| $2,500.00 |      1.29|
+-----------+----------+
only showing top 10 rows



In [119]:
loan_df = loan_df.withColumn("clean_SBA_Appv", regexp_replace("SBA_Appv", "\$", ""))  # Remove $
loan_df = loan_df.withColumn("clean_SBA_Appv", regexp_replace("clean_SBA_Appv", ",", ""))  # Remove comma
loan_df = loan_df.withColumn("clean_SBA_Appv", col("clean_SBA_Appv").cast("float"))
col_name = "clean_SBA_Appv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------------+----------+
|clean_SBA_Appv|Percentage|
+--------------+----------+
|       25000.0|      6.59|
|       12500.0|      5.59|
|        5000.0|      4.39|
|       50000.0|      3.35|
|       10000.0|      2.36|
|       17500.0|       2.1|
|       15000.0|      1.93|
|        7500.0|      1.77|
|       37500.0|      1.33|
|        2500.0|      1.29|
+--------------+----------+
only showing top 10 rows



In [120]:
show_df_where_col_isnull(loan_df, col_name, show_num=5)

+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+--------+------+-------------+-----------+-----------------------+----------+------------------+------------+--------------+
|Name|City|State|Zip|Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|UrbanRural|RevLineCr|LowDoc|SBA_Appv|Sector|ApprovalMonth|IsFranchise|clean_DisbursementGross|MIS_Status|clean_ChgOffPrinGr|clean_GrAppv|clean_SBA_Appv|
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+--------+------+-------------+-----------+-----------------------+----------+------------------+------------+--------------+
+----+----+-----+---+----+---------+----+-----+--------+---------+-----------+----------+---------+------+--------+------+-------------+-----------+-----------------------+----------+------------------+------------+--------------+

Null Count: 0


In [121]:
loan_df = loan_df.drop('SBA_Appv')


 ### Final schema

In [122]:

loan_df.printSchema()


root
 |-- Name: string (nullable = false)
 |-- City: string (nullable = false)
 |-- State: string (nullable = false)
 |-- Zip: string (nullable = true)
 |-- Bank: string (nullable = false)
 |-- BankState: string (nullable = true)
 |-- Term: integer (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = false)
 |-- CreateJob: integer (nullable = true)
 |-- RetainedJob: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: integer (nullable = false)
 |-- LowDoc: integer (nullable = false)
 |-- Sector: string (nullable = true)
 |-- ApprovalMonth: string (nullable = true)
 |-- IsFranchise: integer (nullable = false)
 |-- clean_DisbursementGross: float (nullable = true)
 |-- MIS_Status: integer (nullable = false)
 |-- clean_ChgOffPrinGr: float (nullable = true)
 |-- clean_GrAppv: float (nullable = true)
 |-- clean_SBA_Appv: float (nullable = true)



 ### Check duplicated rows based on all columns



In [123]:

print("Number of duplicate rows in the dataframe:")
loan_df_duplicates = loan_df_count - loan_df.dropDuplicates().count()
print(loan_df_duplicates)
loan_df = loan_df.dropDuplicates()

Number of duplicate rows in the dataframe:


1879


 ### Final DF Count

In [124]:

loan_df_count = loan_df.count()
print(f"Final DF count: {loan_df_count}")


Final DF count: 611846


In [125]:
if print_reports:
    report_res = report_df(loan_df, loan_df.columns)
    # Display the result
    column_names = ['Column', 'Type', 'Unique Sample', 'N Unique', '%None']
    print(tabulate(report_res, headers=column_names, tablefmt='grid'))

In [126]:
# output_path = "../data/preprocessed.csv"

# # Save the DataFrame to a CSV file
# loan_df.write.csv(output_path, header=True, mode="overwrite")

In [127]:
# Convert PySpark DataFrame to Pandas DataFrame
pandas_df = loan_df.toPandas()

# Specify the path where you want to save the CSV file
output_path = "../data/preprocessed.csv"

# Save the Pandas DataFrame to a CSV file
pandas_df.to_csv(output_path, index=False)

In [128]:
sample_size = 50000
# Save a sample
output_path = f"../sample_data/{sample_size}.csv"

# Save the first 50000 rows of the Pandas DataFrame to a CSV file
pandas_df.head(sample_size).to_csv(output_path, index=False)

In [129]:
sample_size = 1000
# Save a sample
output_path = f"../sample_data/{sample_size}.csv"

# Save the first 50000 rows of the Pandas DataFrame to a CSV file
pandas_df.head(sample_size).to_csv(output_path, index=False)