# **Mount Drive**

In [1]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')
# os.chdir('/content/drive/My Drive/Colab Notebooks/Big Data Project')

Mounted at /content/drive


In [None]:
# !pip install pyspark

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, countDistinct, isnan, when, count, round
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType


In [2]:
import pandas as pd
import datetime
import csv
from tabulate import tabulate



## Since we write local [*] in the master, it will use all cores in our machine. If we said local [4] it will work with 4 cores.

## getOrCreate is used to create a SparkSession if not present.

In [3]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("LoanApproval")\
    .getOrCreate()

In [4]:
sc=spark.sparkContext

## Read Data - SBAnational.csv

In [5]:
data_path="SBAnational.csv"

In [6]:
loan_df =  spark.read.csv(data_path, header=True, inferSchema=True, quote='"', escape='"', multiLine=True)

In [7]:
loan_df.show(5)
print('=====================')
print("Number of rows in the dataframe:")
loan_df_count = loan_df.count()
print(loan_df_count)
print('=====================')
print("Schema of the dataframe:")
loan_df.printSchema() #prints the dataframe schema
print('=====================')
print("Columns in the dataframe:")
print(loan_df.columns) 

+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|LoanNr_ChkDgt|                Name|        City|State|  Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|   1000014003|      ABC HOBBYCRAFT|  EVANSVILLE|

# Preprocessing

### Report

In [20]:
# =========================================================================
# =========================================================================
# ============================= DF REPORT =================================
# =========================================================================
# =========================================================================
def report_df(df, header):
    # Calculate the total number of rows
    rdd_count = df.count()

    # Initialize lists to store column statistics
    col_names = []
    data_types = []
    unique_samples = []
    num_uniques = []
    nan_percentages = []
    report_data = []

    # Iterate over each column
    for col_name in header:
        print(col_name)
        # Append column name
        col_names.append(col_name)
        selected_col = col(col_name)
        selected_col_df = df.select(selected_col)

        # Determine data type
        dtype = selected_col_df.dtypes[0][1]
        data_types.append(dtype)
        distinct_df = selected_col_df.distinct()
        # Collect unique values
        unique_sample = [row[col_name] for row in distinct_df.limit(2).collect()]
        unique_samples.append(unique_sample)

        # Count number of unique values
        n_unique = distinct_df.count()
        num_uniques.append(n_unique)

        # Calculate percentage of NaN values
        none_percentage_val = df.filter(selected_col.isNull()).count() / rdd_count * 100
        nan_percentages.append(none_percentage_val)
        report_data.append([col_name, dtype, unique_sample, n_unique, none_percentage_val])

    return report_data

In [21]:

%%time
report_res = report_df(loan_df, loan_df.columns)


LoanNr_ChkDgt
Name
City
State
Zip
Bank
BankState
NAICS
ApprovalDate
ApprovalFY
Term
NoEmp
NewExist
CreateJob
RetainedJob
FranchiseCode
UrbanRural
RevLineCr
LowDoc
ChgOffDate
DisbursementDate
DisbursementGross
BalanceGross
MIS_Status
ChgOffPrinGr
GrAppv
SBA_Appv
CPU times: total: 0 ns
Wall time: 2min 29s


In [22]:

# Display the result
column_names = ['Column', 'Type', 'Unique Sample', 'N Unique', '%None']
print(tabulate(report_res, headers=column_names, tablefmt='grid'))

+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| Column            | Type   | Unique Sample                                                       |   N Unique |        %None |
| LoanNr_ChkDgt     | bigint | [1000895005, 1001055002]                                            |     899164 |  0           |
+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| Name              | string | ['TURTLE BEACH INN', 'URBAN BEAST-SEATTLE LLC']                     |     779587 |  0.000333643 |
+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| City              | string | ['Worcester', 'West Sand Lake']                                     |      32582 |  0.00333643  |
+-------------------+--------+-------------------------------------------------------------------

### Drop columns that have many missing values

### ChgOffDate

In [15]:
loan_df = loan_df.drop('ChgOffDate')

In [16]:
loan_df.printSchema()

root
 |-- LoanNr_ChkDgt: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Bank: string (nullable = true)
 |-- BankState: string (nullable = true)
 |-- NAICS: string (nullable = true)
 |-- ApprovalDate: string (nullable = true)
 |-- ApprovalFY: string (nullable = true)
 |-- Term: integer (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = true)
 |-- CreateJob: integer (nullable = true)
 |-- RetainedJob: integer (nullable = true)
 |-- FranchiseCode: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: string (nullable = true)
 |-- LowDoc: string (nullable = true)
 |-- DisbursementDate: string (nullable = true)
 |-- DisbursementGross: string (nullable = true)
 |-- BalanceGross: string (nullable = true)
 |-- MIS_Status: string (nullable = true)
 |-- ChgOffPrinGr: string (nullable = true)
 |-- GrAppv

In [10]:
def show_percentage_of_each_value_in_column(df, df_count,col_name):
    # Calculate percentage of 0s and 1s
    percentage_df = df.groupBy(col_name).agg((count("*") / df_count).alias("percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("percentage", round(col("percentage") * 100, 2))

    # Show result
    percentage_df.show()

### MIS_Status

Delete rows that have null target value (MIS_Status)

In [11]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count, 'MIS_Status')

+----------+----------+
|MIS_Status|percentage|
+----------+----------+
|      NULL|      0.22|
|     P I F|     82.26|
|    CHGOFF|     17.52|
+----------+----------+



In [12]:
# drop rows with null values in MIS_Status column
loan_df = loan_df.dropna(subset=['MIS_Status'])
show_percentage_of_each_value_in_column(loan_df, loan_df_count, 'MIS_Status')


+----------+----------+
|MIS_Status|percentage|
+----------+----------+
|     P I F|     82.26|
|    CHGOFF|     17.52|
+----------+----------+



### Replace target values with 0 and 1
Target value column is: MIS_Status
"P I F" = 1
"CHGOFF" = 0

In [13]:
loan_df = loan_df.withColumn("MIS_Status", 
                   when(col("MIS_Status") == "P I F", 1)
                   .otherwise(0)
                   .cast("int"))

In [14]:
loan_df.printSchema()

root
 |-- LoanNr_ChkDgt: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Bank: string (nullable = true)
 |-- BankState: string (nullable = true)
 |-- NAICS: integer (nullable = true)
 |-- ApprovalDate: string (nullable = true)
 |-- ApprovalFY: string (nullable = true)
 |-- Term: integer (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = true)
 |-- CreateJob: integer (nullable = true)
 |-- RetainedJob: integer (nullable = true)
 |-- FranchiseCode: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: string (nullable = true)
 |-- LowDoc: string (nullable = true)
 |-- ChgOffDate: string (nullable = true)
 |-- DisbursementDate: string (nullable = true)
 |-- DisbursementGross: string (nullable = true)
 |-- BalanceGross: string (nullable = true)
 |-- MIS_Status: integer (nullable = false)
 |-- ChgO

Show the percentage of:
- Paid in full loans (approved loans), MIS_Status = 1
- Charged off loans (rejected loans), MIS_Status = 0

In [15]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count, 'MIS_Status')

+----------+----------+
|MIS_Status|percentage|
+----------+----------+
|         1|     82.26|
|         0|     17.52|
+----------+----------+



### Check duplicated rows based on all columns


In [16]:
# sum of duplicate rows
print("Number of duplicate rows in the dataframe:")
loan_df_duplicates = loan_df_count - loan_df.dropDuplicates().count()
print(loan_df_duplicates)

Number of duplicate rows in the dataframe:
0


# Exploratory Data Analysis (EDA)

In [23]:
def feature_vs_target(df, feature):
    target = "MIS_Status"
    # Calculate the percentage of points for each unique value of each feature compared to MIS_Status
    percentage_df = df.groupBy(feature, target).agg((count("*") / df.count()).alias("percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("percentage", round(col("percentage") * 100, 2))

    # Show result
    percentage_df.show()

In [24]:
feature_vs_target(loan_df, "NewExist")

+--------+----------+----------+
|NewExist|MIS_Status|percentage|
+--------+----------+----------+
|       1|     P I F|     59.31|
|       0|     P I F|      0.11|
|    NULL|      NULL|       0.0|
|       1|      NULL|      0.16|
|    NULL|     P I F|      0.01|
|       2|    CHGOFF|      5.27|
|    NULL|    CHGOFF|       0.0|
|       0|    CHGOFF|      0.01|
|       2|     P I F|     22.82|
|       2|      NULL|      0.06|
|       0|      NULL|       0.0|
|       1|    CHGOFF|     12.25|
+--------+----------+----------+



## Apply Map

## Apply ReduceByKey

## Save