# **Mount Drive**

In [1]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')
# os.chdir('/content/drive/My Drive/Colab Notebooks/Big Data Project')

In [2]:
# !pip install pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, countDistinct, isnan, when, count, round, substring_index,substring, split, regexp_replace, udf
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, LinearSVC

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from tabulate import tabulate




## Since we write local [*] in the master, it will use all cores in our machine. If we said local [4] it will work with 4 cores.

## getOrCreate is used to create a SparkSession if not present.

In [2]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("LoanApproval")\
    .getOrCreate()

In [3]:
sc=spark.sparkContext

## Read Data - SBAnational.csv

In [4]:
data_path="SBAnational.csv"

In [5]:
loan_df =  spark.read.csv(data_path, header=True, inferSchema=True, quote='"', escape='"', multiLine=True)

In [6]:
loan_df.show(5)
print('=====================')
print("Number of rows in the dataframe:")
loan_df_count = loan_df.count()
print(loan_df_count)
print('=====================')
print("Schema of the dataframe:")
loan_df.printSchema() #prints the dataframe schema
print('=====================')
print("Columns in the dataframe:")
print(loan_df.columns) 

+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|LoanNr_ChkDgt|                Name|        City|State|  Zip|                Bank|BankState| NAICS|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|
+-------------+--------------------+------------+-----+-----+--------------------+---------+------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+
|   1000014003|      ABC HOBBYCRAFT|  EVANSVILLE|

# Preprocessing and cleaning

### Report

In [7]:
# =========================================================================
# =========================================================================
# ============================= DF REPORT =================================
# =========================================================================
# =========================================================================
def report_df(df, header):
    # Calculate the total number of rows
    rdd_count = df.count()

    # Initialize lists to store column statistics
    col_names = []
    data_types = []
    unique_samples = []
    num_uniques = []
    nan_percentages = []
    report_data = []

    # Iterate over each column
    for col_name in header:
        print(col_name)
        # Append column name
        col_names.append(col_name)
        selected_col = col(col_name)
        selected_col_df = df.select(selected_col)

        # Determine data type
        dtype = selected_col_df.dtypes[0][1]
        data_types.append(dtype)
        distinct_df = selected_col_df.distinct()
        # Collect unique values
        unique_sample = [row[col_name] for row in distinct_df.limit(2).collect()]
        unique_samples.append(unique_sample)

        # Count number of unique values
        n_unique = distinct_df.count()
        num_uniques.append(n_unique)

        # Calculate percentage of NaN values
        none_percentage_val = df.filter(selected_col.isNull()).count() / rdd_count * 100
        nan_percentages.append(none_percentage_val)
        report_data.append([col_name, dtype, unique_sample, n_unique, none_percentage_val])

    return report_data

In [10]:
report_res = report_df(loan_df, loan_df.columns)


LoanNr_ChkDgt
Name
City
State
Zip
Bank
BankState
NAICS
ApprovalDate
ApprovalFY
Term
NoEmp
NewExist
CreateJob
RetainedJob
FranchiseCode
UrbanRural
RevLineCr
LowDoc
ChgOffDate
DisbursementDate
DisbursementGross
BalanceGross
MIS_Status
ChgOffPrinGr
GrAppv
SBA_Appv


In [11]:

# Display the result
column_names = ['Column', 'Type', 'Unique Sample', 'N Unique', '%None']
print(tabulate(report_res, headers=column_names, tablefmt='grid'))

+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| Column            | Type   | Unique Sample                                                       |   N Unique |        %None |
| LoanNr_ChkDgt     | bigint | [1000895005, 1001055002]                                            |     899164 |  0           |
+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| Name              | string | ['TURTLE BEACH INN', 'URBAN BEAST-SEATTLE LLC']                     |     779587 |  0.000333643 |
+-------------------+--------+---------------------------------------------------------------------+------------+--------------+
| City              | string | ['Worcester', 'West Sand Lake']                                     |      32582 |  0.00333643  |
+-------------------+--------+-------------------------------------------------------------------

In [16]:
def show_percentage_of_each_value_in_column(df, df_count,col_name):
    # Calculate percentage of 0s and 1s
    percentage_df = df.groupBy(col_name).agg((count("*") / df_count).alias("Percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("Percentage", round(col("Percentage") * 100, 2))

    # sort the dataframe by percentage descending
    percentage_df = percentage_df.sort(col("Percentage").desc())

    # Show result
    percentage_df.show()

### 1. LoanNr_ChkDgt - ID
Drop the column as it is an ID column and does not provide any information for the analysis.

In [7]:
loan_df = loan_df.drop('LoanNr_ChkDgt')

### 2. Name - Name of Borrower
Drop the column as it is a name column and does not provide any information for the analysis.

In [17]:
col_name = 'Name'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)


+--------------------+----------+
|                Name|Percentage|
+--------------------+----------+
|              SUBWAY|      0.14|
|       QUIZNO'S SUBS|      0.05|
|         DAIRY QUEEN|      0.04|
|      DOMINO'S PIZZA|      0.04|
| COLD STONE CREAMERY|      0.04|
|       THE UPS STORE|      0.04|
|            QUIZNO'S|      0.04|
|            DAYS INN|      0.03|
|       DUNKIN DONUTS|      0.03|
|         MATCO TOOLS|      0.03|
|PLAY IT AGAIN SPORTS|      0.03|
|       SUPER 8 MOTEL|      0.03|
|      MAIL BOXES ETC|      0.03|
|QUIZNO'S CLASSIC ...|      0.03|
|     MAIL BOXES ETC.|      0.02|
|     MINUTEMAN PRESS|      0.02|
|            WINGSTOP|      0.02|
|         COMFORT INN|      0.02|
|    CURVES FOR WOMEN|      0.02|
|         GREAT CLIPS|      0.02|
+--------------------+----------+
only showing top 20 rows



In [18]:
# percentage of unique values in the city
city_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in City: {city_unique_count}")
city_percentage = city_unique_count / loan_df_count * 100
print(f"Percentage of unique values in City: {city_percentage:.2f}%")

Number of unique values in City: 779558
Percentage of unique values in City: 86.70%


Drop as most of the names are unique

In [19]:
loan_df = loan_df.drop('Name')

### 3. City - City of Borrower


In [20]:
# Count the occurrences of each value in city column
col_name = 'City'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------------+----------+
|          City|Percentage|
+--------------+----------+
|   LOS ANGELES|      1.29|
|       HOUSTON|      1.14|
|      NEW YORK|      0.87|
|       CHICAGO|      0.67|
|         MIAMI|      0.62|
|     SAN DIEGO|       0.6|
|        DALLAS|      0.57|
|       PHOENIX|       0.5|
|     LAS VEGAS|      0.49|
|   SPRINGFIELD|      0.42|
|      BROOKLYN|      0.41|
|   SAN ANTONIO|      0.39|
|        AUSTIN|      0.39|
|        DENVER|      0.39|
|       SEATTLE|      0.39|
|SALT LAKE CITY|      0.39|
| SAN FRANCISCO|      0.37|
|      PORTLAND|      0.36|
|       ATLANTA|      0.35|
|      COLUMBUS|      0.35|
+--------------+----------+
only showing top 20 rows



In [21]:
# percentage of unique values in the city
city_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in City: {city_unique_count}")
city_percentage = city_unique_count / loan_df_count * 100
print(f"Percentage of unique values in City: {city_percentage:.2f}%")

Number of unique values in City: 32581
Percentage of unique values in City: 3.62%


In [22]:
unique_city_df = loan_df.select(col_name).groupBy(col_name).agg((count("*")).alias("Count")).sort(col("Count").desc())
unique_city_df.show()

+--------------+-----+
|          City|Count|
+--------------+-----+
|   LOS ANGELES|11558|
|       HOUSTON|10247|
|      NEW YORK| 7846|
|       CHICAGO| 6036|
|         MIAMI| 5594|
|     SAN DIEGO| 5363|
|        DALLAS| 5085|
|       PHOENIX| 4493|
|     LAS VEGAS| 4390|
|   SPRINGFIELD| 3738|
|      BROOKLYN| 3728|
|        DENVER| 3550|
|   SAN ANTONIO| 3515|
|SALT LAKE CITY| 3511|
|        AUSTIN| 3499|
|       SEATTLE| 3470|
| SAN FRANCISCO| 3365|
|      PORTLAND| 3193|
|      COLUMBUS| 3186|
|  PHILADELPHIA| 3178|
+--------------+-----+
only showing top 20 rows



In [23]:
# delete null values
loan_df = loan_df.dropna(subset=[col_name])


### 4. State - State of Borrower

In [24]:
col_name = 'State'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|State|Percentage|
+-----+----------+
|   CA|     14.53|
|   TX|      7.84|
|   NY|      6.42|
|   FL|      4.58|
|   PA|      3.91|
|   OH|      3.63|
|   IL|       3.3|
|   MA|      2.81|
|   MN|      2.71|
|   NJ|      2.67|
|   WA|      2.59|
|   GA|      2.48|
|   WI|      2.34|
|   MO|       2.3|
|   CO|      2.29|
|   MI|      2.28|
|   UT|      2.09|
|   AZ|      1.96|
|   NC|      1.59|
|   IN|      1.57|
+-----+----------+
only showing top 20 rows



In [25]:
# percentage of unique values in the city
state_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in State: {state_unique_count}")
state_percentage = state_unique_count / loan_df_count * 100
print(f"Percentage of unique values in State: {state_percentage:.2f}%")

Number of unique values in State: 52
Percentage of unique values in State: 0.01%


In [26]:
# delete nulls
loan_df = loan_df.dropna(subset=[col_name])


### 5. Zip - Zip code of Borrower

In [27]:
col_name = 'Zip'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|  Zip|Percentage|
+-----+----------+
|10001|       0.1|
|90015|       0.1|
|93401|      0.09|
|90010|      0.08|
|33166|      0.07|
|90021|      0.07|
|59601|      0.07|
|65804|      0.07|
|84107|      0.06|
|85260|      0.06|
|92069|      0.06|
|84115|      0.06|
|90670|      0.06|
|11354|      0.06|
|92121|      0.06|
|89102|      0.06|
|92101|      0.06|
| 3801|      0.06|
|59101|      0.06|
|59102|      0.06|
+-----+----------+
only showing top 20 rows



In [28]:
# percentage of unique values in the city
zip_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in Zip: {zip_unique_count}")
zip_percentage = zip_unique_count / loan_df_count * 100
print(f"Percentage of unique values in Zip: {zip_percentage:.2f}%")

Number of unique values in Zip: 33610
Percentage of unique values in Zip: 3.74%


In [29]:
unique_zip_df = loan_df.select(col_name).groupBy(col_name).agg((count("*")).alias("Count")).sort(col("Count").desc())
unique_zip_df.show()

+-----+-----+
|  Zip|Count|
+-----+-----+
|10001|  933|
|90015|  926|
|93401|  806|
|90010|  733|
|33166|  671|
|90021|  666|
|59601|  640|
|65804|  599|
| 3801|  581|
|59101|  578|
|84115|  577|
|92121|  567|
|92101|  565|
|90670|  531|
|11354|  530|
|59102|  519|
|85260|  518|
|84107|  513|
|89102|  500|
|92069|  499|
+-----+-----+
only showing top 20 rows



In [30]:
loan_df = loan_df.drop("col_name")

### 6. Bank - Name of the bank that gave the loan

In [31]:
col_name = 'Bank'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)
loan_df = loan_df.dropna(subset=[col_name])

# percentage of unique values in the city
bank_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in Zip: {bank_unique_count}")
bank_percentage = bank_unique_count / loan_df_count * 100
print(f"Percentage of unique values in Zip: {bank_percentage:.2f}%")

+--------------------+----------+
|                Bank|Percentage|
+--------------------+----------+
|BANK OF AMERICA N...|      9.66|
|WELLS FARGO BANK ...|      7.06|
|JPMORGAN CHASE BA...|      5.36|
|U.S. BANK NATIONA...|      3.91|
|CITIZENS BANK NAT...|       3.9|
|PNC BANK, NATIONA...|      3.04|
|           BBCN BANK|      2.56|
|CAPITAL ONE NATL ...|      2.47|
|MANUFACTURERS & T...|      1.25|
|READYCAP LENDING,...|      1.19|
|THE HUNTINGTON NA...|      1.07|
|KEYBANK NATIONAL ...|      1.03|
|TD BANK, NATIONAL...|       1.0|
|  BRANCH BK. & TR CO|      0.92|
|ZIONS FIRST NATIO...|      0.88|
|CALIFORNIA BANK &...|      0.84|
|      CITIBANK, N.A.|      0.83|
|        REGIONS BANK|      0.81|
|BANCO POPULAR NOR...|       0.8|
|       COMERICA BANK|      0.78|
+--------------------+----------+
only showing top 20 rows

Number of unique values in Zip: 5801
Percentage of unique values in Zip: 0.65%


### 7. BankState - State of Bank

In [32]:
col_name = 'BankState'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)
loan_df = loan_df.dropna(subset=[col_name])

# percentage of unique values in the city
bankState_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in Zip: {bankState_unique_count}")
bankState_percentage = bankState_unique_count / loan_df_count * 100
print(f"Percentage of unique values in Zip: {bankState_percentage:.2f}%")

+---------+----------+
|BankState|Percentage|
+---------+----------+
|       CA|     13.14|
|       NC|      8.84|
|       IL|      7.33|
|       OH|       6.5|
|       SD|      5.68|
|       TX|      5.31|
|       RI|      5.05|
|       NY|       4.4|
|       VA|      3.23|
|       DE|      2.73|
|       MN|      2.22|
|       UT|      2.11|
|       PA|       1.9|
|       WI|      1.71|
|       MO|      1.65|
|       MA|      1.55|
|       GA|      1.54|
|       FL|      1.54|
|       AL|      1.35|
|       OR|      1.26|
+---------+----------+
only showing top 20 rows

Number of unique values in Zip: 56
Percentage of unique values in Zip: 0.01%


### 8. NAICS - North American Industry Classification System code for the industry where the business is located

In [33]:
col_name='NAICS'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------+----------+
| NAICS|Percentage|
+------+----------+
|     0|     22.35|
|722110|      3.11|
|722211|      2.16|
|811111|      1.62|
|621210|      1.56|
|624410|      1.12|
|812112|      1.03|
|561730|      0.99|
|621310|      0.97|
|812320|      0.88|
|541110|      0.79|
|445310|      0.76|
|621111|      0.75|
|721110|      0.75|
|447110|      0.74|
|713940|      0.73|
|811121|       0.7|
|453220|      0.67|
|451110|      0.67|
|484110|      0.65|
+------+----------+
only showing top 20 rows



In [34]:
# percentage of unique values in the city
zip_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in Zip: {zip_unique_count}")
zip_percentage = zip_unique_count / loan_df_count * 100
print(f"Percentage of unique values in Zip: {zip_percentage:.2f}%")

Number of unique values in Zip: 1312
Percentage of unique values in Zip: 0.15%


In [35]:
# # Convert NAICS code into related sector

# Extract first two characters of NAICS code
first_two_chars = substring(loan_df["NAICS"], 1, 2)
# print(first_two_chars)[0]

# Apply mapping using when and otherwise
loan_df = loan_df.withColumn("Sector",
    first_two_chars
)
loan_df = loan_df.drop("NAICS")
col_name='Sector'
# # Function to get the sector for a given NAICS code
# def get_sector(naics_code):
#     return naics_to_sector[str(naics_code)]

# # Register UDF
# get_sector_udf = udf(get_sector, StringType())

# # Apply UDF to create a new column 'Sector'
# loan_df = loan_df.withColumn("Sector", get_sector_udf(loan_df["Sector"]))
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------+----------+
|Sector|Percentage|
+------+----------+
|     0|     22.35|
|    44|      9.41|
|    81|      8.07|
|    54|      7.58|
|    72|      7.52|
|    23|       7.4|
|    62|      6.15|
|    42|      5.41|
|    45|      4.72|
|    33|      4.25|
|    56|      3.63|
|    48|      2.26|
|    32|      1.99|
|    71|      1.63|
|    53|      1.51|
|    31|      1.31|
|    51|      1.26|
|    52|      1.06|
|    11|       1.0|
|    61|      0.71|
+------+----------+
only showing top 20 rows



In [36]:
naics_to_sector = {
    '11': 'Agriculture, Forestry, Fishing and Hunting',
    '21': 'Mining, Quarrying, and Oil and Gas Extraction',
    '22': 'Utilities',
    '23': 'Construction',
    '31-33': 'Manufacturing',
    '42': 'Wholesale Trade',
    '44-45': 'Retail Trade',
    '48-49': 'Transportation and Warehousing',
    '51': 'Information',
    '52': 'Finance and Insurance',
    '53': 'Real Estate and Rental and Leasing',
    '54': 'Professional, Scientific, and Technical Services',
    '55': 'Management of Companies and Enterprises',
    '56': 'Administrative and Support and Waste Management and Remediation Services',
    '61': 'Educational Services',
    '62': 'Health Care and Social Assistance',
    '71': 'Arts, Entertainment, and Recreation',
    '72': 'Accommodation and Food Services',
    '81': 'Other Services (except Public Administration)',
    '92': 'Public Administration'
}
col_name='Sector'

loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == 32, 31)
                   .when(col(col_name) == 33, 31)
                   .when(col(col_name) == 45, 44)
                   .when(col(col_name) == 49, 48))

# loan_df = loan_df.withColumn(col_name, 
#                    when(col(col_name) == "Y", 1)
#                    .otherwise(0)
#                    .cast("int"))

In [37]:
loan_df.show(5)

+------------+-----+-----+--------------------+---------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+------+
|        City|State|  Zip|                Bank|BankState|ApprovalDate|ApprovalFY|Term|NoEmp|NewExist|CreateJob|RetainedJob|FranchiseCode|UrbanRural|RevLineCr|LowDoc|ChgOffDate|DisbursementDate|DisbursementGross|BalanceGross|MIS_Status|ChgOffPrinGr|      GrAppv|    SBA_Appv|Sector|
+------------+-----+-----+--------------------+---------+------------+----------+----+-----+--------+---------+-----------+-------------+----------+---------+------+----------+----------------+-----------------+------------+----------+------------+------------+------------+------+
|  EVANSVILLE|   IN|47711|    FIFTH THIRD BANK|       OH|   28-Feb-97|      1997|  84|    4|       2|        0|          0|            1|         0|      

### 9. ApprovalDate - Date SBA commitment issued

In [38]:
col_name = 'ApprovalDate'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|ApprovalDate|Percentage|
+------------+----------+
|    7-Jul-93|      0.12|
|   30-Jan-04|      0.11|
|    8-Jul-93|      0.09|
|   30-Sep-03|      0.07|
|    4-Oct-04|      0.07|
|   21-Jan-05|      0.06|
|   30-Jun-05|      0.06|
|   18-Apr-05|      0.06|
|    6-Jul-93|      0.06|
|   27-Sep-02|      0.06|
|    7-Nov-03|      0.05|
|   24-Mar-06|      0.05|
|   22-Aug-03|      0.05|
|    6-Dec-04|      0.05|
|   21-Feb-03|      0.05|
|    9-Feb-05|      0.05|
|   14-Nov-03|      0.05|
|   18-Jun-04|      0.05|
|   30-Sep-02|      0.05|
|   23-Mar-95|      0.05|
+------------+----------+
only showing top 20 rows



In [39]:
# the full date has too much detail, so we will extract the month only
# first, remove the nulls
col_name = 'ApprovalDate'
print(f"Number of rows before removing nulls: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isNotNull())
loan_df_count = loan_df.count()
print(f"Number of rows after removing nulls: {loan_df_count}")
# split on '-', get the second element
loan_df = loan_df.withColumn("ApprovalMonth", split(col(col_name), "-")[1])
loan_df = loan_df.drop(col_name)
col_name = 'ApprovalMonth'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,"ApprovalMonth")

Number of rows before removing nulls: 899164
Number of rows after removing nulls: 897557
+-------------+----------+
|ApprovalMonth|Percentage|
+-------------+----------+
|          Mar|       9.3|
|          Sep|      9.24|
|          Apr|      8.92|
|          Aug|      8.76|
|          Jun|      8.71|
|          May|      8.59|
|          Jul|       8.5|
|          Dec|      7.78|
|          Oct|      7.76|
|          Nov|      7.61|
|          Jan|      7.46|
|          Feb|      7.38|
+-------------+----------+



### 10. ApprovalFY - Fiscal Year of commitment
Drop the column as it is a date column and does not provide any information for the analysis.

In [40]:
loan_df = loan_df.drop('ApprovalFY')

### 11. Term - Loan term in months

In [41]:
col_name = 'Term'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+----+----------+
|Term|Percentage|
+----+----------+
|  84|     25.61|
|  60|      9.99|
| 240|      9.56|
| 120|      8.63|
| 300|      4.98|
| 180|      3.12|
|  36|       2.2|
|  12|       1.9|
|  48|      1.74|
|  72|      1.05|
|  24|      0.83|
|  96|      0.79|
|  90|      0.76|
|  66|      0.58|
|  87|      0.55|
|  63|      0.51|
|  83|      0.46|
| 108|      0.44|
| 144|      0.36|
|   6|      0.34|
+----+----------+
only showing top 20 rows



In [42]:
# percentage of unique values in the city
term_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in Zip: {term_unique_count}")
term_percentage = term_unique_count / loan_df_count * 100
print(f"Percentage of unique values in Zip: {term_percentage:.2f}%")

Number of unique values in Zip: 412
Percentage of unique values in Zip: 0.05%


In [43]:
loan_df = loan_df.withColumn("Term_category", 
                             when((col(col_name) <=90),'Below 3 months')
                             .when(((col(col_name)>90) & (col(col_name)<=180)), '3-6 months')
                             .when(((col(col_name)>180) & (col(col_name)<=365)),  '6-12 months')
                             .otherwise('More Than a Year'))
loan_df = loan_df.drop(col_name)

In [44]:
col_name = "Term_category"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+----------------+----------+
|   Term_category|Percentage|
+----------------+----------+
|  Below 3 months|     64.01|
|     6-12 months|     18.48|
|      3-6 months|     17.51|
|More Than a Year|      0.01|
+----------------+----------+



### 12. NoEmp - Number of Business Employees

In [45]:
col_name = 'NoEmp'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----+----------+
|NoEmp|Percentage|
+-----+----------+
|    1|     17.17|
|    2|     15.39|
|    3|     10.09|
|    4|      8.19|
|    5|      6.71|
|    6|      5.09|
|   10|      3.51|
|    7|       3.5|
|    8|      3.49|
|   12|      2.31|
|   15|      2.04|
|    9|      2.02|
|   20|      1.59|
|   11|      1.31|
|   14|      1.19|
|   25|       1.1|
|   13|      1.04|
|   30|      0.96|
|   16|      0.87|
|   18|      0.87|
+-----+----------+
only showing top 20 rows



In [46]:
# percentage of unique values in the city
zip_unique_count = loan_df.select(col_name).distinct().count()
print(f"Number of unique values in Zip: {zip_unique_count}")
zip_percentage = zip_unique_count / loan_df_count * 100
print(f"Percentage of unique values in Zip: {zip_percentage:.2f}%")

Number of unique values in Zip: 599
Percentage of unique values in Zip: 0.07%


### 13. NewExist - 1 = Existing business, 2 = New business 

In [47]:
col_name = 'NewExist'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       1|     71.71|
|       2|     28.16|
|       0|      0.11|
|    NULL|      0.02|
+--------+----------+



Drop rows with 0 or Null

In [48]:
col_name = 'NewExist'
print(f"Number of rows before removing 0s and nulls: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name] != 0)
loan_df = loan_df.filter(loan_df[col_name].isNotNull())
loan_df_count = loan_df.count()
print(f"Number of rows after removing 0s and nulls: {loan_df_count}")

Number of rows before removing 0s and nulls: 897557
Number of rows after removing 0s and nulls: 896390


In [49]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       1|     71.81|
|       2|     28.19|
+--------+----------+



In [50]:
loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "2", 1)
                   .otherwise(0)
                   .cast("int"))

In [51]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+--------+----------+
|NewExist|Percentage|
+--------+----------+
|       0|     71.81|
|       1|     28.19|
+--------+----------+



### 14. CreateJob - Number of jobs created

In [52]:
col_name='CreateJob'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+---------+----------+
|CreateJob|Percentage|
+---------+----------+
|        0|     69.92|
|        1|      7.04|
|        2|      6.45|
|        3|      3.21|
|        4|      2.29|
|        5|      2.08|
|       10|      1.29|
|        6|      1.23|
|        8|      0.82|
|        7|      0.71|
|       15|       0.6|
|       20|       0.5|
|       12|      0.48|
|        9|      0.37|
|       25|      0.26|
|       11|      0.23|
|       30|      0.23|
|       14|      0.21|
|       13|       0.2|
|       16|      0.16|
+---------+----------+
only showing top 20 rows



In [53]:
# loan_df = loan_df.drop(col_name)

### 15. RetainedJob - Number of jobs retained

In [54]:
col_name='RetainedJob'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----------+----------+
|RetainedJob|Percentage|
+-----------+----------+
|          0|     48.85|
|          1|       9.9|
|          2|      8.57|
|          3|      5.57|
|          4|      4.42|
|          5|      3.64|
|          6|      2.65|
|          7|      1.84|
|          8|      1.75|
|         10|      1.72|
|         12|      1.07|
|          9|      0.97|
|         15|      0.94|
|         20|       0.7|
|         11|      0.63|
|         14|      0.53|
|         13|       0.5|
|         25|      0.46|
|         16|      0.38|
|         18|      0.38|
+-----------+----------+
only showing top 20 rows



In [55]:
loan_df = loan_df.drop(col_name)

### 16. FranchiseCode - Franchise code, (00000 or 00001) = No franchise

In [56]:
col_name='FranchiseCode'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-------------+----------+
|FranchiseCode|Percentage|
+-------------+----------+
|            1|     71.02|
|            0|     23.22|
|        78760|      0.37|
|        68020|      0.21|
|        50564|      0.11|
|        21780|      0.11|
|        25650|      0.08|
|        22470|      0.07|
|        79140|      0.07|
|        17998|      0.07|
|        52000|      0.06|
|        24850|      0.06|
|        10528|      0.05|
|        67750|      0.05|
|        21420|      0.04|
|        72590|      0.04|
|        10465|      0.04|
|        52875|      0.04|
|        38605|      0.04|
|         9050|      0.04|
+-------------+----------+
only showing top 20 rows



We don't care about the franchise code, we only care if there is a franchise or not

In [57]:
# make 0 or 1 = 0, anything else = 1
loan_df = loan_df.withColumn("IsFranchise", when((col(col_name) == 0) | (col(col_name) == 1), 0).otherwise(1))

In [58]:
col_name = 'IsFranchise'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+-----------+----------+
|IsFranchise|Percentage|
+-----------+----------+
|          0|     94.24|
|          1|      5.76|
+-----------+----------+



In [59]:
loan_df = loan_df.drop('FranchiseCode')

### 17. UrbanRural - 1 = Urban, 2 = rural, 0 = undefined

In [60]:
col_name = 'UrbanRural'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+----------+----------+
|UrbanRural|Percentage|
+----------+----------+
|         1|     52.45|
|         0|     35.82|
|         2|     11.73|
+----------+----------+



### 18. RevLineCr - Revolving line of credit: Y = Yes, N = No

In [61]:
col_name = 'RevLineCr'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+---------+----------+
|RevLineCr|Percentage|
+---------+----------+
|        N|      46.6|
|        0|     28.73|
|        Y|     22.46|
|        T|       1.7|
|     NULL|       0.5|
|        7|       0.0|
|        3|       0.0|
|        Q|       0.0|
|        5|       0.0|
|        .|       0.0|
|        C|       0.0|
|        -|       0.0|
|        A|       0.0|
|        R|       0.0|
|        1|       0.0|
|        `|       0.0|
|        ,|       0.0|
|        4|       0.0|
|        2|       0.0|
+---------+----------+



Filter only N and Y

In [62]:
col_name = 'RevLineCr'
print(f"Number of rows before filtering: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isin('N', 'Y'))
loan_df_count = loan_df.count()
print(f"Number of rows after filtering: {loan_df_count}")
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

Number of rows before filtering: 896390
Number of rows after filtering: 619006
+---------+----------+
|RevLineCr|Percentage|
+---------+----------+
|        N|     67.48|
|        Y|     32.52|
+---------+----------+



Transform N and Y to 0 and 1

In [63]:
loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "Y", 1)
                   .otherwise(0)
                   .cast("int"))

### 19. LowDoc - LowDoc Loan Program: Y = Yes, N = No

In [64]:
col_name = "LowDoc"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------+----------+
|LowDoc|Percentage|
+------+----------+
|     N|     89.93|
|     Y|      9.24|
|  NULL|      0.41|
|     0|      0.19|
|     C|      0.11|
|     S|      0.09|
|     A|      0.02|
|     R|      0.01|
|     1|       0.0|
+------+----------+



Filter only N and Y

In [65]:
col_name = 'LowDoc'
print(f"Number of rows before filtering: {loan_df_count}")
loan_df = loan_df.filter(loan_df[col_name].isin('N', 'Y'))
loan_df_count = loan_df.count()
print(f"Number of rows after filtering: {loan_df_count}")
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

Number of rows before filtering: 619006
Number of rows after filtering: 613908
+------+----------+
|LowDoc|Percentage|
+------+----------+
|     N|     90.68|
|     Y|      9.32|
+------+----------+



Transform N and Y to 0 and 1

In [66]:
loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "Y", 1)
                   .otherwise(0)
                   .cast("int"))

### 20. ChgOffDate - The date when a loan is declared to be in default
Drop the column due to the high number of missing values.

In [67]:
loan_df = loan_df.drop('ChgOffDate')

### 21. DisbursementDate - Date when loan was disbursed

In [68]:
loan_df = loan_df.drop('DisbursementDate')

### 22. DisbursementGross - Amount disbursed

In [69]:
loan_df = loan_df.drop('DisbursementGross')

### 23. BalanceGross - Gross amount outstanding

In [70]:
col_name = 'BalanceGross'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|BalanceGross|Percentage|
+------------+----------+
|      $0.00 |     100.0|
| $25,000.00 |       0.0|
|  $1,760.00 |       0.0|
| $84,617.00 |       0.0|
| $37,100.00 |       0.0|
|$827,875.00 |       0.0|
| $43,127.00 |       0.0|
|$996,262.00 |       0.0|
+------------+----------+



Drop as most of the values are 0

In [71]:
loan_df = loan_df.drop('BalanceGross')

### 24. MIS_Status - Target variable

Delete rows that have null target value (MIS_Status)

In [72]:
col_name ="MIS_Status"
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)

+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|     P I F|     81.66|
|    CHGOFF|     18.06|
|      NULL|      0.28|
+----------+----------+



In [73]:
# drop rows with null values in MIS_Status column
loan_df = loan_df.dropna(subset=[col_name])
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)


+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|     P I F|     81.66|
|    CHGOFF|     18.06|
+----------+----------+



### Replace target values with 0 and 1
Target value column is: MIS_Status
"P I F" = 1
"CHGOFF" = 0

In [74]:
loan_df = loan_df.withColumn(col_name, 
                   when(col(col_name) == "P I F", 1)
                   .otherwise(0)
                   .cast("int"))

Show the percentage of:
- Paid in full loans (approved loans), MIS_Status = 1
- Charged off loans (rejected loans), MIS_Status = 0

In [75]:
show_percentage_of_each_value_in_column(loan_df, loan_df_count, col_name)

+----------+----------+
|MIS_Status|Percentage|
+----------+----------+
|         1|     81.66|
|         0|     18.06|
+----------+----------+



### 25. ChgOffPrinGr - Charged-off amount

In [76]:
col_name = 'ChgOffPrinGr'
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|ChgOffPrinGr|Percentage|
+------------+----------+
|      $0.00 |     81.25|
| $50,000.00 |      0.29|
| $10,000.00 |      0.22|
| $25,000.00 |      0.21|
| $35,000.00 |      0.18|
|$100,000.00 |      0.14|
| $20,000.00 |      0.08|
| $30,000.00 |      0.07|
| $15,000.00 |      0.06|
|  $5,000.00 |      0.05|
| $75,000.00 |      0.04|
| $40,000.00 |      0.03|
| $45,000.00 |      0.02|
| $24,999.00 |      0.02|
| $28,000.00 |      0.01|
| $12,500.00 |      0.01|
| $95,000.00 |      0.01|
| $24,000.00 |      0.01|
| $49,999.00 |      0.01|
|$250,000.00 |      0.01|
+------------+----------+
only showing top 20 rows



Drop this column as it will leak info to the column, because if the value is 0, this means that the loan is charged off

In [77]:
loan_df = loan_df.drop('ChgOffPrinGr')

### 26. GrAppv - Gross amount of loan approved by bank

In [78]:
col_name = "GrAppv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|      GrAppv|Percentage|
+------------+----------+
| $50,000.00 |      8.79|
| $25,000.00 |      6.95|
|$100,000.00 |      6.09|
| $10,000.00 |      4.94|
| $20,000.00 |      3.15|
| $35,000.00 |      3.01|
| $30,000.00 |      2.65|
| $15,000.00 |      2.37|
|$150,000.00 |      2.32|
| $75,000.00 |      2.16|
| $40,000.00 |      1.75|
|  $5,000.00 |      1.61|
|$200,000.00 |      1.48|
|$250,000.00 |      1.33|
| $60,000.00 |      1.19|
|$300,000.00 |      0.91|
| $80,000.00 |      0.88|
|$500,000.00 |      0.86|
| $45,000.00 |      0.85|
| $70,000.00 |      0.78|
+------------+----------+
only showing top 20 rows



#### Clean this column
- Remove $
- Remove ,
- Convert to float

In [79]:
loan_df = loan_df.withColumn("clean_GrAppv", regexp_replace("GrAppv", "\$", ""))  # Remove $
loan_df = loan_df.withColumn("clean_GrAppv", regexp_replace("clean_GrAppv", ",", ""))  # Remove comma
loan_df = loan_df.withColumn("clean_GrAppv", col("clean_GrAppv").cast("float"))
col_name = "clean_GrAppv"
show_percentage_of_each_value_in_column(loan_df, loan_df_count,col_name)

+------------+----------+
|clean_GrAppv|Percentage|
+------------+----------+
|     50000.0|      8.79|
|     25000.0|      6.95|
|    100000.0|      6.09|
|     10000.0|      4.94|
|     20000.0|      3.15|
|     35000.0|      3.01|
|     30000.0|      2.65|
|     15000.0|      2.37|
|    150000.0|      2.32|
|     75000.0|      2.16|
|     40000.0|      1.75|
|      5000.0|      1.61|
|    200000.0|      1.48|
|    250000.0|      1.33|
|     60000.0|      1.19|
|    300000.0|      0.91|
|     80000.0|      0.88|
|    500000.0|      0.86|
|     45000.0|      0.85|
|     70000.0|      0.78|
+------------+----------+
only showing top 20 rows



In [80]:
loan_df = loan_df.drop('GrAppv')

### 27. SBA_Appv - SBA's guaranteed amount of approved loan
Drop as we don't know this amount in the future

In [81]:
loan_df = loan_df.drop('SBA_Appv')

### Final schema

In [82]:
loan_df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Bank: string (nullable = true)
 |-- BankState: string (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = false)
 |-- CreateJob: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: integer (nullable = false)
 |-- LowDoc: integer (nullable = false)
 |-- MIS_Status: integer (nullable = false)
 |-- Sector: integer (nullable = true)
 |-- ApprovalMonth: string (nullable = true)
 |-- Term_category: string (nullable = false)
 |-- IsFranchise: integer (nullable = false)
 |-- clean_GrAppv: float (nullable = true)



### Check duplicated rows based on all columns


In [83]:
print("Number of duplicate rows in the dataframe:")
loan_df_duplicates = loan_df_count - loan_df.dropDuplicates().count()
print(loan_df_duplicates)

Number of duplicate rows in the dataframe:


2729


### Final DF Count

In [84]:
loan_df_count = loan_df.count()
print(f"Final DF count: {loan_df_count}")

Final DF count: 612202


In [71]:
report_res = report_df(loan_df, loan_df.columns)
# Display the result
column_names = ['Column', 'Type', 'Unique Sample', 'N Unique', '%None']
print(tabulate(report_res, headers=column_names, tablefmt='grid'))

City
State
Zip
Bank
BankState
Term
NoEmp
NewExist
UrbanRural
RevLineCr
LowDoc
MIS_Status
Sector
ApprovalMonth
IsFranchise
clean_GrAppv
+---------------+--------+---------------------------------------------------------------------+------------+---------+
| Column        | Type   | Unique Sample                                                       |   N Unique |   %None |
| City          | string | ['Fairbanks', 'NESCONSET']                                          |      28428 |       0 |
+---------------+--------+---------------------------------------------------------------------+------------+---------+
| State         | string | ['SC', 'AZ']                                                        |         51 |       0 |
+---------------+--------+---------------------------------------------------------------------+------------+---------+
| Zip           | int    | [47711, 4101]                                                       |      30178 |       0 |
+---------------+--------

# Exploratory Data Analysis (EDA)

In [110]:
def feature_vs_target(df, feature):
    target = "MIS_Status"
    # Calculate the percentage of points for each unique value of each feature compared to MIS_Status
    percentage_df = df.groupBy(feature, target).agg((count("*") / df.count()).alias("Percentage"))

    # Round percentage values to two decimal places
    percentage_df = percentage_df.withColumn("Percentage", round(col("Percentage") * 100, 2))

    # Show result
    percentage_df.show()

    # Convert DataFrame to list
    data = percentage_df.collect()

    # Separate data by target
    data_0 = [row['Percentage'] for row in data if row[target] == 0]
    data_1 = [row['Percentage'] for row in data if row[target] == 1]

    # Create traces
    trace0 = go.Bar(x=[row[feature] for row in data if row[target] == 0], y=data_0, name='0', marker_color='red')
    trace1 = go.Bar(x=[row[feature] for row in data if row[target] == 1], y=data_1, name='1', marker_color='blue')

    # Create layout
    layout = go.Layout(barmode='stack', title='Percentage Distribution of ' + feature + ' vs ' + target)

    # Create figure and add traces
    fig = go.Figure(data=[trace0, trace1], layout=layout)

    # Plot
    fig.show()

def features_vs_target(df, features):
    target = "MIS_Status"
    num_cols = 4
    # Calculate the number of rows needed for the grid
    num_rows = (len(features) // num_cols) + 1  # Ceiling division to get the number of rows needed

    # Create a subplot grid with four columns
    fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=[f"{feat} vs {target}" for feat in features])

    # Initialize row and col counters
    row_idx = 1
    col_idx = 1

    for feature in features:
        # Calculate the percentage of points for each unique value of each feature compared to MIS_Status
        percentage_df = df.groupBy(feature, target).agg((count("*") / df.count()).alias("Percentage"))

        # Round percentage values to two decimal places
        percentage_df = percentage_df.withColumn("Percentage", round(col("Percentage") * 100, 2))

        # Convert DataFrame to list
        data = percentage_df.collect()

        # Separate data by target
        data_0 = [row['Percentage'] for row in data if row[target] == 0]
        data_1 = [row['Percentage'] for row in data if row[target] == 1]

        # Create traces for the current feature
        trace0 = go.Bar(x=[row[feature] for row in data if row[target] == 0], y=data_0, name='0', marker_color='red')
        trace1 = go.Bar(x=[row[feature] for row in data if row[target] == 1], y=data_1, name='1', marker_color='blue')

        # Add traces to the subplot
        fig.add_trace(trace0, row=row_idx, col=col_idx)
        fig.add_trace(trace1, row=row_idx, col=col_idx)

        # Move to the next cell
        col_idx += 1
        if col_idx > num_cols:
            col_idx = 1
            row_idx += 1

    # Update layout
    fig.update_layout(height=600*num_rows, title_text=f"Percentage Distribution of Features vs {target}", showlegend=False)

    # Plot
    fig.show()

In [None]:
features = loan_df.columns
features_vs_target(loan_df, features)

### 9. ApprovalMonth

Drop approval month as it is needed only in EDA not training

In [77]:
loan_df =loan_df.drop("ApprovalMonth")

## Machine Learning

In [83]:
print("Transforming categorial features...")
# List of categorical columns to be one-hot encoded
categorical_columns = ["LowDoc", "RevLineCr","BankState","Bank", "State", "City", "UrbanRural", "Sector", "Term_category"]

# Define an empty list to store the pipeline stages
stages = []

# Iterate over each categorical column
for column in categorical_columns:
    # Define StringIndexer for the current column
    indexer = StringIndexer(inputCol=column, outputCol=column + "Index")
    
    # Define OneHotEncoder for the indexed column
    encoder = OneHotEncoder(inputCol=column + "Index", outputCol=column + "Vec")
    
    # Add StringIndexer and OneHotEncoder to the list of stages
    stages += [indexer, encoder]
label_column = "MIS_Status"

# Create VectorAssembler for combining all features
# List of input columns (excluding the label column and categorical columns)
input_columns = [col for col in loan_df.columns if col != label_column and col not in categorical_columns]
input_columns += [column + "Vec" for column in categorical_columns]
assembler = VectorAssembler(inputCols=input_columns , outputCol="features")

# Combine all stages into a Pipeline
pipeline = Pipeline(stages=stages + [assembler])

# Fit the pipeline to your data
pipeline_model = pipeline.fit(loan_df)

# Transform your data using the pipeline
transformed_data = pipeline_model.transform(loan_df)
transformed_data.show(5)
print("Splitting data into training, validation and test...")
# Split the transformed data into training and test sets (70% training, 30% test)
# (trainingData, testData) = transformed_data.randomSplit([0.7, 0.3])
(trainingData, validationData, testData) = transformed_data.randomSplit([0.6, 0.2, 0.2], seed=123)


# Create a Logistic Regression model
lr = LogisticRegression(maxIter=10, elasticNetParam=0.8, labelCol=label_column, featuresCol="features")
print("Training logistic regression model...")
# Train the model
lrModel = lr.fit(trainingData)

Transforming categorial features...
+------------+-----+-----+--------------------+---------+-----+--------+---------+----------+---------+------+----------+------+--------------+-----------+------------+-----------+-------------+--------------+-------------+--------------+---------------+---------+-------------------+----------+---------------+---------+--------------------+---------------+-------------+-----------+--------------+------------------+----------------+--------------------+
|        City|State|  Zip|                Bank|BankState|NoEmp|NewExist|CreateJob|UrbanRural|RevLineCr|LowDoc|MIS_Status|Sector| Term_category|IsFranchise|clean_GrAppv|LowDocIndex|    LowDocVec|RevLineCrIndex| RevLineCrVec|BankStateIndex|   BankStateVec|BankIndex|            BankVec|StateIndex|       StateVec|CityIndex|             CityVec|UrbanRuralIndex|UrbanRuralVec|SectorIndex|     SectorVec|Term_categoryIndex|Term_categoryVec|            features|
+------------+-----+-----+--------------------+---

In [85]:
# Make predictions on the test data
predictions = lrModel.transform(validationData)

# predictions.describe().show()
# Evaluate the model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol=label_column)
accuracy = evaluator.evaluate(predictions)

print("Accuracy:", accuracy)

Accuracy: 0.6068017344509256


In [87]:
def evaluate_model(model, data, model_name , date_type):

    # prdict on data
    predictions = model.transform(data)

    # Create evaluators for different metrics
    evaluator_multi = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='accuracy')
    evaluator_weighted_precision = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='weightedPrecision')
    evaluator_weighted_recall = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='weightedRecall')
    evaluator_f1 = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='f1')

    # Calculate evaluation metrics
    accuracy = evaluator_multi.evaluate(predictions)
    weighted_precision = evaluator_weighted_precision.evaluate(predictions)
    weighted_recall = evaluator_weighted_recall.evaluate(predictions)
    f1 = evaluator_f1.evaluate(predictions)

    # Print results
    print('-------------------------------------------------------------------------------------------------------------------')
    print(f'---------------------------------------------- Model: {model_name} -----------------------------------------------')
    print('-------------------------------------------------------------------------------------------------------------------')
    print(f'Data Type: {date_type}')
    print(f'Accuracy: {accuracy}')
    print(f'Weighted Precision: {weighted_precision}')
    print(f'Weighted Recall: {weighted_recall}')
    print(f'F1 Score: {f1}')

In [89]:
 # Create Random Forest model
rf = RandomForestClassifier(featuresCol='features', labelCol=label_column)

# Fit model to training data
rf_model = rf.fit(trainingData)



-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: Random Forest -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: train
Accuracy: 0.8188997851057563
Weighted Precision: 0.6705968580462539
Weighted Recall: 0.8188997851057563
F1 Score: 0.7373653716796315
-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: Random Forest -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: test
Accuracy: 0.8198284778837461
Weighted Precision: 0.67211873314918
Weighted Recall: 0.8198284778837461
F1 Score: 0.7386616280791228


In [None]:
evaluate_model(rf_model, trainingData, 'Random Forest', 'train')
evaluate_model(rf_model, validationData, 'Random Forest', 'validation')
evaluate_model(rf_model, testData, 'Random Forest', 'test')

In [90]:
evaluate_model(rf_model, testData, 'Random Forest', 'test')

-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: Random Forest -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: test
Accuracy: 0.818084040186756
Weighted Precision: 0.6692614968082857
Weighted Recall: 0.818084040186756
F1 Score: 0.7362272392419641


## Save

In [None]:
model_path = "lrModel"
lrModel.save(model_path)