In [169]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession \
  .builder \
  .appName("Exercise for loan prediction classification") \
  .getOrCreate()

In [170]:
data = spark.read.csv("data/loan_prediction_problem_dataset.csv", header=True, inferSchema=True)

data.printSchema()

root
 |-- Loan_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- Loan_Amount_Term: integer (nullable = true)
 |-- Credit_History: integer (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Loan_Status: string (nullable = true)



In [171]:
data \
    .select([F.count(F.when(F.col(c).isNotNull(), 1)).alias(c) for c in data.columns]) \
    .show()

+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|    614|   601|    611|       599|      614|          582|            614|              614|       592|             600|           564|          614|        614|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+



In [172]:
clean_df = data.dropna()

In [173]:
clean_df \
    .select([F.count(F.when(F.col(c).isNotNull(), 1)).alias(c) for c in data.columns]) \
    .show()

+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|    480|   480|    480|       480|      480|          480|            480|              480|       480|             480|           480|          480|        480|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+



In [174]:
clean_df.select("Loan_Status").show()

+-----------+
|Loan_Status|
+-----------+
|          N|
|          Y|
|          Y|
|          Y|
|          Y|
|          Y|
|          N|
|          Y|
|          N|
|          Y|
|          Y|
|          N|
|          Y|
|          Y|
|          N|
|          N|
|          N|
|          Y|
|          N|
|          Y|
+-----------+
only showing top 20 rows


In [175]:
train_data, test_data = clean_df.randomSplit([0.8, 0.2], seed=42)
ph_train_data = train_data
ph_test_data = test_data
print("Train size: ", train_data.count())
print("Test size: ", test_data.count())                                          

Train size:  409
Test size:  71


In [176]:
pipeline_stages = []

In [177]:
from pyspark.sql.types import StringType

stringCols = []
for column in train_data.columns:
    if isinstance(train_data.schema[column].dataType, StringType):
        # print(f"Column '{column}' is of type string.")
        stringCols.append(column)
stringCols

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

In [178]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

stages=[]
for column in stringCols:
    indexer = StringIndexer(inputCol=column, outputCol=f'{column}_index', handleInvalid="keep")
    # indexer_model = indexer.fit(train_data)
    stages.append(indexer)
    
indexer_pipeline = Pipeline(stages=stages)
#
pipeline_stages.append(indexer_pipeline)
#
indexer_pipeline_model = indexer_pipeline.fit(train_data)
    
train_data = indexer_pipeline_model.transform(train_data)
train_data.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+------------+-------------+----------------+---------------+-------------------+-------------------+-----------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|Loan_ID_index|Gender_index|Married_index|Dependents_index|Education_index|Self_Employed_index|Property_Area_index|Loan_Status_index|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+------------+-------------+----------------+---------------+-------------------+-------------------+-----------------+
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|      

In [179]:
train_data.select("Loan_Status", "Loan_Status_index").show()

+-----------+-----------------+
|Loan_Status|Loan_Status_index|
+-----------+-----------------+
|          N|              1.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          N|              1.0|
|          Y|              0.0|
|          N|              1.0|
|          N|              1.0|
|          N|              1.0|
|          Y|              0.0|
|          N|              1.0|
|          Y|              0.0|
|          Y|              0.0|
|          N|              1.0|
|          N|              1.0|
|          Y|              0.0|
+-----------+-----------------+
only showing top 20 rows


In [180]:
# train_data.show(5)
stringCols.remove('Loan_Status')

In [181]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline

e_stages=[]
for column in stringCols:
    encoder = OneHotEncoder(inputCol=f'{column}_index', outputCol=f'{column}_vec', dropLast=False)
    e_stages.append(encoder)

encoder_pipeline = Pipeline(stages=e_stages)
#
pipeline_stages.append(encoder_pipeline)
#
encoder_pipeline_model = encoder_pipeline.fit(train_data)
train_data = encoder_pipeline_model.transform(train_data)
train_data.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+------------+-------------+----------------+---------------+-------------------+-------------------+-----------------+---------------+-------------+-------------+--------------+-------------+-----------------+-----------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|Loan_ID_index|Gender_index|Married_index|Dependents_index|Education_index|Self_Employed_index|Property_Area_index|Loan_Status_index|    Loan_ID_vec|   Gender_vec|  Married_vec|Dependents_vec|Education_vec|Self_Employed_vec|Property_Area_vec|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+----

In [182]:
columns_for_assembler = []
for column in train_data.columns:
    # print(f'{column} -> {(train_data.schema[column].dataType)}')
    if not isinstance(train_data.schema[column].dataType, StringType) and \
    "_index" not in column:
        columns_for_assembler.append(column)
columns_for_assembler

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Loan_ID_vec',
 'Gender_vec',
 'Married_vec',
 'Dependents_vec',
 'Education_vec',
 'Self_Employed_vec',
 'Property_Area_vec']

In [183]:
# for column in train_data.columns:
#     print(f'{column} -> {(train_data.schema[column].dataType)}')

In [184]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=columns_for_assembler, outputCol='unscaled_features')
#
pipeline_stages.append(assembler)
#
train_data = assembler.transform(train_data)

In [185]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='unscaled_features', outputCol='features', withMean=True, withStd=True)
#
pipeline_stages.append(scaler)
#
scaler_model = scaler.fit(train_data)
transformed_train_data = scaler_model.transform(train_data)
# transformed_train_data.show()

In [186]:
transformed_train_data.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+------------+-------------+----------------+---------------+-------------------+-------------------+-----------------+---------------+-------------+-------------+--------------+-------------+-----------------+-----------------+--------------------+--------------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|Loan_ID_index|Gender_index|Married_index|Dependents_index|Education_index|Self_Employed_index|Property_Area_index|Loan_Status_index|    Loan_ID_vec|   Gender_vec|  Married_vec|Dependents_vec|Education_vec|Self_Employed_vec|Property_Area_vec|   unscaled_features|            features|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+---

In [187]:
from pyspark.ml.classification import DecisionTreeClassifier

dtc = DecisionTreeClassifier(featuresCol='features', labelCol='Loan_Status_index')
#
pipeline_stages.append(dtc)
#
model = dtc.fit(transformed_train_data)

In [188]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=pipeline_stages)

In [189]:
transformed_train_data.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+------------+-------------+----------------+---------------+-------------------+-------------------+-----------------+---------------+-------------+-------------+--------------+-------------+-----------------+-----------------+--------------------+--------------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|Loan_ID_index|Gender_index|Married_index|Dependents_index|Education_index|Self_Employed_index|Property_Area_index|Loan_Status_index|    Loan_ID_vec|   Gender_vec|  Married_vec|Dependents_vec|Education_vec|Self_Employed_vec|Property_Area_vec|   unscaled_features|            features|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+---

In [190]:
test_data = indexer_pipeline_model.transform(test_data)
test_data = encoder_pipeline_model.transform(test_data)
test_data = assembler.transform(test_data)
test_data = scaler_model.transform(test_data)


In [191]:
test_data.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+-------------+------------+-------------+----------------+---------------+-------------------+-------------------+-----------------+-----------------+-------------+-------------+--------------+-------------+-----------------+-----------------+--------------------+--------------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|Loan_ID_index|Gender_index|Married_index|Dependents_index|Education_index|Self_Employed_index|Property_Area_index|Loan_Status_index|      Loan_ID_vec|   Gender_vec|  Married_vec|Dependents_vec|Education_vec|Self_Employed_vec|Property_Area_vec|   unscaled_features|            features|
+--------+------+-------+----------+------------+-------------+---------------+-----------------

In [192]:
test_predictions = model.transform(test_data)

In [193]:
test_predictions.select("Loan_Status_index", "prediction").show(5)

+-----------------+----------+
|Loan_Status_index|prediction|
+-----------------+----------+
|              0.0|       0.0|
|              1.0|       1.0|
|              1.0|       0.0|
|              0.0|       0.0|
|              0.0|       0.0|
+-----------------+----------+
only showing top 5 rows


In [194]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

accuracy_evaluator = MulticlassClassificationEvaluator(labelCol='Loan_Status_index', predictionCol='prediction', metricName='accuracy')
accuracy = accuracy_evaluator.evaluate(test_predictions) * 100
print(f'Accuracy = {accuracy:.2f}%')

Accuracy = 78.87%


In [195]:
precision_evaluator = MulticlassClassificationEvaluator(labelCol='Loan_Status_index', predictionCol='prediction', metricName='precisionByLabel')
precision = precision_evaluator.evaluate(test_predictions) * 100
print(f'Precision = {precision:.2f}%')

Precision = 78.95%


In [196]:
ph_train_data.show(2)

+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001003|  Male|    Yes|         1| Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0| Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y|
+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
only showing top

In [197]:
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

paramGrid = ParamGridBuilder() \
  .addGrid(dtc.maxDepth, [2, 5, 10]) \
  .addGrid(dtc.maxBins, [10, 20, 40]) \
  .addGrid(dtc.minInstancesPerNode, [1, 2, 4]) \
  .build()


evaluator = MulticlassClassificationEvaluator(
    labelCol='Loan_Status_index',
    predictionCol='prediction',
    metricName='accuracy'
)

tvs_model = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    seed=42
)

best_model = tvs_model.fit(ph_train_data)

In [198]:
tvs_preds = best_model.transform(ph_test_data)

In [199]:
accuracy = evaluator.evaluate(tvs_preds) * 100
print(f'Accuracy = {accuracy:.2f}%')

Accuracy = 71.83%


In [200]:
for param_values, metric_value in zip(paramGrid, best_model.validationMetrics):
    for param, value in param_values.items():
        print(f'{param.name} = {value}')
    print(f'Accuracy -> {metric_value}')

maxDepth = 2
maxBins = 10
minInstancesPerNode = 1
Accuracy -> 0.7721518987341772
maxDepth = 2
maxBins = 10
minInstancesPerNode = 2
Accuracy -> 0.7721518987341772
maxDepth = 2
maxBins = 10
minInstancesPerNode = 4
Accuracy -> 0.7721518987341772
maxDepth = 2
maxBins = 20
minInstancesPerNode = 1
Accuracy -> 0.7721518987341772
maxDepth = 2
maxBins = 20
minInstancesPerNode = 2
Accuracy -> 0.7721518987341772
maxDepth = 2
maxBins = 20
minInstancesPerNode = 4
Accuracy -> 0.7721518987341772
maxDepth = 2
maxBins = 40
minInstancesPerNode = 1
Accuracy -> 0.7721518987341772
maxDepth = 2
maxBins = 40
minInstancesPerNode = 2
Accuracy -> 0.7721518987341772
maxDepth = 2
maxBins = 40
minInstancesPerNode = 4
Accuracy -> 0.7721518987341772
maxDepth = 5
maxBins = 10
minInstancesPerNode = 1
Accuracy -> 0.7721518987341772
maxDepth = 5
maxBins = 10
minInstancesPerNode = 2
Accuracy -> 0.7848101265822784
maxDepth = 5
maxBins = 10
minInstancesPerNode = 4
Accuracy -> 0.759493670886076
maxDepth = 5
maxBins = 20
min