In [255]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession \
  .builder \
  .appName("Exercise for loan prediction classification") \
  .getOrCreate()

In [256]:
data = spark.read.csv("data/loan_prediction_problem_dataset.csv", header=True, inferSchema=True)

data.printSchema()

root
 |-- Loan_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- Loan_Amount_Term: integer (nullable = true)
 |-- Credit_History: integer (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Loan_Status: string (nullable = true)



In [257]:
data \
    .select([F.count(F.when(F.col(c).isNotNull(), 1)).alias(c) for c in data.columns]) \
    .show()

+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|    614|   601|    611|       599|      614|          582|            614|              614|       592|             600|           564|          614|        614|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+



In [258]:
clean_df = data.dropna()

In [259]:
clean_df \
    .select([F.count(F.when(F.col(c).isNotNull(), 1)).alias(c) for c in data.columns]) \
    .show()

+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|    480|   480|    480|       480|      480|          480|            480|              480|       480|             480|           480|          480|        480|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+



In [260]:
clean_df.select("Loan_Status").show()

+-----------+
|Loan_Status|
+-----------+
|          N|
|          Y|
|          Y|
|          Y|
|          Y|
|          Y|
|          N|
|          Y|
|          N|
|          Y|
|          Y|
|          N|
|          Y|
|          Y|
|          N|
|          N|
|          N|
|          Y|
|          N|
|          Y|
+-----------+
only showing top 20 rows


In [261]:
train_data, test_data = clean_df.randomSplit([0.8, 0.2], seed=42)
print("Train size: ", train_data.count())
print("Test size: ", test_data.count())                                          

Train size:  409
Test size:  71


In [262]:
# train_data.select("*").show()

In [263]:
from pyspark.sql.types import StringType

stringCols = []
for column in train_data.columns:
    if isinstance(train_data.schema[column].dataType, StringType):
        # print(f"Column '{column}' is of type string.")
        stringCols.append(column)
stringCols

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

In [264]:
from pyspark.ml.feature import StringIndexer

for column in stringCols:
    indexer = StringIndexer(inputCol=column, outputCol=f'{column}_index')
    indexer_model = indexer.fit(train_data)
    train_data = indexer_model.transform(train_data)
    # test_data = indexer_model.transform(test_data)

# indexer = StringIndexer(inputCol='ocean_proximity', outputCol='ocean_proximity_index')
# indexer_model = indexer.fit(train_data)
# train_data = indexer_model.transform(train_data)
# train_data.show()


In [265]:
# test_data.show(5)

In [266]:
train_data.select("Loan_Status", "Loan_Status_index").show()

+-----------+-----------------+
|Loan_Status|Loan_Status_index|
+-----------+-----------------+
|          N|              1.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          Y|              0.0|
|          N|              1.0|
|          Y|              0.0|
|          N|              1.0|
|          N|              1.0|
|          N|              1.0|
|          Y|              0.0|
|          N|              1.0|
|          Y|              0.0|
|          Y|              0.0|
|          N|              1.0|
|          N|              1.0|
|          Y|              0.0|
+-----------+-----------------+
only showing top 20 rows


In [267]:
# train_data.show(5)
stringCols.remove('Loan_Status')

In [268]:
from pyspark.ml.feature import OneHotEncoder

for column in stringCols:
    encoder = OneHotEncoder(inputCol=f'{column}_index', outputCol=f'{column}_vec', dropLast=False)
    encoder_model = encoder.fit(train_data)
    train_data = encoder_model.transform(train_data)
    # test_data = encoder_model.transform(test_data)

# encoder = OneHotEncoder(inputCol='ocean_proximity_index', outputCol='ocean_proximity_vec', dropLast=False)
# encoder_model = encoder.fit(train_data)
# train_data = encoder_model.transform(train_data)


In [269]:
# train_data.show()

In [270]:
columns_for_assembler = []
for column in train_data.columns:
    # print(f'{column} -> {(train_data.schema[column].dataType)}')
    if not isinstance(train_data.schema[column].dataType, StringType) and \
    "_index" not in column:
        columns_for_assembler.append(column)
columns_for_assembler

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Loan_ID_vec',
 'Gender_vec',
 'Married_vec',
 'Dependents_vec',
 'Education_vec',
 'Self_Employed_vec',
 'Property_Area_vec']

In [271]:
# for column in train_data.columns:
#     print(f'{column} -> {(train_data.schema[column].dataType)}')

In [272]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=columns_for_assembler, outputCol='unscaled_features')

train_data = assembler.transform(train_data)
# test_data = assembler.transform(test_data)
# train_data.show()

In [273]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='unscaled_features', outputCol='features', withMean=True, withStd=True)
scaler_model = scaler.fit(train_data)
transformed_train_data = scaler_model.transform(train_data)
# test_data = scaler_model.transform(test_data)
# transformed_train_data.show()

In [274]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features', labelCol='Loan_Status_index')

model = lr.fit(transformed_train_data)

25/09/02 18:40:53 WARN Instrumentation: [4ac74b0a] regParam is zero, which might cause numerical instability and overfitting.
25/09/02 18:40:53 WARN Instrumentation: [4ac74b0a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


In [275]:
test_data.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001006|  Male|    Yes|         0|Not Graduate|           No|           2583|           2358.0|       120|             360|             1|        Urban|          Y|
|LP001014|  Male|    Yes|        3+|    Graduate|           No|           3036|           2504.0|       158|             360|             0|    Semiurban|          N|
|LP001020|  Male|    Yes|         1|    Graduate|           No|          12841|          10968.0|       349|             360|             1|    Semiurban|          N

In [276]:
# test_data = indexer_model.transform(test_data)
# test_data = encoder_model.transform(test_data)
# test_data = assembler.transform(test_data)
# test_data = scaler_model.transform(test_data)
# test_data.show()

In [277]:
# test_predictions = model.transform(test_data)
# test_predictions.show()