 ## Machine Learning

In [1]:
# !pip install pyspark

## Imports

In [2]:

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, countDistinct, isnan, when, count, round, substring_index,substring, split, regexp_replace, udf
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator,ClusteringEvaluator
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, LinearSVC,GBTClassifier
from pyspark.ml.clustering import KMeans

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from tabulate import tabulate

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

## Functions

In [4]:
def evaluate_model(model, train_data, validation_data, test_data, model_name, label_column):
    # Evaluate training data
    train_predictions = model.transform(train_data)
    train_metrics = calculate_metrics(train_predictions, label_column)

    # Evaluate validation data
    validation_predictions = model.transform(validation_data)
    validation_metrics = calculate_metrics(validation_predictions, label_column)

    # Evaluate test data
    test_predictions = model.transform(test_data)
    test_metrics = calculate_metrics(test_predictions, label_column)

     # Prepare data for tabulate
    table = [
        ['Model', model_name,'',''],
        ['Metric', 'Training', 'Validation', 'Test'],
        ['Accuracy', train_metrics['accuracy'], validation_metrics['accuracy'], test_metrics['accuracy']],
        ['Weighted Precision', train_metrics['weighted_precision'], validation_metrics['weighted_precision'], test_metrics['weighted_precision']],
        ['Weighted Recall', train_metrics['weighted_recall'], validation_metrics['weighted_recall'], test_metrics['weighted_recall']],
        ['F1 Score', train_metrics['f1'], validation_metrics['f1'], test_metrics['f1']]
    ]

    # Display results using tabulate
    print(tabulate(table, headers="firstrow", tablefmt='grid'))

def calculate_metrics(predictions, label_column):
    evaluator_multi = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='accuracy')
    evaluator_weighted_precision = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='weightedPrecision')
    evaluator_weighted_recall = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='weightedRecall')
    evaluator_f1 = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='f1')

    accuracy = evaluator_multi.evaluate(predictions)
    weighted_precision = evaluator_weighted_precision.evaluate(predictions)
    weighted_recall = evaluator_weighted_recall.evaluate(predictions)
    f1 = evaluator_f1.evaluate(predictions)

    return {
        'accuracy': accuracy,
        'weighted_precision': weighted_precision,
        'weighted_recall': weighted_recall,
        'f1': f1
    }

In [5]:

spark=SparkSession.builder\
    .master("local[*]")\
    .appName("LoanApproval")\
    .getOrCreate()


In [6]:

sc=spark.sparkContext


 ## Read Data - SBAnational.csv

In [7]:

# data_path="/content/drive/MyDrive/Colab Notebooks/BD_project/50000.csv"
data_path="../sample_data/50000.csv"


In [8]:

loan_df =  spark.read.csv(data_path, header=True, inferSchema=True, multiLine=True, quote='"', escape='"')


In [9]:
loan_df.printSchema()
loan_df.show(5)

root
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Bank: string (nullable = true)
 |-- BankState: string (nullable = true)
 |-- Term: integer (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = true)
 |-- CreateJob: integer (nullable = true)
 |-- RetainedJob: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: integer (nullable = true)
 |-- LowDoc: integer (nullable = true)
 |-- MIS_Status: integer (nullable = true)
 |-- Sector: integer (nullable = true)
 |-- ApprovalMonth: string (nullable = true)
 |-- IsFranchise: integer (nullable = true)
 |-- clean_DisbursementGross: double (nullable = true)
 |-- clean_ChgOffPrinGr: double (nullable = true)
 |-- clean_GrAppv: double (nullable = true)
 |-- clean_SBA_Appv: double (nullable = true)

+--------------------+--------------+-----+----+--------------------+---------+-

In [10]:

print("Transforming categorial features...")
# List of categorical columns to be one-hot encoded
categorical_columns = ["Name", "City", "State", "Zip", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]
# ======================================================
# dropping these columns give better accuracy (by trial)
# ======================================================
loan_df = loan_df.drop('Name')
categorical_columns = ["City", "State", "Zip", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]

loan_df = loan_df.drop('Zip')
categorical_columns = ["City", "State", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]

loan_df = loan_df.drop('City')
categorical_columns = ["State", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]
# ======================================================
# ======================================================
# ======================================================

# Define an empty list to store the pipeline stages
stages = []

# Iterate over each categorical column
for column in categorical_columns:
    # Define StringIndexer for the current column
    indexer = StringIndexer(inputCol=column, outputCol=column + "Index")

    # Define OneHotEncoder for the indexed column
    encoder = OneHotEncoder(inputCol=column + "Index", outputCol=column + "Vec")

    # Add StringIndexer and OneHotEncoder to the list of stages
    stages += [indexer, encoder]
label_column = "MIS_Status"



# Create VectorAssembler for combining all features
# List of input columns (excluding the label column and categorical columns)
input_columns = [col for col in loan_df.columns if col != label_column and col not in categorical_columns]
input_columns += [column + "Vec" for column in categorical_columns]
assembler = VectorAssembler(inputCols=input_columns , outputCol="features")

# Combine all stages into a Pipeline
pipeline = Pipeline(stages=stages + [assembler])

# Fit the pipeline to your data
pipeline_model = pipeline.fit(loan_df)

# Transform your data using the pipeline
transformed_data = pipeline_model.transform(loan_df)
transformed_data.show(5)
print("Splitting data into training, validation and test...")
# Split the transformed data into training and test sets (70% training, 30% test)
# (trainingData, testData) = transformed_data.randomSplit([0.7, 0.3])
(trainingData, validationData, testData) = transformed_data.randomSplit([0.6, 0.2, 0.2], seed=123)

Transforming categorial features...
+-----+--------------------+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+------+-------------+-----------+-----------------------+------------------+------------+--------------+----------+--------------+---------+------------------+--------------+---------------+---------------+-------------+--------------+-------------+-----------+-------------+-----------+--------------+------------------+----------------+--------------------+
|State|                Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|UrbanRural|RevLineCr|LowDoc|MIS_Status|Sector|ApprovalMonth|IsFranchise|clean_DisbursementGross|clean_ChgOffPrinGr|clean_GrAppv|clean_SBA_Appv|StateIndex|      StateVec|BankIndex|           BankVec|BankStateIndex|   BankStateVec|UrbanRuralIndex|UrbanRuralVec|RevLineCrIndex| RevLineCrVec|LowDocIndex|    LowDocVec|SectorIndex|     SectorVec|ApprovalMonthIndex|ApprovalMonthVec|            features|
+-----

## Logistic Regression

In [11]:
# Create a Logistic Regression model
lr = LogisticRegression(maxIter=10, elasticNetParam=0.8, labelCol=label_column, featuresCol="features")
print("Training logistic regression model...")
# Train the model
lrModel = lr.fit(trainingData)

Training logistic regression model...


In [12]:

# Make predictions on the test data
predictions = lrModel.transform(validationData)

# predictions.describe().show()
# Evaluate the model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol=label_column)
accuracy = evaluator.evaluate(predictions)

print("Accuracy:", accuracy)


Accuracy: 0.936868094509943


In [13]:
evaluate_model(lrModel, trainingData, validationData, testData, 'Logistic Regression', label_column)

+--------------------+-----------------------+--------------------+--------------------+
| Model              | Logistic Regression   |                    |                    |
| Metric             | Training              | Validation         | Test               |
+--------------------+-----------------------+--------------------+--------------------+
| Accuracy           | 0.9799825643776824    | 0.9694754519411242 | 0.9667760867402765 |
+--------------------+-----------------------+--------------------+--------------------+
| Weighted Precision | 0.9798398663270723    | 0.9691101660145799 | 0.9663388220602683 |
+--------------------+-----------------------+--------------------+--------------------+
| Weighted Recall    | 0.9799825643776824    | 0.9694754519411242 | 0.9667760867402766 |
+--------------------+-----------------------+--------------------+--------------------+
| F1 Score           | 0.9798340511621209    | 0.9691221798955781 | 0.9663307080808471 |
+--------------------

## Random Forest

In [14]:

 # Create Random Forest model
rf = RandomForestClassifier(featuresCol='features', labelCol=label_column)

# Fit model to training data
rf_model = rf.fit(trainingData)




In [15]:

evaluate_model(rf_model, trainingData, validationData, testData, 'Random Forest', label_column)


+--------------------+--------------------+--------------------+--------------------+
| Model              | Random Forest      |                    |                    |
| Metric             | Training           | Validation         | Test               |
+--------------------+--------------------+--------------------+--------------------+
| Accuracy           | 0.8159871244635193 | 0.821298034179591  | 0.8210484432507709 |
+--------------------+--------------------+--------------------+--------------------+
| Weighted Precision | 0.665834987290243  | 0.6745304609472607 | 0.6741205461645143 |
+--------------------+--------------------+--------------------+--------------------+
| Weighted Recall    | 0.8159871244635193 | 0.821298034179591  | 0.8210484432507709 |
+--------------------+--------------------+--------------------+--------------------+
| F1 Score           | 0.7333036433140401 | 0.7407139834212855 | 0.7403653084166562 |
+--------------------+--------------------+-----------

## GBTClassifier

In [16]:

# Split the data into training and test sets (30% held out for testing)
# (trainingData, testData) = transformed_data.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTClassifier(featuresCol='features', labelCol=label_column, maxIter=100)
print("Training...")
# Train model.  This also runs the indexers.
gbt_model = gbt.fit(trainingData)

print("Evaluating...")
evaluate_model(gbt_model, trainingData, validationData, testData, 'GBTClassifier', label_column)


Training...
Evaluating...
+--------------------+--------------------+--------------------+--------------------+
| Model              | GBTClassifier      |                    |                    |
| Metric             | Training           | Validation         | Test               |
+--------------------+--------------------+--------------------+--------------------+
| Accuracy           | 0.9954063841201717 | 0.9940729032895387 | 0.992937431612454  |
+--------------------+--------------------+--------------------+--------------------+
| Weighted Precision | 0.9954698078929716 | 0.9941113474118869 | 0.9929959235953381 |
+--------------------+--------------------+--------------------+--------------------+
| Weighted Recall    | 0.9954063841201717 | 0.9940729032895387 | 0.992937431612454  |
+--------------------+--------------------+--------------------+--------------------+
| F1 Score           | 0.9954216012061459 | 0.9940856266537357 | 0.9929564427916348 |
+--------------------+------

## SVM

In [17]:
lsvc = LinearSVC(featuresCol='features', labelCol=label_column,maxIter=100, regParam=0.1)
print("Training...")
# Fit the model
lsvcModel = lsvc.fit(trainingData)
print("Evaluating...")
evaluate_model(lsvcModel, trainingData, validationData, testData, 'SVM', label_column)



Training...
Evaluating...
+--------------------+--------------------+--------------------+--------------------+
| Model              | SVM                |                    |                    |
| Metric             | Training           | Validation         | Test               |
+--------------------+--------------------+--------------------+--------------------+
| Accuracy           | 0.8817730686695279 | 0.8781981626000197 | 0.8750621704963693 |
+--------------------+--------------------+--------------------+--------------------+
| Weighted Precision | 0.8853828220351001 | 0.8795728016375877 | 0.8762376485083178 |
+--------------------+--------------------+--------------------+--------------------+
| Weighted Recall    | 0.8817730686695279 | 0.8781981626000197 | 0.8750621704963693 |
+--------------------+--------------------+--------------------+--------------------+
| F1 Score           | 0.8618950261167185 | 0.8562748789789716 | 0.8516520115168842 |
+--------------------+------

## Kmeans

In [18]:
# Trains a k-means model.
kmeans = KMeans(featuresCol='features', predictionCol=label_column).setK(2).setSeed(1)
model = kmeans.fit(trainingData)

# Make predictions
predictions = model.transform(validationData)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("(validationData) Silhouette with squared euclidean distance = " + str(silhouette))
# Make predictions
predictions = model.transform(testData)

silhouette = evaluator.evaluate(predictions)
print("(testData) Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

IllegalArgumentException: requirement failed: Column MIS_Status already exists.

 ## Save

In [None]:
# model_path = "lrModel"
# lrModel.save(model_path)

