 ## Machine Learning

## Imports

In [1]:

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC,GBTClassifier
from tabulate import tabulate

## Functions

In [2]:
def evaluate_model(model, train_data, validation_data = None, test_data = None, model_name = "Model", label_column = "MIS_Status"):
    # Evaluate training data
    train_predictions = model.transform(train_data)
    train_metrics = calculate_metrics(train_predictions, label_column)

     # Prepare data for tabulate
    table = [
        ['Model', model_name],
        ['Metric', 'Training'],
        ['Accuracy', train_metrics['accuracy']],
        ['Weighted Precision', train_metrics['weighted_precision']],
        ['Weighted Recall', train_metrics['weighted_recall']],
        ['F1 Score', train_metrics['f1']]
    ]
    # Evaluate validation data
    if validation_data is not None:
        validation_predictions = model.transform(validation_data)
        validation_metrics = calculate_metrics(validation_predictions, label_column)
        table[0] += ['']
        table[1] += ['Validation']
        table[2] += [validation_metrics['accuracy']]
        table[3] += [validation_metrics['weighted_precision']]
        table[4] += [validation_metrics['weighted_recall']]
        table[5] += [validation_metrics['f1']]

    # Evaluate test data
    if test_data is not None:
        test_predictions = model.transform(test_data)
        test_metrics = calculate_metrics(test_predictions, label_column)
        table[0] += ['']
        table[1] += ['Test']
        table[2] += [test_metrics['accuracy']]
        table[3] += [test_metrics['weighted_precision']]
        table[4] += [test_metrics['weighted_recall']]
        table[5] += [test_metrics['f1']]


    # Display results using tabulate
    print(tabulate(table, headers="firstrow", tablefmt='grid'))

def calculate_metrics(predictions, label_column):
    evaluator_multi = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='accuracy')
    evaluator_weighted_precision = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='weightedPrecision')
    evaluator_weighted_recall = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='weightedRecall')
    evaluator_f1 = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='f1')

    accuracy = evaluator_multi.evaluate(predictions)
    weighted_precision = evaluator_weighted_precision.evaluate(predictions)
    weighted_recall = evaluator_weighted_recall.evaluate(predictions)
    f1 = evaluator_f1.evaluate(predictions)

    return {
        'accuracy': accuracy,
        'weighted_precision': weighted_precision,
        'weighted_recall': weighted_recall,
        'f1': f1
    }

In [3]:

# spark=SparkSession.builder\
#     .master("local[*]")\
#     .appName("LoanApproval")\
#     .getOrCreate()
spark=SparkSession.builder\
    .appName("LoanApproval")\
    .getOrCreate()

sc=spark.sparkContext

 ## Read Data

In [4]:

# data_path="/content/drive/MyDrive/Colab Notebooks/BD_project/50000.csv"
# data_path="../sample_data/50000_1.csv"
data_path="../data/preprocessed_2.csv"


In [5]:

loan_df =  spark.read.csv(data_path, header=True, inferSchema=True, multiLine=True, quote='"', escape='"')

In [6]:
loan_df.printSchema()
loan_df.show(5)

root
 |-- State: string (nullable = true)
 |-- Bank: string (nullable = true)
 |-- BankState: string (nullable = true)
 |-- Term: integer (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = true)
 |-- CreateJob: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: integer (nullable = true)
 |-- LowDoc: integer (nullable = true)
 |-- Sector: integer (nullable = true)
 |-- IsFranchise: integer (nullable = true)
 |-- clean_DisbursementGross: double (nullable = true)
 |-- MIS_Status: integer (nullable = true)
 |-- clean_GrAppv: double (nullable = true)



+-----+--------------------+---------+----+-----+--------+---------+----------+---------+------+------+-----------+-----------------------+----------+------------+
|State|                Bank|BankState|Term|NoEmp|NewExist|CreateJob|UrbanRural|RevLineCr|LowDoc|Sector|IsFranchise|clean_DisbursementGross|MIS_Status|clean_GrAppv|
+-----+--------------------+---------+----+-----+--------+---------+----------+---------+------+------+-----------+-----------------------+----------+------------+
|   MA|TD BANK, NATIONAL...|       DE|  84|    2|       0|        0|         1|        1|     0|    44|          0|                25959.0|         1|     10000.0|
|   MA|CITIZENS BANK NAT...|       RI|  84|    7|       0|        0|         1|        1|     0|    23|          0|                98479.0|         1|     50000.0|
|   MA|FLORENCE SAVINGS ...|       MA|  60|    2|       0|        0|         1|        1|     0|    23|          0|               135070.0|         1|     35000.0|
|   MA|CITIZENS 

In [7]:

print("Transforming categorial features...")
# List of categorical columns to be one-hot encoded
# categorical_columns = ["Name", "City", "State", "Zip", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]
categorical_columns = ["State", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector"]

# Define an empty list to store the pipeline stages
stages = []

# Iterate over each categorical column
for column in categorical_columns:
    # Define StringIndexer for the current column
    indexer = StringIndexer(inputCol=column, outputCol=column + "Index")

    # Define OneHotEncoder for the indexed column
    encoder = OneHotEncoder(inputCol=column + "Index", outputCol=column + "Vec")

    # Add StringIndexer and OneHotEncoder to the list of stages
    stages += [indexer, encoder]

label_column = "MIS_Status"

# Create VectorAssembler for combining all features
# List of input columns (excluding the label column and categorical columns)
input_columns = [col for col in loan_df.columns if col != label_column and col not in categorical_columns]
input_columns += [column + "Vec" for column in categorical_columns]
assembler = VectorAssembler(inputCols=input_columns , outputCol="features")

# Combine all stages into a Pipeline
pipeline = Pipeline(stages=stages + [assembler])

# Fit the pipeline to your data
pipeline_model = pipeline.fit(loan_df)

# Transform your data using the pipeline
transformed_data = pipeline_model.transform(loan_df)
transformed_data.show(5)
print("Splitting data into training, validation and test...")
# Split the transformed data into training and test sets (60% training, 20% validation, 20% test)
(trainingData, validationData, testData) = transformed_data.randomSplit([0.6, 0.2, 0.2], seed=123)

Transforming categorial features...
+-----+--------------------+---------+----+-----+--------+---------+----------+---------+------+------+-----------+-----------------------+----------+------------+----------+--------------+---------+------------------+--------------+---------------+---------------+-------------+--------------+-------------+-----------+-------------+-----------+--------------+--------------------+
|State|                Bank|BankState|Term|NoEmp|NewExist|CreateJob|UrbanRural|RevLineCr|LowDoc|Sector|IsFranchise|clean_DisbursementGross|MIS_Status|clean_GrAppv|StateIndex|      StateVec|BankIndex|           BankVec|BankStateIndex|   BankStateVec|UrbanRuralIndex|UrbanRuralVec|RevLineCrIndex| RevLineCrVec|LowDocIndex|    LowDocVec|SectorIndex|     SectorVec|            features|
+-----+--------------------+---------+----+-----+--------+---------+----------+---------+------+------+-----------+-----------------------+----------+------------+----------+--------------+---------

## Logistic Regression

In [8]:
# Create a Logistic Regression model
lr = LogisticRegression(maxIter=10, labelCol=label_column, featuresCol="features")
# Train the model
lrModel = lr.fit(trainingData)

In [9]:
evaluate_model(lrModel, trainingData, validation_data=validationData, model_name='Logistic Regression', label_column=label_column)

+--------------------+-----------------------+--------------------+
| Model              | Logistic Regression   |                    |
| Metric             | Training              | Validation         |
+--------------------+-----------------------+--------------------+
| Accuracy           | 0.8831928966699047    | 0.8795504738930949 |
+--------------------+-----------------------+--------------------+
| Weighted Precision | 0.8751508919841021    | 0.870687028633707  |
+--------------------+-----------------------+--------------------+
| Weighted Recall    | 0.8831928966699047    | 0.8795504738930949 |
+--------------------+-----------------------+--------------------+
| F1 Score           | 0.873417741176784     | 0.8693815107772412 |
+--------------------+-----------------------+--------------------+


## Random Forest

In [10]:

 # Create Random Forest model
rf = RandomForestClassifier(featuresCol='features', labelCol=label_column)

# Fit model to training data
rf_model = rf.fit(trainingData)

In [11]:
evaluate_model(rf_model, trainingData, validation_data=validationData, model_name='Random Forest', label_column=label_column)

+--------------------+--------------------+--------------------+
| Model              | Random Forest      |                    |
| Metric             | Training           | Validation         |
+--------------------+--------------------+--------------------+
| Accuracy           | 0.819188705399535  | 0.8193583088169379 |
+--------------------+--------------------+--------------------+
| Weighted Precision | 0.6710701350541662 | 0.6713480382273527 |
+--------------------+--------------------+--------------------+
| Weighted Recall    | 0.819188705399535  | 0.8193583088169379 |
+--------------------+--------------------+--------------------+
| F1 Score           | 0.7377685811948619 | 0.7380053010711294 |
+--------------------+--------------------+--------------------+


## GBTClassifier

In [12]:

# Train a GBT model.
gbt = GBTClassifier(featuresCol='features', labelCol=label_column, maxIter=100)
# Train model.  This also runs the indexers.
gbt_model = gbt.fit(trainingData)

In [13]:
evaluate_model(gbt_model, trainingData, validation_data=validationData, model_name='GBTClassifier', label_column=label_column)

+--------------------+--------------------+--------------------+
| Model              | GBTClassifier      |                    |
| Metric             | Training           | Validation         |
+--------------------+--------------------+--------------------+
| Accuracy           | 0.9336846069046814 | 0.9333615849136561 |
+--------------------+--------------------+--------------------+
| Weighted Precision | 0.9317428329645383 | 0.9313862085110264 |
+--------------------+--------------------+--------------------+
| Weighted Recall    | 0.9336846069046814 | 0.9333615849136562 |
+--------------------+--------------------+--------------------+
| F1 Score           | 0.931868760107535  | 0.931480025006692  |
+--------------------+--------------------+--------------------+


## SVM

In [14]:
lsvc = LinearSVC(featuresCol='features', labelCol=label_column,maxIter=100)
# Fit the model
lsvcModel = lsvc.fit(trainingData)

In [15]:
evaluate_model(lsvcModel, trainingData, validation_data=validationData, model_name='SVM', label_column=label_column)

+--------------------+--------------------+--------------------+
| Model              | SVM                |                    |
| Metric             | Training           | Validation         |
+--------------------+--------------------+--------------------+
| Accuracy           | 0.8885684192143551 | 0.8845135159363362 |
+--------------------+--------------------+--------------------+
| Weighted Precision | 0.881558972706747  | 0.8768871708897702 |
+--------------------+--------------------+--------------------+
| Weighted Recall    | 0.888568419214355  | 0.8845135159363362 |
+--------------------+--------------------+--------------------+
| F1 Score           | 0.8818351978586784 | 0.8775174803568296 |
+--------------------+--------------------+--------------------+


### Train the highest performing model on Train + Validation Data and test on Test Data

In [16]:
combined_data = trainingData.union(validationData)
# Train a GBT model.
gbt_model = gbt.fit(combined_data)

In [17]:
evaluate_model(gbt_model, combined_data, test_data=testData, model_name='GBTClassifier', label_column=label_column)

+--------------------+--------------------+--------------------+
| Model              | GBTClassifier      |                    |
| Metric             | Training           | Test               |
+--------------------+--------------------+--------------------+
| Accuracy           | 0.9357418389437082 | 0.9345360866765039 |
+--------------------+--------------------+--------------------+
| Weighted Precision | 0.9339385471483829 | 0.9326782945494004 |
+--------------------+--------------------+--------------------+
| Weighted Recall    | 0.9357418389437082 | 0.9345360866765039 |
+--------------------+--------------------+--------------------+
| F1 Score           | 0.9340707244461649 | 0.9328215721040373 |
+--------------------+--------------------+--------------------+


 ## Save to HDFS

In [18]:
# model_path = "./gbt_model"
# gbt_model.save(model_path)

