 ## Machine Learning

In [29]:
!pip install pyspark



## Imports

In [1]:

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, countDistinct, isnan, when, count, round, substring_index,substring, split, regexp_replace, udf
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator,ClusteringEvaluator
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, LinearSVC,GBTClassifier
from pyspark.ml.clustering import KMeans

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from tabulate import tabulate

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Functions

In [3]:
def evaluate_model(model, data, model_name , date_type):

    # prdict on data
    predictions = model.transform(data)

    # Create evaluators for different metrics
    evaluator_multi = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='accuracy')
    evaluator_weighted_precision = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='weightedPrecision')
    evaluator_weighted_recall = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='weightedRecall')
    evaluator_f1 = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=label_column, metricName='f1')

    # Calculate evaluation metrics
    accuracy = evaluator_multi.evaluate(predictions)
    weighted_precision = evaluator_weighted_precision.evaluate(predictions)
    weighted_recall = evaluator_weighted_recall.evaluate(predictions)
    f1 = evaluator_f1.evaluate(predictions)

    # Print results
    print('-------------------------------------------------------------------------------------------------------------------')
    print(f'---------------------------------------------- Model: {model_name} -----------------------------------------------')
    print('-------------------------------------------------------------------------------------------------------------------')
    print(f'Data Type: {date_type}')
    print(f'Accuracy: {accuracy}')
    print(f'Weighted Precision: {weighted_precision}')
    print(f'Weighted Recall: {weighted_recall}')
    print(f'F1 Score: {f1}')


In [4]:

spark=SparkSession.builder\
    .master("local[*]")\
    .appName("LoanApproval")\
    .getOrCreate()


In [5]:

sc=spark.sparkContext


 ## Read Data - SBAnational.csv

In [6]:

data_path="/content/drive/MyDrive/Colab Notebooks/BD_project/50000.csv"


In [7]:

loan_df =  spark.read.csv(data_path, header=True, inferSchema=True, multiLine=True, quote='"', escape='"')


In [8]:
loan_df.printSchema()
loan_df.show(5)

root
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Bank: string (nullable = true)
 |-- BankState: string (nullable = true)
 |-- Term: integer (nullable = true)
 |-- NoEmp: integer (nullable = true)
 |-- NewExist: integer (nullable = true)
 |-- CreateJob: integer (nullable = true)
 |-- RetainedJob: integer (nullable = true)
 |-- UrbanRural: integer (nullable = true)
 |-- RevLineCr: integer (nullable = true)
 |-- LowDoc: integer (nullable = true)
 |-- MIS_Status: integer (nullable = true)
 |-- Sector: integer (nullable = true)
 |-- ApprovalMonth: string (nullable = true)
 |-- IsFranchise: integer (nullable = true)
 |-- clean_GrAppv: double (nullable = true)

+--------------------+-----------+-----+----+--------------------+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+------+-------------+-----------+------------+
|                Name|     

In [9]:

print("Transforming categorial features...")
# List of categorical columns to be one-hot encoded
categorical_columns = ["Name", "City", "State", "Zip", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]
# ======================================================
# dropping these columns give better accuracy (by trial)
# ======================================================
loan_df = loan_df.drop('Name')
categorical_columns = ["City", "State", "Zip", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]

loan_df = loan_df.drop('Zip')
categorical_columns = ["City", "State", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]

loan_df = loan_df.drop('City')
categorical_columns = ["State", "Bank", "BankState", "UrbanRural", "RevLineCr", "LowDoc", "Sector", "ApprovalMonth"]
# ======================================================
# ======================================================
# ======================================================

# Define an empty list to store the pipeline stages
stages = []

# Iterate over each categorical column
for column in categorical_columns:
    # Define StringIndexer for the current column
    indexer = StringIndexer(inputCol=column, outputCol=column + "Index")

    # Define OneHotEncoder for the indexed column
    encoder = OneHotEncoder(inputCol=column + "Index", outputCol=column + "Vec")

    # Add StringIndexer and OneHotEncoder to the list of stages
    stages += [indexer, encoder]
label_column = "MIS_Status"



# Create VectorAssembler for combining all features
# List of input columns (excluding the label column and categorical columns)
input_columns = [col for col in loan_df.columns if col != label_column and col not in categorical_columns]
input_columns += [column + "Vec" for column in categorical_columns]
assembler = VectorAssembler(inputCols=input_columns , outputCol="features")

# Combine all stages into a Pipeline
pipeline = Pipeline(stages=stages + [assembler])

# Fit the pipeline to your data
pipeline_model = pipeline.fit(loan_df)

# Transform your data using the pipeline
transformed_data = pipeline_model.transform(loan_df)
transformed_data.show(5)
print("Splitting data into training, validation and test...")
# Split the transformed data into training and test sets (70% training, 30% test)
# (trainingData, testData) = transformed_data.randomSplit([0.7, 0.3])
(trainingData, validationData, testData) = transformed_data.randomSplit([0.6, 0.2, 0.2], seed=123)

Transforming categorial features...
+-----+--------------------+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+------+-------------+-----------+------------+----------+--------------+---------+------------------+--------------+---------------+---------------+-------------+--------------+-------------+-----------+-------------+-----------+--------------+------------------+----------------+--------------------+
|State|                Bank|BankState|Term|NoEmp|NewExist|CreateJob|RetainedJob|UrbanRural|RevLineCr|LowDoc|MIS_Status|Sector|ApprovalMonth|IsFranchise|clean_GrAppv|StateIndex|      StateVec|BankIndex|           BankVec|BankStateIndex|   BankStateVec|UrbanRuralIndex|UrbanRuralVec|RevLineCrIndex| RevLineCrVec|LowDocIndex|    LowDocVec|SectorIndex|     SectorVec|ApprovalMonthIndex|ApprovalMonthVec|            features|
+-----+--------------------+---------+----+-----+--------+---------+-----------+----------+---------+------+----------+---

## Logistic Regression

In [None]:
# Create a Logistic Regression model
lr = LogisticRegression(maxIter=10, elasticNetParam=0.8, labelCol=label_column, featuresCol="features")
print("Training logistic regression model...")
# Train the model
lrModel = lr.fit(trainingData)

In [14]:

# Make predictions on the test data
predictions = lrModel.transform(validationData)

# predictions.describe().show()
# Evaluate the model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol=label_column)
accuracy = evaluator.evaluate(predictions)

print("Accuracy:", accuracy)


Accuracy: 0.7396289805425709


In [19]:
evaluate_model(lrModel, trainingData, 'Logistic Regression', 'train')
evaluate_model(lrModel, validationData, 'Logistic Regression', 'validation')
evaluate_model(lrModel, testData, 'Logistic Regression', 'test')


-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: Logistic Regression -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: train
Accuracy: 0.8892502682403434
Weighted Precision: 0.8823189751461389
Weighted Recall: 0.8892502682403433
F1 Score: 0.8817209438871593
-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: Logistic Regression -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: validation
Accuracy: 0.8795811518324608
Weighted Precision: 0.8708062421696577
Weighted Recall: 0.8795811518324608
F1 Score: 0.8713

## Random Forest

In [16]:

 # Create Random Forest model
rf = RandomForestClassifier(featuresCol='features', labelCol=label_column)

# Fit model to training data
rf_model = rf.fit(trainingData)




In [17]:

evaluate_model(rf_model, trainingData, 'Random Forest', 'train')
evaluate_model(rf_model, validationData, 'Random Forest', 'validation')
evaluate_model(rf_model, testData, 'Random Forest', 'test')


-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: Random Forest -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: train
Accuracy: 0.818468347639485
Weighted Precision: 0.6698904360877088
Weighted Recall: 0.818468347639485
F1 Score: 0.7367633722712625
-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: Random Forest -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: validation
Accuracy: 0.8203101847278474
Weighted Precision: 0.6729087991682352
Weighted Recall: 0.8203101847278474
F1 Score: 0.7393342132718342
-

## GBTClassifier

In [10]:

# Split the data into training and test sets (30% held out for testing)
# (trainingData, testData) = transformed_data.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTClassifier(featuresCol='features', labelCol=label_column, maxIter=1000)

# Train model.  This also runs the indexers.
gbt_model = gbt.fit(trainingData)

evaluate_model(gbt_model, trainingData, 'GBTClassifier', 'train')
evaluate_model(gbt_model, validationData, 'GBTClassifier', 'validation')
evaluate_model(gbt_model, testData, 'GBTClassifier', 'test')


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pyspark/errors/exceptions/captured.py", line 179, in deco
    return f(*a, **kw)
  File "/usr/local/lib/python3.10/dist-packages/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaError object>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python

Py4JError: py4j does not exist in the JVM

## SVM

In [12]:
lsvc = LinearSVC(featuresCol='features', labelCol=label_column,maxIter=100, regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(trainingData)

evaluate_model(lsvcModel, trainingData, 'SVM', 'train')
evaluate_model(lsvcModel, validationData, 'SVM', 'validation')
evaluate_model(lsvcModel, testData, 'SVM', 'test')


Exception ignored in: <function JavaWrapper.__del__ at 0x7db5203020e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'KMeans' object has no attribute '_java_obj'


-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: SVM -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: train
Accuracy: 0.8309750536480687
Weighted Precision: 0.8150335588823321
Weighted Recall: 0.8309750536480687
F1 Score: 0.7757206985547561
-------------------------------------------------------------------------------------------------------------------
---------------------------------------------- Model: SVM -----------------------------------------------
-------------------------------------------------------------------------------------------------------------------
Data Type: validation
Accuracy: 0.8281141953966216
Weighted Precision: 0.8003421873974659
Weighted Recall: 0.8281141953966216
F1 Score: 0.7709744904088325
-------------------

## Kmeans

In [11]:
# Trains a k-means model.
kmeans = KMeans(featuresCol='features', predictionCol=label_column).setK(2).setSeed(1)
model = kmeans.fit(trainingData)

# Make predictions
predictions = model.transform(validationData)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("(validationData) Silhouette with squared euclidean distance = " + str(silhouette))
# Make predictions
predictions = model.transform(testData)

silhouette = evaluator.evaluate(predictions)
print("(testData) Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

IllegalArgumentException: requirement failed: Column MIS_Status already exists.

 ## Save

In [18]:
# model_path = "lrModel"
# lrModel.save(model_path)

