# Raw data will go to Bronze layer

In [2]:
from minio import Minio
from minio.error import S3Error
import os

In [2]:
# Connect to minio cline
def get_minio_client():
    client = Minio(
        "localhost:9000",
        access_key = os.getenv("MINIO_USERNAME"),
        secret_key = os.getenv("MINIO_PASSWORD"),
        secure = False
    )
    return client

In [3]:
def upload_file(bucket_name, file_path):
    try:
        client = get_minio_client()

        # Check if the bucket exists, if not, create it
        if not client.bucket_exists(bucket_name):
            print(f"Bucket '{bucket_name}' does not exist. Creating it now.")
            client.make_bucket(bucket_name)

        # Ensure the file path is valid
        if not os.path.isfile(file_path):
            print(f"The file path '{file_path}' does not exist or is not a file.")
            return

        # Extract the file name from the file path
        file_name = os.path.basename(file_path)

        # Upload the file to the specified bucket
        client.fput_object(bucket_name, file_name, file_path)
        print(f"File '{file_name}' uploaded successfully to bucket '{bucket_name}'.")

    except S3Error as e:
        print(f"S3 Error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [10]:
# Upload raw data to bronze bucket

# upload_file("bronze", "/home/drissdo/Desktop/Scalable-Distributed-Systems/data/Loan_default.csv")

# Data Acquisition and Preprocessing

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, mean

# Run full cores
spark = SparkSession.builder \
    .appName("LoanDefaultPrediction") \
    .config("spark.hadoop.fs.s3a.access.key", "admin") \
    .config("spark.hadoop.fs.s3a.secret.key", "admin123") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")\
    .config("spark.jars", "/home/drissdo/Desktop/Scalable-Distributed-Systems/src/jars/aws-java-sdk-bundle-1.11.901.jar, /home/drissdo/Desktop/Scalable-Distributed-Systems/src/jars/hadoop-aws-3.3.1.jar")\
    .config("spark.hadoop.fs.s3a.path.style.access", "true")\
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\
    .getOrCreate()

24/12/27 00:00:30 WARN Utils: Your hostname, dtdat resolves to a loopback address: 127.0.1.1; using 192.168.2.12 instead (on interface wlp0s20f3)
24/12/27 00:00:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/12/27 00:00:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/27 00:00:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
data_path = "/home/drissdo/Desktop/Scalable-Distributed-Systems/data/Loan_default.csv"


loan_data = spark.read.csv(data_path, inferSchema=True, header=True)

# Display the first 5 rows
loan_data.show(5)

+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|  Education|EmploymentType|MaritalStatus|HasMortgage|HasDependents|LoanPurpose|HasCoSigner|Default|
+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+
|I38PQUQS96| 56| 85994|     50587|        520|            80|             4|       15.23|      36|    0.44| Bachelor's|     Full-time|     Divorced|        Yes|          Yes|      Other|        Yes|      0|
|HPSK72WA7R| 69| 50432|    124440|        458|            15|             1|        4.81|      60|    0.68|   Master's|     Full-time|      Married|         No|           N

In [6]:
loan_data.columns

['LoanID',
 'Age',
 'Income',
 'LoanAmount',
 'CreditScore',
 'MonthsEmployed',
 'NumCreditLines',
 'InterestRate',
 'LoanTerm',
 'DTIRatio',
 'Education',
 'EmploymentType',
 'MaritalStatus',
 'HasMortgage',
 'HasDependents',
 'LoanPurpose',
 'HasCoSigner',
 'Default']

In [7]:
loan_data.printSchema()

root
 |-- LoanID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- MonthsEmployed: integer (nullable = true)
 |-- NumCreditLines: integer (nullable = true)
 |-- InterestRate: double (nullable = true)
 |-- LoanTerm: integer (nullable = true)
 |-- DTIRatio: double (nullable = true)
 |-- Education: string (nullable = true)
 |-- EmploymentType: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- HasMortgage: string (nullable = true)
 |-- HasDependents: string (nullable = true)
 |-- LoanPurpose: string (nullable = true)
 |-- HasCoSigner: string (nullable = true)
 |-- Default: integer (nullable = true)



# Preprocess the Data

In [6]:
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql.functions import col
from pyspark.ml import Pipeline

# Data dont have missing value
# imputer = Imputer(
#     inputCols=["Income", "MonthsEmployed", "NumCreditLines", "InterestRate", "LoanTerm", "DTIRatio"],
#     outputCols=["Income_filled", "MonthsEmployed_filled", "NumCreditLines_filled", "InterestRate_filled", "LoanTerm_filled", "DTIRatio_filled"]
# )
# loan_data_imputed = imputer.fit(loan_data).transform(loan_data)

string_indexers = [
    StringIndexer(inputCol="Education", outputCol="Education_index"),
    StringIndexer(inputCol="EmploymentType", outputCol="EmploymentType_index"),
    StringIndexer(inputCol="MaritalStatus", outputCol="MaritalStatus_index"),
    StringIndexer(inputCol="HasMortgage", outputCol="HasMortgage_index"),
    StringIndexer(inputCol="HasDependents", outputCol="HasDependents_index"),
    StringIndexer(inputCol="LoanPurpose", outputCol="LoanPurpose_index"),
    StringIndexer(inputCol="HasCoSigner", outputCol="HasCoSigner_index")
]

pipeline_indexers = Pipeline(stages=string_indexers)
loan_data_indexed = pipeline_indexers.fit(loan_data).transform(loan_data)

loan_data_indexed = loan_data_indexed.select(['LoanID',
 'Age',
 'Income',
 'LoanAmount',
 'CreditScore',
 'MonthsEmployed',
 'NumCreditLines',
 'InterestRate',
 'LoanTerm',
 'DTIRatio',
 'LoanPurpose',
 'Default',
 'Education_index',
 'EmploymentType_index',
 'MaritalStatus_index',
 'HasMortgage_index',
 'HasDependents_index',
 'LoanPurpose_index',
 'HasCoSigner_index'])


In [7]:
loan_data.show()

+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|  Education|EmploymentType|MaritalStatus|HasMortgage|HasDependents|LoanPurpose|HasCoSigner|Default|
+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+
|I38PQUQS96| 56| 85994|     50587|        520|            80|             4|       15.23|      36|    0.44| Bachelor's|     Full-time|     Divorced|        Yes|          Yes|      Other|        Yes|      0|
|HPSK72WA7R| 69| 50432|    124440|        458|            15|             1|        4.81|      60|    0.68|   Master's|     Full-time|      Married|         No|           N

In [8]:
loan_data_indexed.show()

+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+-------+---------------+--------------------+-------------------+-----------------+-------------------+-----------------+-----------------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|LoanPurpose|Default|Education_index|EmploymentType_index|MaritalStatus_index|HasMortgage_index|HasDependents_index|LoanPurpose_index|HasCoSigner_index|
+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+-------+---------------+--------------------+-------------------+-----------------+-------------------+-----------------+-----------------+
|I38PQUQS96| 56| 85994|     50587|        520|            80|             4|       15.23|      36|    0.44|      Other|      0|            0.0|                 3.0|                1.0|              0.0|                0.0| 

In [9]:
# Normalize numerical features
numerical_cols = [
    "Age", "Income", "LoanAmount", "CreditScore", "MonthsEmployed",
    "NumCreditLines", "InterestRate", "LoanTerm", "DTIRatio"
]

assembler = VectorAssembler(inputCols=numerical_cols, outputCol="features")
loan_data_assembled = assembler.transform(loan_data_indexed)

scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
scaler_model = scaler.fit(loan_data_assembled)
loan_data_scaled = scaler_model.transform(loan_data_assembled)

                                                                                

In [15]:
loan_data_scaled.show(5)

+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+-------+---------------+--------------------+-------------------+-----------------+-------------------+-----------------+-----------------+--------------------+--------------------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|LoanPurpose|Default|Education_index|EmploymentType_index|MaritalStatus_index|HasMortgage_index|HasDependents_index|LoanPurpose_index|HasCoSigner_index|            features|     scaled_features|
+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+-------+---------------+--------------------+-------------------+-----------------+-------------------+-----------------+-----------------+--------------------+--------------------+
|I38PQUQS96| 56| 85994|     50587|        520|            80|             4|       15.23|      36

In [12]:
def preprocess_data(df):
    

    string_indexers = [
        StringIndexer(inputCol="Education", outputCol="Education_index"),
        StringIndexer(inputCol="EmploymentType", outputCol="EmploymentType_index"),
        StringIndexer(inputCol="MaritalStatus", outputCol="MaritalStatus_index"),
        StringIndexer(inputCol="HasMortgage", outputCol="HasMortgage_index"),
        StringIndexer(inputCol="HasDependents", outputCol="HasDependents_index"),
        StringIndexer(inputCol="LoanPurpose", outputCol="LoanPurpose_index"),
        StringIndexer(inputCol="HasCoSigner", outputCol="HasCoSigner_index")]
    
    pipeline_indexers = Pipeline(stages=string_indexers)
    df_indexed = pipeline_indexers.fit(df).transform(df)


    pipeline_encoders = Pipeline(stages=one_hot_encoders)
    df_encoded = pipeline_encoders.fit(df_indexed).transform(df_indexed)

    # Normalize numerical features
    numerical_cols = [
        "Age", "Income_filled", "LoanAmount", "CreditScore", "MonthsEmployed_filled",
        "NumCreditLines_filled", "InterestRate_filled", "LoanTerm_filled", "DTIRatio_filled"
    ]

    assembler = VectorAssembler(inputCols=numerical_cols, outputCol="features")
    df_assembled = assembler.transform(df_encoded)

    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)

    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
    scaler_model = scaler.fit(df_assembled)
    df_scaled = scaler_model.transform(df_assembled)

    

    return df_scaled


In [13]:
data = preprocess_data(loan_data).show(20)

NameError: name 'one_hot_encoders' is not defined

In [14]:
data = data.select(["features"])

NameError: name 'data' is not defined

In [23]:
# Save data to sliver bucket
silver_bucket_path = "s3a://sliver/preprocessed_loan_data"
loan_data_scaled.repartition(1).write.parquet(silver_bucket_path,  mode="overwrite")

                                                                                

In [16]:
from pyspark.sql.functions import col, rand


# loan_data_scaled = spark.read.parquet(silver_bucket_path)

default_data = loan_data_scaled.filter(col("Default") == 1)
non_default_data = loan_data_scaled.filter(col("Default") == 0)

train_default, test_default = default_data.randomSplit([0.8, 0.2], seed=42)
train_non_default, test_non_default = non_default_data.randomSplit([0.8, 0.2], seed=42)

train_data = train_default.union(train_non_default)
test_data = test_default.union(test_non_default)

# Shuffle the data
train_data = train_data.orderBy(rand(seed=42))
test_data = test_data.orderBy(rand(seed=42))

In [17]:
test_data.columns

['LoanID',
 'Age',
 'Income',
 'LoanAmount',
 'CreditScore',
 'MonthsEmployed',
 'NumCreditLines',
 'InterestRate',
 'LoanTerm',
 'DTIRatio',
 'LoanPurpose',
 'Default',
 'Education_index',
 'EmploymentType_index',
 'MaritalStatus_index',
 'HasMortgage_index',
 'HasDependents_index',
 'LoanPurpose_index',
 'HasCoSigner_index',
 'features',
 'scaled_features']

In [18]:
test_data.select(["features"]).show(1, truncate=True)

[Stage 52:>                                                       (0 + 12) / 12]

+--------------------+
|            features|
+--------------------+
|[27.0,17846.0,199...|
+--------------------+
only showing top 1 row



                                                                                

In [77]:
test_data.show(5, truncate=True)



+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+-------+---------------+--------------------+-------------------+-----------------+-------------------+-----------------+-----------------+--------------------+--------------------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|LoanPurpose|Default|Education_index|EmploymentType_index|MaritalStatus_index|HasMortgage_index|HasDependents_index|LoanPurpose_index|HasCoSigner_index|            features|     scaled_features|
+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+-------+---------------+--------------------+-------------------+-----------------+-------------------+-----------------+-----------------+--------------------+--------------------+
|34A3QK885V| 27| 17846|    199268|        530|            87|             4|        20.7|      48

                                                                                

In [19]:
train_data = train_data.select(["LoanID", "Default","features"])
test_data = test_data.select(["LoanID","Default", "features"])

In [29]:
test_data.show(5)



+----------+--------------------+
|    LoanID|     scaled_features|
+----------+--------------------+
|7YG865MCWP|[-0.2333719759002...|
|BEHFNNEUT1|[-1.1673118459778...|
|T6RFSXUUDJ|[0.63385790345760...|
|J2BKQP9IU7|[1.70121775497492...|
|XHZW2365P0|[-1.4341518088571...|
+----------+--------------------+
only showing top 5 rows



                                                                                

# Logistic Regression

In [21]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = LogisticRegression(featuresCol="features", labelCol="Default", maxIter=10)
lr_model = lr.fit(train_data)

# Make predictions
lr_predictions = lr_model.transform(test_data)

# # Evaluate the model
# evaluator = BinaryClassificationEvaluator(labelCol="Default", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
# roc_auc_lr = evaluator.evaluate(lr_predictions)
# print(f"Logistic Regression - ROC AUC: {roc_auc_lr}")

24/12/27 00:01:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS

In [22]:
lr_predictions_filtered = lr_predictions.filter(lr_predictions["prediction"] == "1")
print(lr_predictions_filtered.show())



+----------+-------+--------------------+--------------------+--------------------+----------+
|    LoanID|Default|            features|       rawPrediction|         probability|prediction|
+----------+-------+--------------------+--------------------+--------------------+----------+
|49JA5VPNKF|      0|[19.0,39227.0,137...|[-0.2742316600795...|[0.43186852506906...|       1.0|
|L0W7T47R9F|      0|[25.0,34235.0,220...|[-0.1507953048443...|[0.46237244855771...|       1.0|
|8O4MIYPA7X|      1|[18.0,27322.0,207...|[-0.0410463675929...|[0.48973984859024...|       1.0|
|MLE308KIZ4|      0|[28.0,43782.0,245...|[-0.0131252881382...|[0.49671872507160...|       1.0|
|IU3NEZMQ9V|      0|[18.0,15602.0,164...|[-0.1193613301508...|[0.47019504531081...|       1.0|
|TGO4OJKEJ1|      1|[26.0,41254.0,199...|[-0.0219395143697...|[0.49451534140563...|       1.0|
|5Y6PHO1KWV|      0|[20.0,40670.0,143...|[-0.0653847611123...|[0.48365963079106...|       1.0|
|A1OOA2LRMM|      1|[20.0,37370.0,208...|[-0.22631

                                                                                

In [23]:
print("Result:", lr_predictions.select("LoanID", "prediction").show())

+----------+----------+
|    LoanID|prediction|
+----------+----------+
|34A3QK885V|       0.0|
|CHA1MSRIJ6|       0.0|
|44NZIPZAOJ|       0.0|
|DPE68UOI5R|       0.0|
|BE04G5KYN3|       0.0|
|D3NB8FM4O6|       0.0|
|KJE40OMKWC|       0.0|
|YPTUERR6Q6|       0.0|
|1JDM6KSG9C|       0.0|
|O4Z9BAWKIP|       0.0|
|JWXJ2NP61G|       0.0|
|JEFJN3BIUK|       0.0|
|SR6GZVK6ZY|       0.0|
|C383BGP0P3|       0.0|
|05GK5TFA8V|       0.0|
|A8AGYE8IN5|       0.0|
|90ONLFQD7S|       0.0|
|XW7FE0HOUO|       0.0|
|TQ7YKS3BQH|       0.0|
|NVUTOK3O3X|       0.0|
+----------+----------+
only showing top 20 rows

Result: None


In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="accuracy")
accuracy_lr = accuracy_evaluator.evaluate(lr_predictions)

                                                                                

In [25]:
print(accuracy_lr)

0.8869761951603384


In [26]:
# save model
model_path = "/home/drissdo/Desktop/Scalable-Distributed-Systems/ML/model"


import os
if os.path.exists(model_path):
    print(f"Path {model_path} already exists. Consider removing it or choosing a new path.")

# Save the model
lr_model.write().overwrite().save(model_path)


Path /home/drissdo/Desktop/Scalable-Distributed-Systems/ML/model already exists. Consider removing it or choosing a new path.


In [None]:
from pyspark.ml.classification import LogisticRegressionModel

# Load the saved model
loadedModel = LogisticRegressionModel.load(model_path)

# Verify by making predictions using the loaded model
lr_predictions = loadedModel.transform(test_data)

evaluator = BinaryClassificationEvaluator(labelCol="Default", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_auc_lr = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression - ROC AUC: {roc_auc_lr}")

                                                                                

Logistic Regression - ROC AUC: 0.7336530388666921


# Decision Tree

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol="scaled_features", labelCol="Default", maxDepth=10)
dt_model = dt.fit(train_data)

# Make predictions
dt_predictions = dt_model.transform(test_data)

# Evaluate the model
roc_auc_dt = evaluator.evaluate(dt_predictions)
print(f"Decision Tree - ROC AUC: {roc_auc_dt}")



Decision Tree - ROC AUC: 0.4210816058940954


                                                                                

# Random Forest

In [None]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol="scaled_features", labelCol="Default", numTrees=100, maxDepth=5)
rf_model = rf.fit(train_data)

# Make predictions
rf_predictions = rf_model.transform(test_data)

# Evaluate the model
roc_auc_rf = evaluator.evaluate(rf_predictions)
print(f"Random Forest - ROC AUC: {roc_auc_rf}")



Random Forest - ROC AUC: 0.7071944814368916


                                                                                

# Gradient-Boosted Trees

In [None]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(featuresCol="scaled_features", labelCol="Default", maxIter=10, maxDepth=5)
gbt_model = gbt.fit(train_data)

# Make predictions
gbt_predictions = gbt_model.transform(test_data)

# Evaluate the model
roc_auc_gbt = evaluator.evaluate(gbt_predictions)
print(f"Gradient-Boosted Trees - ROC AUC: {roc_auc_gbt}")



Gradient-Boosted Trees - ROC AUC: 0.7311417017458507


                                                                                

# Model Evaluation

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Define evaluators
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="f1")

# Evaluate Logistic Regression
accuracy_lr = accuracy_evaluator.evaluate(lr_predictions)
precision_lr = precision_evaluator.evaluate(lr_predictions)
recall_lr = recall_evaluator.evaluate(lr_predictions)
f1_lr = f1_evaluator.evaluate(lr_predictions)

print(f"Logistic Regression - Accuracy: {accuracy_lr}, Precision: {precision_lr}, Recall: {recall_lr}, F1 Score: {f1_lr}")

# Evaluate Decision Tree
accuracy_dt = accuracy_evaluator.evaluate(dt_predictions)
precision_dt = precision_evaluator.evaluate(dt_predictions)
recall_dt = recall_evaluator.evaluate(dt_predictions)
f1_dt = f1_evaluator.evaluate(dt_predictions)

print(f"Decision Tree - Accuracy: {accuracy_dt}, Precision: {precision_dt}, Recall: {recall_dt}, F1 Score: {f1_dt}")

# Evaluate Random Forest
accuracy_rf = accuracy_evaluator.evaluate(rf_predictions)
precision_rf = precision_evaluator.evaluate(rf_predictions)
recall_rf = recall_evaluator.evaluate(rf_predictions)
f1_rf = f1_evaluator.evaluate(rf_predictions)

print(f"Random Forest - Accuracy: {accuracy_rf}, Precision: {precision_rf}, Recall: {recall_rf}, F1 Score: {f1_rf}")

# Evaluate Gradient-Boosted Trees
accuracy_gbt = accuracy_evaluator.evaluate(gbt_predictions)
precision_gbt = precision_evaluator.evaluate(gbt_predictions)
recall_gbt = recall_evaluator.evaluate(gbt_predictions)
f1_gbt = f1_evaluator.evaluate(gbt_predictions)

print(f"Gradient-Boosted Trees - Accuracy: {accuracy_gbt}, Precision: {precision_gbt}, Recall: {recall_gbt}, F1 Score: {f1_gbt}")

                                                                                

Logistic Regression - Accuracy: 0.8850444687158683, Precision: 0.8498232051102164, Recall: 0.8850444687158684, F1 Score: 0.8354146891676907


                                                                                

Decision Tree - Accuracy: 0.8822554220627243, Precision: 0.8370298576990229, Recall: 0.8822554220627243, F1 Score: 0.841067195133172


                                                                                

Random Forest - Accuracy: 0.8844008425651427, Precision: 0.7821648503299344, Recall: 0.8844008425651427, F1 Score: 0.8301469970319177




Gradient-Boosted Trees - Accuracy: 0.8852980184116087, Precision: 0.8498310312720517, Recall: 0.8852980184116087, F1 Score: 0.8382626093184488


                                                                                

# Cross validation

In [None]:
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# # Define parameter grid
# paramGrid = ParamGridBuilder() \
#     .addGrid(lr.regParam, [0.1, 0.01]) \
#     .addGrid(dt.maxDepth, [3, 5]) \
#     .addGrid(rf.numTrees, [50, 100]) \
#     .addGrid(gbt.maxIter, [5, 10]) \
#     .build()

# # Define cross-validator
# crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

# # Fit the model
# cvModel = crossval.fit(train_data)

# # Make predictions
# cv_predictions = cvModel.transform(test_data)

# # Evaluate the model
# roc_auc_cv = evaluator.evaluate(cv_predictions)
# print(f"Cross-Validated Model - ROC AUC: {roc_auc_cv}")

                                                                                

Cross-Validated Model - ROC AUC: 0.7380937269287152
