# Raw data will go to Bronze layer

In [1]:
from minio import Minio
from minio.error import S3Error
import os

In [2]:
# Connect to minio cline
def get_minio_client():
    client = Minio(
        "localhost:9000",
        access_key = os.getenv("MINIO_USERNAME"),
        secret_key = os.getenv("MINIO_PASSWORD"),
        secure = False
    )
    return client

In [3]:
def upload_file(bucket_name, file_path):
    try:
        client = get_minio_client()

        # Check if the bucket exists, if not, create it
        if not client.bucket_exists(bucket_name):
            print(f"Bucket '{bucket_name}' does not exist. Creating it now.")
            client.make_bucket(bucket_name)

        # Ensure the file path is valid
        if not os.path.isfile(file_path):
            print(f"The file path '{file_path}' does not exist or is not a file.")
            return

        # Extract the file name from the file path
        file_name = os.path.basename(file_path)

        # Upload the file to the specified bucket
        client.fput_object(bucket_name, file_name, file_path)
        print(f"File '{file_name}' uploaded successfully to bucket '{bucket_name}'.")

    except S3Error as e:
        print(f"S3 Error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [4]:
# Upload raw data to bronze bucket

# upload_file("bronze", "/home/drissdo/Desktop/Scalable-Distributed-Systems/data/Loan_default.csv")

# Data Acquisition and Preprocessing

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, mean

# Run full cores
spark = SparkSession.builder \
    .appName("LoanDefaultPrediction") \
    .config("spark.hadoop.fs.s3a.access.key", "admin") \
    .config("spark.hadoop.fs.s3a.secret.key", "admin123") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")\
    .config("spark.jars", "/home/drissdo/Desktop/Scalable-Distributed-Systems/src/jars/aws-java-sdk-bundle-1.11.901.jar, /home/drissdo/Desktop/Scalable-Distributed-Systems/src/jars/hadoop-aws-3.3.1.jar")\
    .config("spark.hadoop.fs.s3a.path.style.access", "true")\
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\
    .getOrCreate()

24/12/11 14:14:32 WARN Utils: Your hostname, dtdat resolves to a loopback address: 127.0.1.1; using 192.168.2.12 instead (on interface wlp0s20f3)
24/12/11 14:14:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/12/11 14:14:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
data_path = "s3a://bronze/Loan_default.csv"

# Read the CSV file from S3
loan_data = spark.read.csv(data_path, inferSchema=True, header=True)

# Display the first 5 rows
loan_data.show(5)

24/12/11 14:14:35 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|  Education|EmploymentType|MaritalStatus|HasMortgage|HasDependents|LoanPurpose|HasCoSigner|Default|
+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+
|I38PQUQS96| 56| 85994|     50587|        520|            80|             4|       15.23|      36|    0.44| Bachelor's|     Full-time|     Divorced|        Yes|          Yes|      Other|        Yes|      0|
|HPSK72WA7R| 69| 50432|    124440|        458|            15|             1|        4.81|      60|    0.68|   Master's|     Full-time|      Married|         No|           N

In [6]:
loan_data.columns

['LoanID',
 'Age',
 'Income',
 'LoanAmount',
 'CreditScore',
 'MonthsEmployed',
 'NumCreditLines',
 'InterestRate',
 'LoanTerm',
 'DTIRatio',
 'Education',
 'EmploymentType',
 'MaritalStatus',
 'HasMortgage',
 'HasDependents',
 'LoanPurpose',
 'HasCoSigner',
 'Default']

In [7]:
loan_data.printSchema()

root
 |-- LoanID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- MonthsEmployed: integer (nullable = true)
 |-- NumCreditLines: integer (nullable = true)
 |-- InterestRate: double (nullable = true)
 |-- LoanTerm: integer (nullable = true)
 |-- DTIRatio: double (nullable = true)
 |-- Education: string (nullable = true)
 |-- EmploymentType: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- HasMortgage: string (nullable = true)
 |-- HasDependents: string (nullable = true)
 |-- LoanPurpose: string (nullable = true)
 |-- HasCoSigner: string (nullable = true)
 |-- Default: integer (nullable = true)



24/12/11 14:14:49 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


# Preprocess the Data

In [None]:
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql.functions import col
from pyspark.ml import Pipeline


imputer = Imputer(
    inputCols=["Income", "MonthsEmployed", "NumCreditLines", "InterestRate", "LoanTerm", "DTIRatio"],
    outputCols=["Income_filled", "MonthsEmployed_filled", "NumCreditLines_filled", "InterestRate_filled", "LoanTerm_filled", "DTIRatio_filled"]
)
loan_data_imputed = imputer.fit(loan_data).transform(loan_data)

string_indexers = [
    StringIndexer(inputCol="Education", outputCol="Education_index"),
    StringIndexer(inputCol="EmploymentType", outputCol="EmploymentType_index"),
    StringIndexer(inputCol="MaritalStatus", outputCol="MaritalStatus_index"),
    StringIndexer(inputCol="HasMortgage", outputCol="HasMortgage_index"),
    StringIndexer(inputCol="HasDependents", outputCol="HasDependents_index"),
    StringIndexer(inputCol="LoanPurpose", outputCol="LoanPurpose_index"),
    StringIndexer(inputCol="HasCoSigner", outputCol="HasCoSigner_index")
]

pipeline_indexers = Pipeline(stages=string_indexers)
loan_data_indexed = pipeline_indexers.fit(loan_data_imputed).transform(loan_data_imputed)


In [9]:
# One-hot encode indexed columns
one_hot_encoders = [
    OneHotEncoder(inputCol="Education_index", outputCol="Education_vec"),
    OneHotEncoder(inputCol="EmploymentType_index", outputCol="EmploymentType_vec"),
    OneHotEncoder(inputCol="MaritalStatus_index", outputCol="MaritalStatus_vec"),
    OneHotEncoder(inputCol="HasMortgage_index", outputCol="HasMortgage_vec"),
    OneHotEncoder(inputCol="HasDependents_index", outputCol="HasDependents_vec"),
    OneHotEncoder(inputCol="LoanPurpose_index", outputCol="LoanPurpose_vec"),
    OneHotEncoder(inputCol="HasCoSigner_index", outputCol="HasCoSigner_vec")
]

pipeline_encoders = Pipeline(stages=one_hot_encoders)
loan_data_encoded = pipeline_encoders.fit(loan_data_indexed).transform(loan_data_indexed)

# Normalize numerical features
numerical_cols = [
    "Age", "Income_filled", "LoanAmount", "CreditScore", "MonthsEmployed_filled",
    "NumCreditLines_filled", "InterestRate_filled", "LoanTerm_filled", "DTIRatio_filled"
]

assembler = VectorAssembler(inputCols=numerical_cols, outputCol="features")
loan_data_assembled = assembler.transform(loan_data_encoded)

scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
scaler_model = scaler.fit(loan_data_assembled)
loan_data_scaled = scaler_model.transform(loan_data_assembled)

                                                                                

In [10]:
loan_data_scaled.show(5)

24/12/11 14:15:11 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+-------------+---------------------+---------------------+-------------------+---------------+---------------+---------------+--------------------+-------------------+-----------------+-------------------+-----------------+-----------------+-------------+------------------+-----------------+---------------+-----------------+---------------+---------------+--------------------+--------------------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|  Education|EmploymentType|MaritalStatus|HasMortgage|HasDependents|LoanPurpose|HasCoSigner|Default|Income_filled|MonthsEmployed_filled|NumCreditLines_filled|InterestRate_filled|LoanTerm_filled|DTIRatio_filled|Education_index|EmploymentType_index|MaritalStatus_index|HasMortgage_index|Ha

In [11]:
# Save data to sliver bucket
silver_bucket_path = "s3a://sliver/preprocessed_loan_data"
loan_data_scaled.repartition(1).write.parquet(silver_bucket_path,  mode="overwrite")

                                                                                

In [14]:
from pyspark.sql.functions import col, rand


loan_data_scaled = spark.read.parquet(silver_bucket_path)

# Ensure the split maintains the balance of defaulted and non-defaulted loans
default_data = loan_data_scaled.filter(col("Default") == 1)
non_default_data = loan_data_scaled.filter(col("Default") == 0)

train_default, test_default = default_data.randomSplit([0.8, 0.2], seed=42)
train_non_default, test_non_default = non_default_data.randomSplit([0.8, 0.2], seed=42)

train_data = train_default.union(train_non_default)
test_data = test_default.union(test_non_default)

# Shuffle the data
train_data = train_data.orderBy(rand(seed=42))
test_data = test_data.orderBy(rand(seed=42))

In [15]:
test_data.show()



+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+-------------+---------------------+---------------------+-------------------+---------------+---------------+---------------+--------------------+-------------------+-----------------+-------------------+-----------------+-----------------+-------------+------------------+-----------------+---------------+-----------------+---------------+---------------+--------------------+--------------------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|  Education|EmploymentType|MaritalStatus|HasMortgage|HasDependents|LoanPurpose|HasCoSigner|Default|Income_filled|MonthsEmployed_filled|NumCreditLines_filled|InterestRate_filled|LoanTerm_filled|DTIRatio_filled|Education_index|EmploymentType_index|MaritalStatus_index|HasMortgage_index|Ha

                                                                                

# Logistic Regression

In [15]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = LogisticRegression(featuresCol="scaled_features", labelCol="Default", maxIter=10)
lr_model = lr.fit(train_data)

# Make predictions
lr_predictions = lr_model.transform(test_data)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="Default", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_auc_lr = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression - ROC AUC: {roc_auc_lr}")

                                                                                

Logistic Regression - ROC AUC: 0.7336549215886313


In [27]:
# Save the Logistic Regression model
model_path = "/home/drissdo/Desktop/Scalable-Distributed-Systems/model"

# Check if the path already exists and handle it (e.g., overwrite or raise an error)
import os
if os.path.exists(model_path):
    print(f"Path {model_path} already exists. Consider removing it or choosing a new path.")

# Save the model
lr_model.write().overwrite().save(model_path)


Path /home/drissdo/Desktop/Scalable-Distributed-Systems/model already exists. Consider removing it or choosing a new path.


In [31]:
from pyspark.ml.classification import LogisticRegressionModel

# Load the saved model
loadedModel = LogisticRegressionModel.load(model_path)

# Verify by making predictions using the loaded model
lr_predictions = loadedModel.transform(test_data)

evaluator = BinaryClassificationEvaluator(labelCol="Default", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_auc_lr = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression - ROC AUC: {roc_auc_lr}")

                                                                                

Logistic Regression - ROC AUC: 0.7336530388666921


# Decision Tree

In [16]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol="scaled_features", labelCol="Default", maxDepth=10)
dt_model = dt.fit(train_data)

# Make predictions
dt_predictions = dt_model.transform(test_data)

# Evaluate the model
roc_auc_dt = evaluator.evaluate(dt_predictions)
print(f"Decision Tree - ROC AUC: {roc_auc_dt}")



Decision Tree - ROC AUC: 0.4210816058940954


                                                                                

# Random Forest

In [17]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol="scaled_features", labelCol="Default", numTrees=100, maxDepth=5)
rf_model = rf.fit(train_data)

# Make predictions
rf_predictions = rf_model.transform(test_data)

# Evaluate the model
roc_auc_rf = evaluator.evaluate(rf_predictions)
print(f"Random Forest - ROC AUC: {roc_auc_rf}")



Random Forest - ROC AUC: 0.7071944814368916


                                                                                

# Gradient-Boosted Trees

In [18]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(featuresCol="scaled_features", labelCol="Default", maxIter=10, maxDepth=5)
gbt_model = gbt.fit(train_data)

# Make predictions
gbt_predictions = gbt_model.transform(test_data)

# Evaluate the model
roc_auc_gbt = evaluator.evaluate(gbt_predictions)
print(f"Gradient-Boosted Trees - ROC AUC: {roc_auc_gbt}")



Gradient-Boosted Trees - ROC AUC: 0.7311417017458507


                                                                                

# Model Evaluation

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Define evaluators
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="Default", predictionCol="prediction", metricName="f1")

# Evaluate Logistic Regression
accuracy_lr = accuracy_evaluator.evaluate(lr_predictions)
precision_lr = precision_evaluator.evaluate(lr_predictions)
recall_lr = recall_evaluator.evaluate(lr_predictions)
f1_lr = f1_evaluator.evaluate(lr_predictions)

print(f"Logistic Regression - Accuracy: {accuracy_lr}, Precision: {precision_lr}, Recall: {recall_lr}, F1 Score: {f1_lr}")

# Evaluate Decision Tree
accuracy_dt = accuracy_evaluator.evaluate(dt_predictions)
precision_dt = precision_evaluator.evaluate(dt_predictions)
recall_dt = recall_evaluator.evaluate(dt_predictions)
f1_dt = f1_evaluator.evaluate(dt_predictions)

print(f"Decision Tree - Accuracy: {accuracy_dt}, Precision: {precision_dt}, Recall: {recall_dt}, F1 Score: {f1_dt}")

# Evaluate Random Forest
accuracy_rf = accuracy_evaluator.evaluate(rf_predictions)
precision_rf = precision_evaluator.evaluate(rf_predictions)
recall_rf = recall_evaluator.evaluate(rf_predictions)
f1_rf = f1_evaluator.evaluate(rf_predictions)

print(f"Random Forest - Accuracy: {accuracy_rf}, Precision: {precision_rf}, Recall: {recall_rf}, F1 Score: {f1_rf}")

# Evaluate Gradient-Boosted Trees
accuracy_gbt = accuracy_evaluator.evaluate(gbt_predictions)
precision_gbt = precision_evaluator.evaluate(gbt_predictions)
recall_gbt = recall_evaluator.evaluate(gbt_predictions)
f1_gbt = f1_evaluator.evaluate(gbt_predictions)

print(f"Gradient-Boosted Trees - Accuracy: {accuracy_gbt}, Precision: {precision_gbt}, Recall: {recall_gbt}, F1 Score: {f1_gbt}")

                                                                                

Logistic Regression - Accuracy: 0.8850444687158683, Precision: 0.8498232051102164, Recall: 0.8850444687158684, F1 Score: 0.8354146891676907


                                                                                

Decision Tree - Accuracy: 0.8822554220627243, Precision: 0.8370298576990229, Recall: 0.8822554220627243, F1 Score: 0.841067195133172


                                                                                

Random Forest - Accuracy: 0.8844008425651427, Precision: 0.7821648503299344, Recall: 0.8844008425651427, F1 Score: 0.8301469970319177




Gradient-Boosted Trees - Accuracy: 0.8852980184116087, Precision: 0.8498310312720517, Recall: 0.8852980184116087, F1 Score: 0.8382626093184488


                                                                                

# Cross validation

In [None]:
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# # Define parameter grid
# paramGrid = ParamGridBuilder() \
#     .addGrid(lr.regParam, [0.1, 0.01]) \
#     .addGrid(dt.maxDepth, [3, 5]) \
#     .addGrid(rf.numTrees, [50, 100]) \
#     .addGrid(gbt.maxIter, [5, 10]) \
#     .build()

# # Define cross-validator
# crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

# # Fit the model
# cvModel = crossval.fit(train_data)

# # Make predictions
# cv_predictions = cvModel.transform(test_data)

# # Evaluate the model
# roc_auc_cv = evaluator.evaluate(cv_predictions)
# print(f"Cross-Validated Model - ROC AUC: {roc_auc_cv}")

                                                                                

Cross-Validated Model - ROC AUC: 0.7380937269287152
