In [1]:
from kafka import KafkaConsumer
import json
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("LoanPredictionConsumer") \
    .getOrCreate()

24/12/26 23:45:21 WARN Utils: Your hostname, dtdat resolves to a loopback address: 127.0.1.1; using 192.168.2.12 instead (on interface wlp0s20f3)
24/12/26 23:45:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/26 23:45:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
spark

In [3]:
schema = StructType([
    StructField("LoanID", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Income", IntegerType(), True),
    StructField("LoanAmount", IntegerType(), True),
    StructField("CreditScore", IntegerType(), True),
    StructField("MonthsEmployed", IntegerType(), True),
    StructField("NumCreditLines", IntegerType(), True),
    StructField("InterestRate", DoubleType(), True),
    StructField("LoanTerm", IntegerType(), True),
    StructField("DTIRatio", DoubleType(), True),
    StructField("Education", StringType(), True),
    StructField("EmploymentType", StringType(), True),
    StructField("MaritalStatus", StringType(), True),
    StructField("HasMortgage", StringType(), True),
    StructField("HasDependents", StringType(), True),
    StructField("LoanPurpose", StringType(), True),
    StructField("HasCoSigner", StringType(), True),
    StructField("Default", IntegerType(), True)
])

In [4]:
consumer = KafkaConsumer(
    'loan_application',
    bootstrap_servers='localhost:9092',
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    value_deserializer=lambda x: json.loads(x.decode('utf-8'))
)

In [5]:
from pyspark.ml.classification import LogisticRegressionModel
model_path = "/home/drissdo/Desktop/Scalable-Distributed-Systems/ML/model"
lr_model = LogisticRegressionModel.load(model_path)

In [6]:
def indexing_data(loan_data):
    education_map = {'High School':1, 'Bachelor':0, 'Master':2, 'PhD':3}
    employment_map = {'Full-time':3, 'Part-time':0, 'Self-employed':2, 'Unemployed':1}
    marital_status_map = {'Single': 2, 'Married': 0, 'Divorced': 1}
    has_mortgage_map = {'Yes': 1, 'No': 0}
    has_dependents_map = {'Yes': 1, 'No': 0}
    loan_purpose_map = {'Home':1, 'Other': 3, 'Education':2, 'Business':0, 'Auto': 4}
    has_cosigner_map = {'Yes': 1, 'No': 0}
    
    loan_data['Education'] = education_map[loan_data['Education']]
    loan_data['EmploymentType'] = employment_map[loan_data['EmploymentType']]
    loan_data['MaritalStatus'] = marital_status_map[loan_data['MaritalStatus']]
    loan_data['HasMortgage'] = has_mortgage_map[loan_data['HasMortgage']]
    loan_data['HasDependents'] = has_dependents_map[loan_data['HasDependents']]
    loan_data['LoanPurpose'] = loan_purpose_map[loan_data['LoanPurpose']]
    loan_data['HasCoSigner'] = has_cosigner_map[loan_data['HasCoSigner']]
    
    
    return loan_data
    

In [7]:
for message in consumer:
    loan_data = message.value
    
    print("Before")
    print(loan_data)
    
    indexed_data = indexing_data(loan_data= loan_data)
    
    print("-"*50)
    print("After")
    print(indexed_data)
    
    

Before
{'LoanID': '368bafbc-73e1-46e1-ab17-2f3e103fd225', 'Age': 66, 'Income': 47996, 'LoanAmount': 80890, 'CreditScore': 565, 'MonthsEmployed': 251, 'NumCreditLines': 9, 'InterestRate': 21.12, 'LoanTerm': 60, 'DTIRatio': 0.43, 'Education': 'Master', 'EmploymentType': 'Self-employed', 'MaritalStatus': 'Divorced', 'HasMortgage': 'No', 'HasDependents': 'No', 'LoanPurpose': 'Business', 'HasCoSigner': 'No'}
--------------------------------------------------
After
{'LoanID': '368bafbc-73e1-46e1-ab17-2f3e103fd225', 'Age': 66, 'Income': 47996, 'LoanAmount': 80890, 'CreditScore': 565, 'MonthsEmployed': 251, 'NumCreditLines': 9, 'InterestRate': 21.12, 'LoanTerm': 60, 'DTIRatio': 0.43, 'Education': 2, 'EmploymentType': 2, 'MaritalStatus': 1, 'HasMortgage': 0, 'HasDependents': 0, 'LoanPurpose': 0, 'HasCoSigner': 0}
Before
{'LoanID': 'd9c2827d-26c2-4189-a8c5-5a9857eb0b68', 'Age': 37, 'Income': 137515, 'LoanAmount': 25274, 'CreditScore': 449, 'MonthsEmployed': 174, 'NumCreditLines': 3, 'InterestRat

24/12/26 23:45:38 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


KeyboardInterrupt: 

In [8]:
loan_data = message.value

In [28]:
loan_data ={'LoanID': '7df5ac27-0ecd-4ad3-b28e-5bf2964e3ea6',
 'Age': 56,
 'Income': 85994,
 'LoanAmount': 50587,
 'CreditScore': 520,
 'MonthsEmployed': 80,
 'NumCreditLines': 4,
 'InterestRate': 15.23,
 'LoanTerm': 36,
 'DTIRatio': 0.44,
 'Education': 0,
 'EmploymentType': 3,
 'MaritalStatus': 1,
 'HasMortgage': 0,
 'HasDependents': 0,
 'LoanPurpose': 3,
 'HasCoSigner': 0}

In [29]:
data = df = spark.createDataFrame([loan_data])

In [30]:
data.columns

['Age',
 'CreditScore',
 'DTIRatio',
 'Education',
 'EmploymentType',
 'HasCoSigner',
 'HasDependents',
 'HasMortgage',
 'Income',
 'InterestRate',
 'LoanAmount',
 'LoanID',
 'LoanPurpose',
 'LoanTerm',
 'MaritalStatus',
 'MonthsEmployed',
 'NumCreditLines']

In [31]:
# Normalize numerical features
numerical_cols = [
    "Age", "Income", "LoanAmount", "CreditScore", "MonthsEmployed",
    "NumCreditLines", "InterestRate", "LoanTerm", "DTIRatio"
]

assembler = VectorAssembler(inputCols=numerical_cols, outputCol="features")
loan_data_assembled = assembler.transform(data)

scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
scaler_model = scaler.fit(loan_data_assembled)
loan_data_scaled = scaler_model.transform(loan_data_assembled)

In [32]:
loan_data_scaled.show()

+---+-----------+--------+---------+--------------+-----------+-------------+-----------+------+------------+----------+--------------------+-----------+--------+-------------+--------------+--------------+--------------------+--------------------+
|Age|CreditScore|DTIRatio|Education|EmploymentType|HasCoSigner|HasDependents|HasMortgage|Income|InterestRate|LoanAmount|              LoanID|LoanPurpose|LoanTerm|MaritalStatus|MonthsEmployed|NumCreditLines|            features|     scaled_features|
+---+-----------+--------+---------+--------------+-----------+-------------+-----------+------+------------+----------+--------------------+-----------+--------+-------------+--------------+--------------+--------------------+--------------------+
| 56|        520|    0.44|        0|             3|          0|            0|          0| 85994|       15.23|     50587|7df5ac27-0ecd-4ad...|          3|      36|            1|            80|             4|[56.0,85994.0,505...|[0.0,0.0,0.0,0.0,...|
+---

In [33]:
lr_predictions = lr_model.transform(loan_data_scaled)

In [35]:
lr_predictions.show()

+---+-----------+--------+---------+--------------+-----------+-------------+-----------+------+------------+----------+--------------------+-----------+--------+-------------+--------------+--------------+--------------------+--------------------+--------------------+--------------------+----------+
|Age|CreditScore|DTIRatio|Education|EmploymentType|HasCoSigner|HasDependents|HasMortgage|Income|InterestRate|LoanAmount|              LoanID|LoanPurpose|LoanTerm|MaritalStatus|MonthsEmployed|NumCreditLines|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---+-----------+--------+---------+--------------+-----------+-------------+-----------+------+------------+----------+--------------------+-----------+--------+-------------+--------------+--------------+--------------------+--------------------+--------------------+--------------------+----------+
| 56|        520|    0.44|        0|             3|          0|            0|          0| 8599

24/12/27 00:04:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
