In [11]:
!pip install pyspark
import pyspark
print(pyspark.__version__)
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("CreditScoreLogisticRegression") \
    .getOrCreate()
df = spark.read.csv(
    "/content/credit_data.csv",
    header=True,
    inferSchema=True
)

df.show(5)
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("CreditScoreLogisticRegression") \
    .getOrCreate()

print("Spark Session Created")
df = spark.read.csv(
    "/content/credit_data.csv",
    header=True,
    inferSchema=True
)

df.show(5)
df.printSchema()
import pandas as pd
import numpy as np

np.random.seed(42)
rows = 500

data = {
    "Income": np.random.randint(20000, 150000, rows),
    "Loan_Amount": np.random.randint(5000, 50000, rows),
    "Repayment_History": np.random.randint(0, 2, rows),  # 0=Bad, 1=Good
    "Credit_Utilization": np.random.uniform(0.1, 0.9, rows),
    "Credit_Score": np.random.randint(0, 2, rows)  # Target variable
}

df = pd.DataFrame(data)

df.to_csv("/content/credit_data.csv", index=False)

df.head()





!ls /content
df = spark.read.csv(
    "/content/credit_data.csv",
    header=True,
    inferSchema=True
)

df.show(5)
df.printSchema()

from pyspark.ml.feature import VectorAssembler
feature_columns = [
    "Income",
    "Loan_Amount",
    "Repayment_History",
    "Credit_Utilization"
]

assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features"
)

final_data = assembler.transform(df)




4.0.1
+------+-----------+-----------------+-------------------+------------+
|Income|Loan_Amount|Repayment_History| Credit_Utilization|Credit_Score|
+------+-----------+-----------------+-------------------+------------+
|141958|      40743|                1| 0.5999807896408419|           0|
| 35795|      11102|                1| 0.4021807579521355|           1|
| 20860|      24778|                1| 0.7692518506338664|           1|
|123694|      31641|                1| 0.5699628833872595|           1|
|148106|      39584|                0|0.33523574337677725|           0|
+------+-----------+-----------------+-------------------+------------+
only showing top 5 rows
Spark Session Created
+------+-----------+-----------------+-------------------+------------+
|Income|Loan_Amount|Repayment_History| Credit_Utilization|Credit_Score|
+------+-----------+-----------------+-------------------+------------+
|141958|      40743|                1| 0.5999807896408419|           0|
| 35795|    

In [16]:
# Print the class `df` is currently bound to, then display its row count as
# the cell's result.
print(type(df))
row_count = df.count()
row_count


<class 'pyspark.sql.classic.dataframe.DataFrame'>


500

In [17]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns, listed in the order they are passed to the assembler.
feature_columns = ["Income", "Loan_Amount", "Repayment_History", "Credit_Utilization"]

# Combine the predictor columns into a single column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Apply the assembler to the loaded data.
final_data = assembler.transform(df)


In [18]:
# Mark the assembled data for caching and run a count over it right away
# (presumably so the cache is populated before the later cells reuse it).
final_data.cache().count()

# Preview the assembled feature vector next to the label, untruncated.
feature_preview = final_data.select("features", "Credit_Score")
feature_preview.show(5, truncate=False)


+------------------------------------------+------------+
|features                                  |Credit_Score|
+------------------------------------------+------------+
|[141958.0,40743.0,1.0,0.5999807896408419] |0           |
|[35795.0,11102.0,1.0,0.4021807579521355]  |1           |
|[20860.0,24778.0,1.0,0.7692518506338664]  |1           |
|[123694.0,31641.0,1.0,0.5699628833872595] |1           |
|[148106.0,39584.0,0.0,0.33523574337677725]|0           |
+------------------------------------------+------------+
only showing top 5 rows


In [19]:
# 80/20 random split; the fixed seed keeps the partition repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = final_data.randomSplit(split_weights, seed=42)

# Report how many rows landed on each side of the split.
train_count = train_data.count()
test_count = test_data.count()
print("Training Records:", train_count)
print("Testing Records:", test_count)


Training Records: 426
Testing Records: 74


In [20]:
from pyspark.ml.classification import LogisticRegression

# Column wiring for the estimator: assembled vector in, Credit_Score as label.
lr_columns = {"featuresCol": "features", "labelCol": "Credit_Score"}
lr = LogisticRegression(**lr_columns)

# Fit on the training split only.
lr_model = lr.fit(train_data)

# Show the fitted parameters.
print("Model Coefficients:", lr_model.coefficients)
print("Model Intercept:", lr_model.intercept)


Model Coefficients: [5.003160858182272e-06,5.5006022454667904e-06,-0.32442134763711467,-0.6830453505744233]
Model Intercept: -0.1284263018797897


In [21]:
# Score the held-out split with the fitted model.
predictions = lr_model.transform(test_data)

# Preview the inputs, the true label, and the model's outputs.
preview_columns = ["features", "Credit_Score", "prediction", "probability"]
predictions.select(*preview_columns).show(5)


+--------------------+------------+----------+--------------------+
|            features|Credit_Score|prediction|         probability|
+--------------------+------------+----------+--------------------+
|[20769.0,45328.0,...|           1|       0.0|[0.51660050150436...|
|[21802.0,32355.0,...|           1|       0.0|[0.54925598533275...|
|[22200.0,26949.0,...|           0|       0.0|[0.65843141611176...|
|[22747.0,27386.0,...|           1|       0.0|[0.56913189284134...|
|[23561.0,10052.0,...|           1|       0.0|[0.69880449099728...|
+--------------------+------------+----------+--------------------+
only showing top 5 rows


In [22]:
# Score the held-out split again and show the label, predicted class, and
# class probabilities without truncating the probability vector.
predictions = lr_model.transform(test_data)

scored_preview = predictions.select("Credit_Score", "prediction", "probability")
scored_preview.show(5, truncate=False)


+------------+----------+----------------------------------------+
|Credit_Score|prediction|probability                             |
+------------+----------+----------------------------------------+
|1           |0.0       |[0.5166005015043691,0.4833994984956309] |
|1           |0.0       |[0.5492559853327502,0.4507440146672498] |
|0           |0.0       |[0.6584314161117661,0.34156858388823386]|
|1           |0.0       |[0.5691318928413431,0.43086810715865687]|
|1           |0.0       |[0.6988044909972809,0.3011955090027191] |
+------------+----------+----------------------------------------+
only showing top 5 rows


In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that compares the "prediction" column against the Credit_Score
# label using the "accuracy" metric.
accuracy_settings = {
    "metricName": "accuracy",
    "labelCol": "Credit_Score",
    "predictionCol": "prediction",
}
accuracy_evaluator = MulticlassClassificationEvaluator(**accuracy_settings)

# Evaluate on the scored test split.
accuracy = accuracy_evaluator.evaluate(predictions)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.4594594594594595


In [24]:
# Both evaluators read the same label and prediction columns; only the metric
# name differs.
shared_columns = {"labelCol": "Credit_Score", "predictionCol": "prediction"}

precision_evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision", **shared_columns
)
recall_evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall", **shared_columns
)

# Evaluate both metrics on the scored test split.
precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)

print("Precision:", precision)
print("Recall:", recall)


Precision: 0.45854531001589827
Recall: 0.45945945945945943


In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve, computed from the model's "rawPrediction" column
# against the Credit_Score label.
roc_settings = {
    "metricName": "areaUnderROC",
    "labelCol": "Credit_Score",
    "rawPredictionCol": "rawPrediction",
}
roc_evaluator = BinaryClassificationEvaluator(**roc_settings)

auc = roc_evaluator.evaluate(predictions)
print("Area Under ROC (AUC):", auc)


Area Under ROC (AUC): 0.5153508771929824


In [27]:
# Pair each fitted coefficient with the feature it belongs to.
# The names come from the assembler that built the "features" vector, so the
# labels follow the vector's column order instead of a second hand-typed list
# that could drift out of sync.
feature_names = assembler.getInputCols()

# zip() silently truncates on a length mismatch; fail loudly instead so a
# mislabelled report cannot go unnoticed.
assert len(feature_names) == len(lr_model.coefficients), (
    "feature name count does not match the number of model coefficients"
)

for name, coef in zip(feature_names, lr_model.coefficients):
    print(f"{name}: {coef}")


Income: 5.003160858182272e-06
Loan_Amount: 5.5006022454667904e-06
Repayment_History: -0.32442134763711467
Credit_Utilization: -0.6830453505744233


In [28]:
# Persist the fitted model, replacing anything already saved at this path.
model_writer = lr_model.write().overwrite()
model_writer.save("/content/credit_score_lr_model")

print("Model saved successfully")


Model saved successfully


In [29]:
from pyspark.ml.classification import LogisticRegressionModel

# Read the saved model back from disk into a separate variable.
saved_model_path = "/content/credit_score_lr_model"
loaded_model = LogisticRegressionModel.load(saved_model_path)

print("Model loaded successfully")


Model loaded successfully


In [30]:
from pyspark.sql import Row

# One hand-written applicant, using the same column names the assembler reads.
customer_record = Row(
    Income=85000,
    Loan_Amount=30000,
    Repayment_History=1,
    Credit_Utilization=0.45,
)
new_customer = spark.createDataFrame([customer_record])

# Build the feature vector with the existing assembler, then score the row
# with the model that was reloaded from disk.
new_customer_features = assembler.transform(new_customer)
prediction = loaded_model.transform(new_customer_features)

output_columns = ["features", "probability", "prediction"]
prediction.select(*output_columns).show(truncate=False)


+--------------------------+---------------------------------------+----------+
|features                  |probability                            |prediction|
+--------------------------+---------------------------------------+----------+
|[85000.0,30000.0,1.0,0.45]|[0.5423808934298081,0.4576191065701919]|0.0       |
+--------------------------+---------------------------------------+----------+

