# PD Scoring Notebook

In [1]:
import pandas as pd
import joblib

# Restore the serialized training bundle from disk.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code, so only load bundles that come from a trusted source.
bundle = joblib.load("pd_pipeline.joblib")

# Pull out the trained pipeline together with the ID / target column names
# that were stored alongside it in the bundle.
pipeline, id_col, target_col = (
    bundle[key] for key in ("pipeline", "id_col", "target_col")
)

# Show what was loaded: the pipeline's type and the two column names.
print(type(pipeline))
print(id_col, target_col)

<class 'sklearn.pipeline.Pipeline'>
LoanID Default


In [2]:
# Load the complete loan dataset from CSV, report its (rows, columns)
# dimensions, and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="data/Loan_default.csv")
print((len(df.index), len(df.columns)))
df.head(n=5)

(255347, 18)


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [3]:
# Check that the dataset contains every column this scoring run depends on:
#   * the ID and target columns named in the model bundle, and
#   * the feature columns the pipeline recorded when it was fitted.
# Validating the features here turns a missing column into a clear error up
# front instead of an opaque failure inside predict_proba later on.
#
# `feature_names_in_` is only exposed when the pipeline's first step was
# fitted on a DataFrame (and on scikit-learn versions that provide it), so
# fall back to "no extra feature requirements" when it is unavailable.
expected_features = getattr(pipeline, "feature_names_in_", [])

required_cols = {id_col, target_col, *expected_features}
missing = required_cols - set(df.columns)
if missing:
    # Raises ValueError listing every absent column.
    raise ValueError(f"Missing required columns: {missing}")
print("Required columns are present")

Required columns are present


In [4]:
# Set the loan identifiers aside so they can be re-attached to the scores.
loan_ids = df.loc[:, id_col].copy()

# Build the scoring feature matrix by removing the target and ID columns
# (per the original note, the training notebook drops the same two columns).
X_all = df.drop([target_col, id_col], axis="columns").copy()

print("Scoring feature matrix shape: ", X_all.shape)
X_all.head(n=5)

Scoring feature matrix shape:  (255347, 16)


Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No


In [17]:
# Score every row with the trained pipeline.
# predict_proba returns one column per class; column index 1 is used as the
# risk score (presumably the probability of Default == 1 -- confirm against
# pipeline.classes_).
risk_score = pipeline.predict_proba(X_all)[:, 1]

# Brief summary of the generated scores: how many, and their range.
print("Scores generated: ", risk_score.shape[0])
print("Min score: ", risk_score.min())
print("Max score: ", risk_score.max())

Scores generated:  255347
Min score:  0.017107410254387882
Max score:  0.9585530023183334


In [19]:
# Assemble the scoring output: one row per loan, pairing each loan ID with
# its predicted risk score. `risk_score` was computed from the same rows, in
# the same order, as `loan_ids`, so the two line up positionally.
# NOTE: the ID column is labelled with the literal "LoanID" here rather than
# with `id_col`.
score_table = loan_ids.to_frame(name="LoanID").assign(risk_score=risk_score)

score_table.head(n=5)

Unnamed: 0,LoanID,risk_score
0,I38PQUQS96,0.182435
1,HPSK72WA7R,0.171085
2,C1OZ6DPJ8Y,0.657342
3,V2KKSFM3UN,0.587982
4,EY08JDHTZP,0.418393


In [25]:
# Sanity checks on the scoring output.
# The counts are printed for the reader AND asserted, so that a Run All stops
# here on bad output instead of relying on someone reading the numbers.
n_unique_ids = score_table["LoanID"].nunique()
n_rows = len(score_table)
n_missing_scores = score_table["risk_score"].isna().sum()

print("Unique LoanID count: ", n_unique_ids)
print("Total rows: ", n_rows)
print("Missing risk_score:", n_missing_scores)

# nunique() ignores missing values, so this also fails on a missing LoanID.
assert n_unique_ids == n_rows, "LoanID values must be unique and non-missing"
assert n_missing_scores == 0, "risk_score must not contain missing values"
# The scores are probabilities, so they must lie within [0, 1].
assert score_table["risk_score"].between(0, 1).all(), "risk_score outside [0, 1]"

# Distribution summary of the scores (displayed as the cell output).
score_table["risk_score"].describe()

Unique LoanID count:  255347
Total rows:  255347
Missing risk_score: 0


count    255347.000000
mean          0.429949
std           0.203914
min           0.017107
25%           0.264795
50%           0.416443
75%           0.584241
max           0.958553
Name: risk_score, dtype: float64

In [20]:
# Persist the score table as a CSV file; index=False keeps the DataFrame
# index out of the file, so only the table's own columns are written.
output_path = "data/loan_risk_scores.csv"
score_table.to_csv(path_or_buf=output_path, index=False)

# Confirm where the file went and how many rows it holds.
print(f"Saved: {output_path}")
print("Rows saved:", score_table.shape[0])

Saved: data/loan_risk_scores.csv
Rows saved: 255347
