In [1]:
import pandas as pd

df = pd.read_csv("../data/data.csv")
df.head()

Unnamed: 0,Age,Gender,Education,Income,Debt,Credit_Score,Loan_Amount,Loan_Term,Num_Credit_Cards,Payment_History,Employment_Status,Residence_Type,Marital_Status,Creditworthiness
0,56,Female,Master,149406,34089,581,49200,60,4,Bad,Unemployed,Rented,Single,1
1,69,Female,High School,78896,8626,648,20147,24,7,Good,Employed,Mortgaged,Married,1
2,46,Female,Master,119339,46281,329,41307,12,8,Bad,Unemployed,Owned,Single,1
3,32,Male,High School,131067,29403,816,19019,60,8,Bad,Employed,Owned,Single,1
4,60,Male,PhD,38001,30032,673,16317,36,4,Average,Employed,Rented,Married,0


In [2]:
target_col = "Creditworthiness"
feature_cols = [col for col in df.columns if col != target_col]

X = df[feature_cols]
y = df[target_col]

X.head(), y.head()

(   Age  Gender    Education  Income   Debt  Credit_Score  Loan_Amount  \
 0   56  Female       Master  149406  34089           581        49200   
 1   69  Female  High School   78896   8626           648        20147   
 2   46  Female       Master  119339  46281           329        41307   
 3   32    Male  High School  131067  29403           816        19019   
 4   60    Male          PhD   38001  30032           673        16317   
 
    Loan_Term  Num_Credit_Cards Payment_History Employment_Status  \
 0         60                 4             Bad        Unemployed   
 1         24                 7            Good          Employed   
 2         12                 8             Bad        Unemployed   
 3         60                 8             Bad          Employed   
 4         36                 4         Average          Employed   
 
   Residence_Type Marital_Status  
 0         Rented         Single  
 1      Mortgaged        Married  
 2          Owned         Single 

In [3]:
numeric_features = ["Age", "Income", "Debt", "Credit_Score",
                    "Loan_Amount", "Loan_Term", "Num_Credit_Cards"]
categorical_features = ["Gender", "Education", "Payment_History",
                        "Employment_Status", "Residence_Type", "Marital_Status"]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape

((9600, 13), (2400, 13))

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

rf_clf = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            n_jobs=-1,
            class_weight="balanced_subsample",
            random_state=42,
        )),
    ]
)

In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)
y_proba = rf_clf.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

Accuracy: 0.7020833333333333
ROC-AUC: 0.49860650072603185
              precision    recall  f1-score   support

           0       0.33      0.00      0.00       714
           1       0.70      1.00      0.82      1686

    accuracy                           0.70      2400
   macro avg       0.52      0.50      0.41      2400
weighted avg       0.59      0.70      0.58      2400



In [9]:
from sklearn.metrics import classification_report

y_proba = rf_clf.predict_proba(X_test)[:, 1]
for thr in [0.5, 0.4, 0.3, 0.2]:
    y_pred_thr = (y_proba > thr).astype(int)
    print(f"\nThreshold = {thr}")
    print(classification_report(y_test, y_pred_thr))



Threshold = 0.5
              precision    recall  f1-score   support

           0       0.33      0.00      0.00       714
           1       0.70      1.00      0.82      1686

    accuracy                           0.70      2400
   macro avg       0.52      0.50      0.41      2400
weighted avg       0.59      0.70      0.58      2400


Threshold = 0.4
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       714
           1       0.70      1.00      0.83      1686

    accuracy                           0.70      2400
   macro avg       0.35      0.50      0.41      2400
weighted avg       0.49      0.70      0.58      2400


Threshold = 0.3
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       714
           1       0.70      1.00      0.83      1686

    accuracy                           0.70      2400
   macro avg       0.35      0.50      0.41      2400
weighted avg       0.49

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
y_train.value_counts(), y_train.value_counts(normalize=True)


(Creditworthiness
 1    6745
 0    2855
 Name: count, dtype: int64,
 Creditworthiness
 1    0.702604
 0    0.297396
 Name: proportion, dtype: float64)

In [11]:
import numpy as np

y_proba = rf_clf.predict_proba(X_test)[:, 1]
np.percentile(y_proba, [0, 25, 50, 75, 90, 95, 99])


array([0.45      , 0.66666667, 0.70666667, 0.74333333, 0.77366667,
       0.79016667, 0.82      ])

The RandomForest model assigns very similar and relatively high probabilities to almost all samples (0.45â€“0.82). As a result, the classifier effectively predicts only the majority class (1) for all test cases, leading to high accuracy but ROC-AUC close to 0.5 and zero recall for the minority class (0). This suggests that, given the current synthetic dataset and features, the model cannot meaningfully discriminate between creditworthy and non-creditworthy clients.