In [28]:
import pandas as pd
import numpy as np
from credit_score_2.cs2_preprocessing import Cs2DataSetPreProcessing

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score, cross_validate
from sklearn.discriminant_analysis import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


In [30]:
credit_df = pd.read_csv("credit_score_dataset_2.csv", low_memory=False)

credit_df = Cs2DataSetPreProcessing.process(credit_df)

In [31]:
y = credit_df["credit_score"]
X = credit_df.drop("credit_score", axis=1)

y.unique()

array(['good', 'standard', 'poor'], dtype=object)

# Processing

## OneHotEnconding

In [32]:
categorical_cols = X.select_dtypes("object").columns

col_trans = ColumnTransformer(
    [
        ("ohe", OneHotEncoder(), categorical_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)


## Standard Scaler

In [33]:
scaler = StandardScaler()

# Feature Selection

In [34]:
rfe = RFE(estimator=LogisticRegression(), step=1)

# Models

## K-nearest neighbors

In [35]:
knn = KNeighborsClassifier()

knn_pipe = Pipeline(
    [
        ("cols_trans", col_trans),
        ("scaler", scaler),
        ("rfe", rfe),
        ("clf", knn)
    ]
)

param_grid = {
    "rfe__n_features_to_select": [6, 7, 8],
    "clf__n_neighbors": [2, 3, 5,],
}

grid_search_knn = GridSearchCV(
    knn_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1
)


outer_valid = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

results_knn = cross_validate(
    estimator=grid_search_knn,
    X=X,
    y=y,
    cv=outer_valid,
    return_estimator=True
)

print(f"accuracy: {results_knn['test_score']}")
print(f"accuracy: {results_knn['estimator'][0].best_params_}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits
accuracy: [0.79576]
accuracy: {'clf__n_neighbors': 2, 'rfe__n_features_to_select': 7}


In [36]:
mask_features = results_knn["estimator"][0].best_estimator_.named_steps["rfe"].get_support()
cols = results_knn["estimator"][0].best_estimator_.named_steps["cols_trans"].get_feature_names_out()

cols[mask_features]


array(['num_bank_accounts', 'num_credit_card', 'num_of_loan',
       'num_of_delayed_payment', 'outstanding_debt', 'credit_history_age',
       'total_emi_per_month'], dtype=object)

In [37]:
X_rfe = X[cols[mask_features]]

## Logistic Regression

In [38]:
logreg = SGDClassifier(loss="log_loss", max_iter=10000)

logreg_pipe = Pipeline(
    [
        ("scaler", scaler),
        ("logreg", logreg)
    ]
)

param_grid = {
    "logreg__alpha": np.logspace(-5, 1, 5),
    "logreg__l1_ratio": np.linspace(0, 1, 5),
}

grid_search_logreg = GridSearchCV(
    logreg_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1
)

outer_valid = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

results_log = cross_validate(
    estimator=grid_search_logreg,
    X=X_rfe,
    y=y,
    cv=5,
    return_estimator=True
)

print(f"accuracy: {results_log['test_score'].mean()}")
print(f"accuracy: {results_log['estimator'][0].best_params_}")



Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
accuracy: 0.5931599999999999
accuracy: {'logreg__alpha': np.float64(1e-05), 'logreg__l1_ratio': np.float64(0.5)}


## HistGradientBoostingClassifier


In [39]:
hgbc = HistGradientBoostingClassifier()

param_grid = {
    "clf__learning_rate": [0.1],
    "clf__max_iter": [150, 175, 200],
    "clf__max_leaf_nodes": [
        210,
        220,
        230,
    ],
    "clf__min_samples_leaf": [5, 7, 10],
}
hgbc_pipe = Pipeline(
    [
        ("clf", hgbc),
    ]
)

grid_search_hgbc = GridSearchCV(
    hgbc_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1
)

outer_valid = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

results = cross_validate(
    estimator=grid_search_hgbc, X=X_rfe, y=y, cv=outer_valid, return_estimator=True
)

print(f"accuracy: {results['test_score']}")
print(f"accuracy: {results['estimator'][0].best_params_}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
accuracy: [0.80052]
accuracy: {'clf__learning_rate': 0.1, 'clf__max_iter': 175, 'clf__max_leaf_nodes': 220, 'clf__min_samples_leaf': 7}


# Results

The HistGradientBoostingClassifier seems to be slightly better than the other models in terms of accuracy. But I will choose the KNN because it uses a fraction of the features due to the feature selection. This choice aims to enhance the user experience in the frontend - fewer questions, more chances that the user will answer, or less time it consumes.