In [1]:
import pandas as pd
import numpy as np
from processing import Cs2DataSetPreProcessing

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score, cross_validate
from sklearn.discriminant_analysis import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


In [3]:
credit_df = pd.read_csv("credit_score_dataset_2.csv", low_memory=False)

credit_df = Cs2DataSetPreProcessing.process(credit_df)

In [4]:
y = credit_df["Credit_Score"]
X = credit_df.drop("Credit_Score", axis=1)

y.unique()

array(['Good', 'Standard', 'Poor'], dtype=object)

# Processing

## OneHotEnconding

In [5]:
categorical_cols = X.select_dtypes("object").columns

col_trans = ColumnTransformer(
    [
        ("ohe", OneHotEncoder(), categorical_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)


## Standard Scaler

In [6]:
scaler = StandardScaler()

# Feature Selection

In [7]:
rfe = RFE(estimator=LogisticRegression(), step=1)

# Models

## Logistic Regression

In [33]:
logreg = SGDClassifier(loss="log_loss", max_iter=10000)

logreg_pipe = Pipeline(
    [
        ("scaler", scaler),
        ("logreg", logreg)
    ]
)

param_grid = {
    "logreg__alpha": np.logspace(-5, 1, 5),
    "logreg__l1_ratio": np.linspace(0, 1, 5),
}

grid_search_logreg = GridSearchCV(
    logreg_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1
)

outer_valid = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

results_log = cross_validate(
    estimator=grid_search_logreg,
    X=X_rfe,
    y=y,
    cv=5,
    return_estimator=True
)

print(f"accuracy: {results_log['test_score'].mean()}")
print(f"accuracy: {results_log['estimator'][0].best_params_}")



accuracy: 0.58902
accuracy: {'logreg__alpha': np.float64(1e-05), 'logreg__l1_ratio': np.float64(0.0)}


## K-nearest neighbors

In [9]:
knn = KNeighborsClassifier()

knn_pipe = Pipeline(
    [
        ("cols_trans", col_trans),
        ("scaler", scaler),
        ("rfe", rfe),
        ("clf", knn)
    ]
)

param_grid = {
    "rfe__n_features_to_select": [6, 7, 8],
    "clf__n_neighbors": [2, 3, 5,],
}

grid_search_knn = GridSearchCV(
    knn_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2
)


outer_valid = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

results_knn = cross_validate(
    estimator=grid_search_knn,
    X=X,
    y=y,
    cv=outer_valid,
    return_estimator=True
)

print(f"accuracy: {results_knn['test_score']}")
print(f"accuracy: {results_knn['estimator'][0].best_params_}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=6; total time=   8.3s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=6; total time=   8.6s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=7; total time=   8.5s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=7; total time=   9.1s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=7; total time=   9.4s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=6; total time=  10.4s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=6; total time=  10.3s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=8; total time=   9.7s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=6; total time=  10.8s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=8; total time=  10.0s
[CV] END ....clf__n_neighbors=2, rfe__n_features_to_select=7; total time=  10.6s
[CV] END ....clf__n_neighbors=2, rfe__n_features_

In [10]:
mask_features = results_knn["estimator"][0].best_estimator_.named_steps["rfe"].get_support()
cols = results_knn["estimator"][0].best_estimator_.named_steps["cols_trans"].get_feature_names_out()

cols[mask_features]


array(['Num_Bank_Accounts', 'Num_Credit_Card', 'Num_of_Loan',
       'Num_of_Delayed_Payment', 'Outstanding_Debt', 'Credit_History_Age',
       'Total_EMI_per_month'], dtype=object)

## HistGradientBoostingClassifier


In [12]:
X_rfe = X[cols[mask_features]]

In [20]:
hgbc = HistGradientBoostingClassifier()

param_grid = {
    "clf__learning_rate": [0.1],
    "clf__max_iter": [150, 175, 200],
    "clf__max_leaf_nodes": [
        210,
        220,
        230,
    ],
    "clf__min_samples_leaf": [5, 7, 10],
}
hgbc_pipe = Pipeline(
    [
        ("clf", hgbc),
    ]
)

grid_search_hgbc = GridSearchCV(
    hgbc_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2
)

outer_valid = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

results = cross_validate(
    estimator=grid_search_hgbc, X=X_rfe, y=y, cv=outer_valid, return_estimator=True
)

print(f"accuracy: {results['test_score']}")
print(f"accuracy: {results['estimator'][0].best_params_}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END clf__learning_rate=0.1, clf__max_iter=150, clf__max_leaf_nodes=210, clf__min_samples_leaf=7; total time=   8.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=150, clf__max_leaf_nodes=210, clf__min_samples_leaf=7; total time=   8.2s
[CV] END clf__learning_rate=0.1, clf__max_iter=150, clf__max_leaf_nodes=210, clf__min_samples_leaf=5; total time=   8.5s
[CV] END clf__learning_rate=0.1, clf__max_iter=150, clf__max_leaf_nodes=210, clf__min_samples_leaf=7; total time=   8.5s
[CV] END clf__learning_rate=0.1, clf__max_iter=150, clf__max_leaf_nodes=210, clf__min_samples_leaf=5; total time=   8.7s
[CV] END clf__learning_rate=0.1, clf__max_iter=150, clf__max_leaf_nodes=210, clf__min_samples_leaf=5; total time=   8.7s
[CV] END clf__learning_rate=0.1, clf__max_iter=150, clf__max_leaf_nodes=210, clf__min_samples_leaf=7; total time=   8.7s
[CV] END clf__learning_rate=0.1, clf__max_iter=150, clf__max_leaf_nodes=210, clf__min_sampl

# Results

The HistGradientBoostingClassifier seems to be slightly better than the other models in terms of accuracy.