In [37]:
import pandas as pd
import numpy as np

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score, cross_validate
from sklearn.discriminant_analysis import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [39]:
from credit_score_1.cs1_preprocessing import Cs1DataSetPreProcessing



In [40]:
# Read the raw CSV and pass it straight through the project's preprocessing step.
credit_df = Cs1DataSetPreProcessing.process(pd.read_csv("credit_score_dataset_1.csv"))

# Split the frame into feature columns (X) and the credit_score target (y).
X = credit_df.drop(columns="credit_score")
y = credit_df["credit_score"]

# Display the distinct target labels.
y.unique()

array(['High', 'Average', 'Low'], dtype=object)

# Processing

## OneHotEncoding

In [41]:
# Object-dtype columns are treated as categorical and one-hot encoded;
# every other column is passed through unchanged.
categorical_cols = X.select_dtypes("object").columns

col_trans = ColumnTransformer(
    [
        # handle_unknown="ignore": a category that is missing from a training
        # fold but shows up in the matching validation fold is encoded as all
        # zeros instead of raising at transform time (the default is "error").
        ("ohe", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="passthrough",
    # Keep plain "<column>_<category>" names, without the "ohe__" prefix.
    verbose_feature_names_out=False,
)


## Standard Scaler

In [42]:
# Standardise every feature to zero mean and unit variance (defaults spelled out).
scaler = StandardScaler(with_mean=True, with_std=True)

## RFE

In [43]:
# Recursive feature elimination ranked by a default LogisticRegression,
# removing one feature per iteration.
rfe = RFE(estimator=LogisticRegression(), step=1)

# Models

## Logistic Regression

In [44]:
# Elastic-net logistic regression fitted with the "saga" solver.
# random_state pins saga's data shuffling so that reruns give the same scores.
logreg = LogisticRegression(
    dual=False,
    max_iter=10000,
    penalty="elasticnet",
    solver="saga",
    random_state=2,
)

# Encode -> scale -> select features -> classify.
logreg_pipe = Pipeline(
    [
        ("cols_trans", col_trans),
        ("scaler", scaler),
        ("rfe", rfe),
        ("logreg", logreg)
    ]
)

# 3 feature counts x 10 values of C x 5 l1 ratios = 150 candidates.
param_grid = {
    "rfe__n_features_to_select": [6, 7, 8],
    "logreg__C": np.linspace(0.1, 5, 10),
    "logreg__l1_ratio": np.linspace(0, 1, 5),
}

# Inner loop: 5-fold grid search over the candidates above.
grid_search_logreg = GridSearchCV(
    logreg_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2
)

# Outer loop: a single shuffled 80/20 split; the grid search is fitted on the
# 80% part and scored on the 20% hold-out.
outer_valid = ShuffleSplit(n_splits=1, test_size=0.20, random_state=2)

results_logreg = cross_validate(
    estimator=grid_search_logreg,
    X=X,
    y=y,
    cv=outer_valid,
    return_estimator=True
)

print(f"accuracy: {results_logreg['test_score']}")
# This line reports the selected hyper-parameters, not an accuracy.
print(f"best params: {results_logreg['estimator'][0].best_params_}")


Fitting 5 folds for each of 150 candidates, totalling 750 fits


[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=6; total time=   0.1s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=6; total time=   0.1s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=6; total time=   0.1s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=7; total time=   0.0s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=6; total time=   0.1s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=7; total time=   0.0s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=8; total time=   0.0s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=8; total time=   0.0s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=8; total time=   0.0s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_features_to_select=6; total time=   0.1s
[CV] END logreg__C=0.1, logreg__l1_ratio=0.0, rfe__n_feature

In [45]:
# Best pipeline found by the grid search on the single outer split.
best_logreg_pipe = results_logreg["estimator"][0].best_estimator_

# Feature names produced by the column transformer, and the boolean mask of
# the features that RFE kept.
cols = best_logreg_pipe.named_steps["cols_trans"].get_feature_names_out()
mask_features = best_logreg_pipe.named_steps["rfe"].get_support()

# Names of the selected features.
cols[mask_features]

array(['gender_Female', 'gender_Male', "education_Associate's Degree",
       'education_High School Diploma', "education_Master's Degree",
       'age', 'income'], dtype=object)

## K-nearest neighbors

In [46]:
# 3-nearest-neighbours classifier on the encoded, standardised features.
knn = KNeighborsClassifier(n_neighbors=3)

knn_pipe = Pipeline(
    steps=[
        ("cols_trans", col_trans),
        ("scaler", scaler),
        ("clf", knn),
    ]
)

# Accuracy on each of 5 cross-validation folds; the mean is reported below.
cv_score = cross_val_score(estimator=knn_pipe, X=X, y=y, scoring="accuracy", cv=5)

print(cv_score.mean())

0.9753787878787878


## HistGradientBoostingClassifier


In [47]:
# Gradient boosting that is told which columns are categorical, so this
# pipeline has no separate encoding or scaling step.
hgbc = HistGradientBoostingClassifier(categorical_features=categorical_cols)

# 3 learning rates x 3 iteration counts x 4 leaf limits x 4 leaf sizes = 144 candidates.
param_grid = {
    "clf__learning_rate": [0.1, 0.2, 0.3],
    "clf__max_iter": [
        50,
        100,
        200,
    ],
    "clf__max_leaf_nodes": [5, 10, 15, 31],
    "clf__min_samples_leaf": [2, 5, 10, 20],
}
hgbc_pipe = Pipeline([("clf", hgbc)])

# Inner loop: 5-fold grid search over the candidates above.
grid_search = GridSearchCV(
    hgbc_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2
)

# Outer loop: same single shuffled 80/20 split (test_size and seed) as the
# logistic-regression evaluation, so both accuracies are measured on the
# same hold-out rows and can be compared directly.
outer_valid = ShuffleSplit(n_splits=1, test_size=0.20, random_state=2)

results = cross_val_score(
    estimator=grid_search,
    X=X,
    y=y,
    cv=outer_valid
)

print(f"accuracy: {results}")

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=2; total time=   0.1s[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=5; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=2; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=10; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=5; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=10; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=5; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=5; total time=

# Results

The HistGradientBoostingClassifier appears to be slightly more accurate than the other models, but the improvement is not meaningful, so I chose Logistic Regression: it is a parametric model and therefore easier to interpret.