In [8]:
import pandas as pd
import numpy as np

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.neighbors import KNeighborsClassifier



In [10]:
credit_df = pd.read_csv("credit_score_dataset_1.csv")


y = credit_df["Credit Score"]
X = credit_df.drop("Credit Score", axis=1)

y.unique()

array(['High', 'Average', 'Low'], dtype=object)

# Processing

## OneHotEnconding

In [11]:
categorical_cols = X.select_dtypes("object").columns

col_trans = ColumnTransformer(
    [
        ("ohe", OneHotEncoder(), categorical_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)


## Standard Scaler

In [12]:
scaler = StandardScaler()

# Models

## Logistic Regression

In [13]:
logreg = LogisticRegression(penalty="l1", dual=False, max_iter=10000, solver="liblinear", C=2)

logreg_pipe = Pipeline(
    [
        ("cols_trans", col_trans),
        ("scaler", scaler),
        ("linear_svc", logreg)
    ]
)
cv_score = cross_val_score(logreg_pipe, X, y, cv=5, scoring="accuracy", )

print(cv_score.mean())


0.9753787878787878


In [14]:
logreg_pipe.fit(X, y)

ohe_feature_names = logreg_pipe.named_steps['cols_trans'].named_transformers_['ohe'].get_feature_names_out()
all_feature_names = np.concatenate([ohe_feature_names, ['Value', 'value2', 'value3']])

coefficients = logreg_pipe.named_steps['linear_svc'].coef_

for class_idx, class_coefficients in enumerate(coefficients):
    removed_columns = all_feature_names[class_coefficients == 0]
    print(f"Removed columns for class {class_idx} by L1 regularization: {removed_columns}")

Removed columns for class 0 by L1 regularization: ['Education_Doctorate' 'Education_High School Diploma'
 'Marital Status_Married' 'Marital Status_Single' 'value3']
Removed columns for class 1 by L1 regularization: ["Education_Bachelor's Degree" 'Education_Doctorate'
 'Marital Status_Single' 'Home Ownership_Owned' 'Home Ownership_Rented']
Removed columns for class 2 by L1 regularization: ["Education_Associate's Degree" 'Education_Doctorate'
 "Education_Master's Degree" 'Marital Status_Married'
 'Marital Status_Single' 'Home Ownership_Owned' 'Home Ownership_Rented'
 'Value' 'value3']


## K-nearest neighbors

In [16]:
knn = KNeighborsClassifier(n_neighbors=3)

knn_pipe = Pipeline(
    [
        ("cols_trans", col_trans),
        ("scaler", scaler),
        ("clf", knn)
    ]
)
cv_score = cross_val_score(knn_pipe, X, y, cv=5, scoring="accuracy")

print(cv_score.mean())

0.9753787878787878


## HistGradientBoostingClassifier


In [26]:
hgbc = HistGradientBoostingClassifier(categorical_features=categorical_cols)

param_grid = {
    "clf__learning_rate": [0.1, 0.2, 0.3],
    "clf__max_iter": [
        50,
        100,
        200,
    ],
    "clf__max_leaf_nodes": [5, 10, 15, 31],
    "clf__min_samples_leaf": [2, 5, 10, 20],
}
hgbc_pipe = Pipeline([("clf", hgbc)])

grid_search = GridSearchCV(
    hgbc_pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2
)

outer_valid = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

results = cross_val_score(
    estimator=grid_search,
    X=X,
    y=y,
    cv=outer_valid
)

print(f"accuracy: {results}")

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=2; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=2; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=5; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=5; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=5; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=2; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=2; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_iter=50, clf__max_leaf_nodes=5, clf__min_samples_leaf=10; total time=

# Results

The HistGradientBoostingClassifier seems to be slightly better than the other models in terms of accuracy, but since it's not a meaningful improvement, I chose the Logistic Regression because it's parametric and thus easier to interpret.