In [None]:
import pandas as pd
from gplearn.genetic import SymbolicTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score

df = pd.read_csv("kaggle/input/playground-series-s5e11/train.csv", index_col=0)
df["loan_paid_back"] = pd.to_numeric(df["loan_paid_back"], errors="coerce")

X = df.drop(columns=["loan_paid_back"]).select_dtypes(exclude="object")
y = df["loan_paid_back"]

In [None]:
def auc_metric(y_true, y_pred):
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return 0.5

In [None]:
gp_features = Pipeline(
    [
        ("Scaler", RobustScaler()),
        (
            "gp",
            SymbolicTransformer(
                generations=40,
                population_size=8000,
                hall_of_fame=400,
                n_components=20,
                function_set=(
                    "add",
                    "sub",
                    "mul",
                    "div",
                    "sqrt",
                    "log",
                    "sin",
                    "cos",
                    "abs",
                    "neg",
                    "tan",
                ),
                metric=auc_metric,  # encourage class separation, not correlation
                p_crossover=0.6,
                p_subtree_mutation=0.2,
                p_hoist_mutation=0.1,
                p_point_mutation=0.1,
                const_range=(-2, 2),
                parsimony_coefficient=0.002,
                feature_names=X.columns,
                random_state=42,
                n_jobs=-1,
                verbose=1,
            ),
        ),
    ]
)
gp_features.fit(X, y)
X_train_new = gp_features.transform(X)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    10.39        0.0551272        3         0.393631              N/A     15.73m
   1     6.75         0.213401        6         0.399619              N/A     11.70m
   2     6.58           0.2916       12         0.400981              N/A     10.54m
   3     5.71         0.328647       13         0.404494              N/A      9.35m
   4     6.54         0.326746       13         0.404494              N/A      8.89m
   5     6.47         0.323891        9          0.40553              N/A      8.17m
   6     7.45          0.32348       18         0.406301              N/A      7.82m
   7     8.22         0.314442       17         0.406301              N/A      7.32m
   8     8.35          0.30969        9          0.40553              N/A  

In [6]:
for i, program in enumerate(gp_features.named_steps["gp"]._best_programs[:10]):
    print(f"GP_{i}: {program}")


GP_0: mul(sub(debt_to_income_ratio, sin(credit_score)), cos(cos(debt_to_income_ratio)))
GP_1: mul(sub(sin(credit_score), debt_to_income_ratio), cos(cos(debt_to_income_ratio)))
GP_2: mul(sub(sin(credit_score), debt_to_income_ratio), cos(cos(debt_to_income_ratio)))
GP_3: mul(sub(sin(credit_score), debt_to_income_ratio), cos(cos(debt_to_income_ratio)))
GP_4: mul(sub(sin(credit_score), debt_to_income_ratio), cos(cos(debt_to_income_ratio)))
GP_5: mul(sub(sin(credit_score), debt_to_income_ratio), cos(cos(debt_to_income_ratio)))
GP_6: mul(sub(sin(credit_score), debt_to_income_ratio), cos(cos(debt_to_income_ratio)))
GP_7: mul(sub(debt_to_income_ratio, sin(credit_score)), cos(cos(debt_to_income_ratio)))
GP_8: mul(sub(sin(credit_score), debt_to_income_ratio), cos(cos(debt_to_income_ratio)))
GP_9: mul(sub(debt_to_income_ratio, sin(credit_score)), cos(cos(debt_to_income_ratio)))


In [7]:
corrs = X.corrwith(y)
print(corrs.sort_values(ascending=False).head(10))

credit_score            0.234560
annual_income           0.006326
loan_amount            -0.003762
interest_rate          -0.131184
debt_to_income_ratio   -0.335680
dtype: float64
