In [4]:
import pandas as pd
from gplearn.genetic import SymbolicTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score

# Load the training table (first CSV column becomes the index) and force the
# target column to a numeric dtype; values that cannot be parsed become NaN.
df = pd.read_csv(
    "/kaggle/input/playground-series-s5e11/train.csv", index_col=0
).assign(
    loan_paid_back=lambda frame: pd.to_numeric(
        frame["loan_paid_back"], errors="coerce"
    )
)

# Target vector, and the feature matrix restricted to non-object columns.
y = df["loan_paid_back"]
X = df.drop(columns="loan_paid_back").select_dtypes(exclude=["object"])

In [5]:
def auc_metric(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    Any ``ValueError`` raised by ``roc_auc_score`` is swallowed and replaced
    by the chance-level score ``0.5``, so callers always get a number back.
    """
    try:
        score = roc_auc_score(y_true, y_pred)
    except ValueError:
        # Best-effort fallback: treat an unscorable input as "no better than chance".
        score = 0.5
    return score

In [7]:
# Two-step feature-construction pipeline: robustly scale the numeric inputs,
# then evolve symbolic expressions of them with gplearn's SymbolicTransformer.
#
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt
# before any generation is reported, so this configuration (population of
# 8000 over 40 generations) has not been observed to run to completion here.
gp_features = Pipeline(
    [
        ("Scaler", RobustScaler()),
        (
            "gp",
            SymbolicTransformer(
                generations=40,
                population_size=8000,
                # Keep the 400 fittest programs, from which the 20 output
                # components are selected.
                hall_of_fame=400,
                n_components=20,
                function_set=(
                    "add",
                    "sub",
                    "mul",
                    "div",
                    "sqrt",
                    "log",
                    "sin",
                    "cos",
                    "abs",
                    "neg",
                    "tan",
                ),
                # Fitness is Spearman rank correlation with the target, i.e.
                # programs are rewarded for monotonic association with y.
                # This is a correlation measure, not a class-separation (AUC)
                # objective; the `auc_metric` helper is not wired in here.
                metric="spearman",
                # The four genetic-operation probabilities below sum to 1.0.
                p_crossover=0.6,
                p_subtree_mutation=0.2,
                p_hoist_mutation=0.1,
                p_point_mutation=0.1,
                const_range=(-2, 2),
                parsimony_coefficient=0.002,
                # gplearn documents `feature_names` as a list of strings, so
                # pass a plain list rather than a pandas Index.
                feature_names=X.columns.tolist(),
                random_state=42,
                n_jobs=-1,
                verbose=1,
            ),
        ),
    ]
)

# Fit the pipeline and produce the evolved features in one pass; this avoids
# re-running the scaler on X a second time compared with fit() + transform().
X_train_new = gp_features.fit_transform(X, y)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


KeyboardInterrupt: 

In [None]:
# Show the first ten evolved programs held by the fitted "gp" pipeline step.
best_programs = gp_features.named_steps["gp"]._best_programs
for rank, expression in enumerate(best_programs[:10]):
    print(f"GP_{rank}: {expression}")


In [None]:
# Correlation of every feature column with the target, then the ten largest
# (signed) values, highest first.
corrs = X.corrwith(y)
top_corrs = corrs.sort_values(ascending=False).iloc[:10]
print(top_corrs)