In [1]:
!pip install gplearn

Collecting gplearn
  Downloading gplearn-0.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading gplearn-0.4.2-py3-none-any.whl (25 kB)
Installing collected packages: gplearn
Successfully installed gplearn-0.4.2


In [2]:
import pandas as pd
from gplearn.genetic import SymbolicTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score

# Load the training table (first CSV column becomes the index) and coerce the
# target to a numeric dtype; values that cannot be parsed become NaN.
df = pd.read_csv(
    "/kaggle/input/playground-series-s5e11/train.csv",
    index_col=0,
).assign(
    loan_paid_back=lambda frame: pd.to_numeric(
        frame["loan_paid_back"], errors="coerce"
    )
)

# Target vector first, then the feature matrix: every column except the
# target, restricted to non-object dtypes.
y = df["loan_paid_back"]
X = df.drop(columns=["loan_paid_back"]).select_dtypes(exclude="object")

In [3]:
def auc_metric(y_true, y_pred, sample_weight=None):
    """Return the ROC AUC of ``y_pred`` against ``y_true``, or 0.5 if undefined.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Scores to rank; higher should mean "more likely positive".
    sample_weight : array-like or None, default None
        Optional per-sample weights forwarded to ``roc_auc_score``. The third
        positional parameter also gives this function the
        ``(y, y_pred, sample_weight)`` shape that gplearn's ``make_fitness``
        expects, so it can be wrapped as a custom fitness.

    Returns
    -------
    float
        The ROC AUC, or the chance-level value 0.5 when the score cannot be
        computed (``roc_auc_score`` raised ``ValueError``, or it returned NaN).
    """
    import math  # local import: only needed for the NaN guard below

    try:
        score = roc_auc_score(y_true, y_pred, sample_weight=sample_weight)
    except ValueError:
        # Deliberate best-effort fallback: treat an uncomputable AUC as chance.
        return 0.5
    # NOTE(review): some scikit-learn releases appear to warn and return NaN
    # (instead of raising) when only one class is present; map that to the
    # same chance-level fallback so callers never see NaN.
    return 0.5 if math.isnan(score) else score

In [4]:
# Two-step feature pipeline: robustly scale the numeric columns, then let
# genetic programming evolve symbolic combinations of them.
gp_features = Pipeline(
    [
        ("Scaler", RobustScaler()),
        (
            "gp",
            SymbolicTransformer(
                generations=40,
                population_size=8000,
                hall_of_fame=400,
                n_components=20,
                function_set=(
                    "add",
                    "sub",
                    "mul",
                    "div",
                    "sqrt",
                    "log",
                    "sin",
                    "cos",
                    "abs",
                    "neg",
                    "tan",
                ),
                # Spearman IS a correlation, but a rank-based one: fitness
                # rewards programs whose output *ordering* tracks the target
                # (presumably binary here), i.e. monotone separation of the
                # classes, rather than a purely linear (Pearson) relationship.
                metric="spearman",
                # The four genetic-operation probabilities below sum to 1.0.
                p_crossover=0.6,
                p_subtree_mutation=0.2,
                p_hoist_mutation=0.1,
                p_point_mutation=0.1,
                const_range=(-2, 2),
                parsimony_coefficient=0.002,
                # Pass a plain list of str: that is the type gplearn documents
                # for this parameter, and it avoids holding a pandas Index as
                # an estimator hyperparameter.
                feature_names=list(X.columns),
                random_state=42,
                n_jobs=-1,
                verbose=1,
            ),
        ),
    ]
)

# Fit the scaler and the GP step on (X, y), and return the evolved features
# for the training rows in one pass.
X_train_new = gp_features.fit_transform(X, y)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     9.06        0.0579907        7         0.378191              N/A    366.39m
   1     5.58         0.176159        7         0.380079              N/A    273.14m
   2     4.60          0.24237        9         0.380871              N/A    254.42m
   3     4.48         0.275032       11         0.381121              N/A    259.83m
   4     3.26         0.298318        4         0.380044              N/A    248.68m
   5     3.26         0.294587        5         0.378191              N/A    242.40m
   6     3.28         0.295799        4         0.380044              N/A    234.67m
   7     3.22         0.297072        4         0.380044              N/A    223.61m
   8     3.22         0.296371        4         0.380044              N/A  

In [5]:
# Show the first ten programs kept by the fitted GP step, one per line.
best_programs = gp_features.named_steps["gp"]._best_programs
for i, program in zip(range(10), best_programs):
    print(f"GP_{i}: {program}")


GP_0: sub(debt_to_income_ratio, sin(credit_score))
GP_1: sub(sub(credit_score, debt_to_income_ratio), neg(-1.793))
GP_2: mul(neg(sub(credit_score, debt_to_income_ratio)), 0.726)
GP_3: sub(debt_to_income_ratio, credit_score)
GP_4: sub(debt_to_income_ratio, credit_score)
GP_5: sub(credit_score, debt_to_income_ratio)
GP_6: sub(credit_score, debt_to_income_ratio)
GP_7: sub(debt_to_income_ratio, credit_score)
GP_8: sub(debt_to_income_ratio, credit_score)
GP_9: sub(debt_to_income_ratio, credit_score)


In [6]:
# Correlation of every feature column with the target, listed from the
# highest value to the lowest (at most ten rows shown).
corrs = X.corrwith(y)
top_corrs = corrs.sort_values(ascending=False).iloc[:10]
print(top_corrs)

credit_score            0.234560
annual_income           0.006326
loan_amount            -0.003762
interest_rate          -0.131184
debt_to_income_ratio   -0.335680
dtype: float64
