In [1]:
import pandas as pd
from gplearn.genetic import SymbolicTransformer

df = pd.read_csv("kaggle/input/playground-series-s5e11/train.csv", index_col=0)
df["loan_paid_back"] = pd.to_numeric(df["loan_paid_back"], errors="coerce")

X = df.drop(columns=["loan_paid_back"]).select_dtypes(exclude="object")
y = df["loan_paid_back"]

In [2]:
gp_features = SymbolicTransformer(
    generations=20,
    population_size=5000,
    hall_of_fame=500,
    n_components=15,   # number of new features to keep
    tournament_size=20,
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos', 'abs'),
    parsimony_coefficient=0.0005,
    feature_names=X.columns,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
gp_features.fit(X, y)
X_train_new = gp_features.transform(X)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    10.29          0.07236        7         0.397683              N/A     38.17m
   1     6.06         0.267148        7         0.397683              N/A     29.61m
   2     6.70         0.295088        6         0.398573              N/A     28.53m
   3     7.62         0.292573       15         0.400075              N/A     28.34m
   4     8.02         0.296204       19         0.400076              N/A     26.92m
   5     7.06         0.305504       23         0.400076              N/A     23.74m
   6     6.01         0.302171        7         0.398778              N/A     21.49m
   7     5.51          0.30512        6         0.398778              N/A     18.75m
   8     5.05         0.336816        6         0.398778              N/A  

In [3]:
for i, program in enumerate(gp_features._best_programs[:10]):
    print(f"GP_{i}: {program}")


GP_0: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_1: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_2: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_3: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_4: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_5: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_6: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_7: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_8: div(sub(debt_to_income_ratio, -0.391), credit_score)
GP_9: div(sub(debt_to_income_ratio, -0.391), credit_score)
