In [None]:
import pandas as pd

# Training data location; the path has no leading slash, so it is resolved
# relative to the notebook's working directory.
TRAIN_CSV_PATH = "kaggle/input/playground-series-s5e11/train.csv"

df = pd.read_csv(TRAIN_CSV_PATH)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Coerce the target to a numeric dtype; entries that cannot be parsed become NaN.
df["loan_paid_back"] = pd.to_numeric(df["loan_paid_back"], errors="coerce")

# A NaN target cannot be used for fitting (scikit-learn's check_X_y rejects
# non-finite y), so keep only the rows whose target parsed successfully.
# X and y are both derived from this filtered frame so they stay row-aligned.
labelled_rows = df.dropna(subset=["loan_paid_back"])

X = labelled_rows.drop(columns=["loan_paid_back"])
y = labelled_rows["loan_paid_back"]

# Keep numeric feature columns only (categorical columns are dropped, not
# encoded). "number" matches every numeric dtype, not just int64/float64.
# NOTE(review): missing values in the remaining features are not handled
# here — confirm the numeric columns are complete before fitting.
X = X.select_dtypes(include="number").copy()

# ---- 2. Patch gplearn for new scikit-learn versions ----
import gplearn.genetic
from sklearn.utils.validation import check_X_y

def _validate_data(self, X, y, y_numeric=True):
    """Validate ``X`` and ``y`` for a gplearn estimator and record the feature count.

    Stand-in for the ``_validate_data`` method that gplearn expects to find on
    its estimators.

    Parameters
    ----------
    X : array-like
        Feature matrix; forwarded to ``check_X_y``.
    y : array-like
        Target values; forwarded to ``check_X_y``.
    y_numeric : bool, default=True
        Forwarded to ``check_X_y`` unchanged.

    Returns
    -------
    tuple
        The validated ``(X, y)`` pair returned by ``check_X_y``.
    """
    X, y = check_X_y(X, y, y_numeric=y_numeric)
    # Store the number of feature columns seen during fit so the fitted
    # estimator carries `n_features_in_` itself and does not need the
    # attribute to be set by hand after every fit.
    self.n_features_in_ = X.shape[1]
    return X, y


# Install the replacement on the shared base class so every gplearn estimator
# (classifier, regressor, transformer) picks it up.
gplearn.genetic.BaseSymbolic._validate_data = _validate_data

# ---- 3. Split data ----
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---- 4. Define Symbolic Classifier ----
from gplearn.genetic import SymbolicClassifier

# Primitive operations the evolved programs are allowed to use.
gp_function_set = (
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'abs', 'neg',
    'sin', 'cos', 'tan',
)

# All classifier settings are collected in one mapping so they can be read
# and adjusted in a single place before the estimator is built.
gp_settings = {
    "population_size": 2000,
    "generations": 20,
    "tournament_size": 20,
    "stopping_criteria": 0.01,
    "const_range": (-1, 1),
    "init_depth": (2, 6),
    "function_set": gp_function_set,
    "parsimony_coefficient": 0.001,
    "verbose": 1,
    "n_jobs": -1,
    "random_state": 42,
}

gp = SymbolicClassifier(**gp_settings)

# ---- 5. Fit and evaluate ----
gp.fit(X_train, y_train)

# Patch missing attribute for sklearn >=1.3 compatibility: only set it when
# fitting did not already leave `n_features_in_` on the estimator.
if not hasattr(gp, "n_features_in_"):
    gp.n_features_in_ = X_train.shape[1]

y_pred = gp.predict(X_test)

# `gp` is a SymbolicClassifier, so its predictions are class labels. Score
# them with a classification metric (share of exactly matching labels)
# instead of R2, which is a regression score and is not meaningful here.
accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy)
print("Best expression:", gp._program)


KeyboardInterrupt: 

In [None]:
# Share of hold-out rows whose predicted label equals the true label.
matches = y_pred == y_test
print("Accuracy:", matches.mean())

In [None]:
import pandas as pd
from gplearn.genetic import SymbolicTransformer
from sklearn.model_selection import train_test_split

# Primitive operations available to the evolved feature programs.
feature_function_set = (
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'sin', 'cos', 'abs',
)

# Transformer settings gathered in one mapping before construction.
feature_settings = {
    "generations": 1,
    "population_size": 3000,
    "hall_of_fame": 100,
    "n_components": 15,  # number of new features to keep
    "function_set": feature_function_set,
    "parsimony_coefficient": 0.0005,
    "random_state": 42,
    "n_jobs": -1,
    "verbose": 1,
}

gp_features = SymbolicTransformer(**feature_settings)

# --- Learn the feature programs from the training split only ---
gp_features.fit(X_train, y_train)

# sklearn >=1.3 compatibility: supply `n_features_in_` when fitting did not set it.
try:
    gp_features.n_features_in_
except AttributeError:
    gp_features.n_features_in_ = X_train.shape[1]

# --- Apply the fitted programs to both splits to obtain the new columns ---
X_train_new, X_test_new = (
    gp_features.transform(split) for split in (X_train, X_test)
)

In [None]:
# Show the first ten programs retained by the fitted transformer.
top_programs = gp_features._best_programs[:10]
for i, program in zip(range(len(top_programs)), top_programs):
    print(f"GP_{i}: {program}")
