In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from gplearn.genetic import SymbolicTransformer, SymbolicClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

# Load the feature table; the first CSV column is used as the row index.
df = pd.read_csv("kaggle/working/FeatureFrame.csv", index_col=0)

# Positional split: every column except the last is a feature, the last
# column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Absolute pairwise correlations across the whole frame (features and target).
# This matrix is only used for the "before" heatmap below.
corr = df.corr().abs()

# Redundancy filter, computed on the feature columns only. Building the mask
# from `df` would let the target column land in `to_drop` whenever it
# correlates > 0.9 with a feature, and `X.drop` would then raise KeyError
# because the target is not a column of X.
feature_corr = X.corr().abs()
# Strict upper triangle (k=1): each pair is examined once, diagonal excluded.
upper = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))
# A column is dropped when it correlates > 0.9 with any column to its left.
to_drop = [c for c in upper.columns if (upper[c] > 0.9).any()]
X_reduced = X.drop(columns=to_drop)

# Keep features whose absolute correlation with the target lies strictly
# between 0.05 and 0.6.
# NOTE(review): this selection looks at y over all rows, so any cross-validation
# run later on X_gp has already seen the held-out labels during feature
# selection; scores may be optimistic.
corr_target = X_reduced.corrwith(y).abs()
selected = corr_target[(corr_target > 0.05) & (corr_target < 0.6)].index
X_gp = X_reduced[selected]
corr2 = X_gp.corr().abs()

# Top panel: all columns before filtering. Bottom panel: selected features.
fig, ax = plt.subplots(2, 1, figsize=(15, 10))
sns.heatmap(corr, cmap="icefire", annot=True, fmt="0.2f", ax=ax[0])
ax[0].set_title("Absolute correlation: all columns")
sns.heatmap(corr2, cmap="icefire", annot=True, fmt="0.2f", ax=ax[1])
ax[1].set_title("Absolute correlation: selected features")
plt.show()

In [None]:
# Hyper-parameters for the genetic-programming feature constructor.
gpt_params = dict(
    # Search budget: 20 generations of 2000 candidate programs each.
    generations=20,
    population_size=2000,
    # From the 100 fittest final programs, 10 are returned as new features.
    hall_of_fame=100,
    n_components=10,
    # Building blocks the evolved formulas may combine.
    function_set=("add", "sub", "mul", "div", "sqrt", "log", "abs", "sin", "cos"),
    # Column labels so printed programs show feature names.
    feature_names=X_gp.columns,
    # Small penalty on program length to discourage bloated formulas.
    parsimony_coefficient=0.00001,
    # Fraction of the rows sampled when evaluating each program.
    max_samples=0.9,
    random_state=42,
    verbose=1,
    n_jobs=-1,
)
gpt = SymbolicTransformer(**gpt_params)

# Evolve the programs against the target.
gpt.fit(X_gp, y)

In [None]:
# Evaluate the evolved programs on the selected features; one output column
# per program.
X_train_new = gpt.transform(X_gp)

# List the formula behind each generated feature. The fitted transformer is
# iterated directly (it supports len() and integer indexing over its
# components) instead of reaching into the private `_best_programs` attribute.
for i, program in enumerate(gpt):
    print(f"GP_{i}: {program}")


In [None]:
import joblib
from pathlib import Path

# Persist the fitted transformer so it can be reloaded without re-running the
# evolution. The output directory is created first: joblib.dump opens the
# target file directly and fails if the parent directory is missing.
model_dir = Path("kaggle/working/best")
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(gpt, str(model_dir / "gpt.joblib"))


In [None]:
# Shuffled, seeded 5-fold split that preserves the class ratio in every fold.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Symbolic classifier evolved directly on the selected features, using the
# same search budget and function set as the transformer above.
gpc = SymbolicClassifier(
    generations=20,  # number of evolution cycles
    population_size=2000,  # candidate programs per generation
    function_set=("add", "sub", "mul", "div", "sqrt", "log", "abs", "sin", "cos"),
    feature_names=X_gp.columns,
    parsimony_coefficient=0.00001,  # penalty for overly complex formulas
    max_samples=0.9,  # fraction of rows sampled when evaluating each program
    random_state=42,
    verbose=1,
    n_jobs=-1,
)

# ROC-AUC per fold; the estimator is cloned and refit for every fold.
score_gpc = cross_val_score(gpc, X_gp, y, cv=cv, scoring="roc_auc")
# The label names the model actually evaluated (previously mislabelled "xgb").
print(
    f"SymbolicClassifier scores across folds: {score_gpc} - Mean Score: {score_gpc.mean():0.4f}"
)