In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gplearn.genetic import SymbolicTransformer, SymbolicClassifier
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline

from i import svg_dir, data_dir, model_dir, submissions_dir, model_pred_train_dir, model_pred_test_dir
from i import FeaturesToNumericalPipeline

# Shared 5-fold stratified splitter; shuffling is seeded so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Optional shortcut: restore previously persisted models instead of refitting.
# gpt = joblib.load(model_dir + "gpt.joblib")
# gpc = joblib.load(model_dir + "gpc.joblib")

# The first CSV column becomes the index of each frame.
train = pd.read_csv(data_dir + "train.csv", index_col=0)
test = pd.read_csv(data_dir + "test.csv", index_col=0)

# Re-type every object-dtype column as categorical, in place, for both frames.
for frame in (train, test):
    for col in frame.select_dtypes(include="object").columns:
        frame[col] = frame[col].astype("category")

# The last column of `train` is used as the target; every column before it is a feature.
X, y = train.iloc[:, :-1], train.iloc[:, -1]
X_pred = test

In [None]:
# Numeric encoding of the training features (fits the imported feature pipeline on X, y).
X_heat = FeaturesToNumericalPipeline.fit_transform(X, y)

# Absolute pairwise correlations. Only the strict upper triangle is kept, so each
# pair is inspected once and the diagonal is ignored.
corr = X_heat.corr().abs()
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(upper_mask)

# A column is dropped when it correlates above 0.9 with any column that precedes it.
high_corr = (upper > 0.9).any(axis=0)
to_drop = high_corr[high_corr].index.tolist()
X_reduced = X_heat.drop(columns=to_drop)

# Keep features whose absolute correlation with the target lies strictly inside (0.05, 0.6).
corr_target = X_reduced.corrwith(y).abs()
in_band = corr_target.gt(0.05) & corr_target.lt(0.6)
selected_columns = corr_target.loc[in_band].index
X_gp = X_reduced[selected_columns]
corr2 = X_gp.corr().abs()

# Top panel: all encoded features; bottom panel: the selected subset.
fig, ax = plt.subplots(2, 1, figsize=(15, 10))
for axis, matrix in zip(ax, (corr, corr2)):
    sns.heatmap(matrix, cmap="icefire", annot=True, fmt="0.2f", ax=axis)
fig.savefig(svg_dir + "heatmap.svg")
plt.show()

In [None]:
# Genetic-programming feature generator: the numeric feature pipeline restricted
# to `selected_columns`, followed by gplearn's SymbolicTransformer.
gpt = Pipeline(
    [
        # set_params() mutates the estimator it is called on and returns that same
        # object. Configure an unfitted clone so the imported
        # FeaturesToNumericalPipeline (used as-is by the heatmap cell) is left
        # untouched and this pipeline owns its own "features" step.
        ("features", clone(FeaturesToNumericalPipeline).set_params(selector__columns=selected_columns)),
        (
            "gpt",
            SymbolicTransformer(
                generations=20,  # number of evolution cycles
                population_size=2000,  # number of candidate formulas per generation
                hall_of_fame=100,  # fittest 100 formulas considered for the final components
                n_components=10,  # how many new features to produce
                function_set=("add", "sub", "mul", "div", "sqrt", "log", "abs", "sin", "cos"),
                # gplearn documents feature_names as a list of strings; pass a plain
                # list rather than a pandas Index.
                feature_names=list(selected_columns),
                parsimony_coefficient=0.00001,  # penalty for overly complex formulas
                max_samples=0.9,  # fraction of rows each formula is evaluated on
                random_state=42,
                verbose=1,
                n_jobs=-1,
            ),
        ),
    ]
)

# Fit on the raw training features and persist the fitted pipeline.
gpt.fit(X, y)
joblib.dump(gpt, model_dir + "gpt.joblib")

In [None]:
# Genetic-programming classifier: the numeric feature pipeline restricted to
# `selected_columns`, followed by gplearn's SymbolicClassifier.
gpc = Pipeline(
    [
        # set_params() mutates the estimator it is called on and returns that same
        # object. Configure an unfitted clone so the imported
        # FeaturesToNumericalPipeline is left untouched and this pipeline owns
        # its own "features" step.
        ("features", clone(FeaturesToNumericalPipeline).set_params(selector__columns=selected_columns)),
        (
            "gpc",
            SymbolicClassifier(
                generations=20,  # number of evolution cycles
                population_size=2000,  # number of candidate formulas per generation
                function_set=("add", "sub", "mul", "div", "sqrt", "log", "abs", "sin", "cos"),
                # gplearn documents feature_names as a list of strings; pass a plain
                # list rather than a pandas Index.
                feature_names=list(selected_columns),
                parsimony_coefficient=0.00001,  # penalty for overly complex formulas
                max_samples=0.9,  # fraction of rows each formula is evaluated on
                random_state=42,
                verbose=1,
                n_jobs=-1,
            ),
        ),
    ]
)

# Cross-validate on the raw features X, the same input the pipeline is fitted on
# below. Scoring on the pre-transformed X_gp would feed already-encoded data into
# the "features" step and reuse preprocessing fitted on the full training set,
# leaking information across folds.
score_gpc = cross_val_score(gpc, X, y, cv=cv, scoring="roc_auc")
print(f"gpc scores across folds: {score_gpc} - Mean Score: {score_gpc.mean():0.4f}")

# Final fit on all training rows, then persist the fitted pipeline.
gpc.fit(X, y)
joblib.dump(gpc, model_dir + "gpc.joblib")

In [None]:
# Hard class predictions for the test rows.
prediction = pd.DataFrame(
    data={"loan_paid_back": gpc.predict(X_pred)},
    index=X_pred.index,
)

# Out-of-fold probabilities for the training rows. cross_val_predict returns one
# row per row of X, so the frame must be indexed by X.index (not the test index).
# Column 1 of predict_proba is the probability of the second class in classes_.
probability_train = pd.DataFrame(
    data={
        "loan_paid_back_probability_GP": cross_val_predict(gpc, X, y, cv=cv, method="predict_proba")[:, 1]
    },
    index=X.index,
)

# Test-set probabilities from the fully fitted model. predict_proba is 2-D
# (n_samples, n_classes); take the same column as for the training frame so the
# value is a 1-D array that fits a single DataFrame column.
probability_test = pd.DataFrame(
    data={"loan_paid_back_probability_GP": gpc.predict_proba(X_pred)[:, 1]},
    index=X_pred.index,
)

# Persist the submission and the train/test probability frames.
prediction.to_parquet(submissions_dir + "submission_gp.parquet")
probability_train.to_parquet(model_pred_train_dir + "train_prob_gp.parquet")
probability_test.to_parquet(model_pred_test_dir + "test_prob_gp.parquet")