In [2]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from i import xgb_params_search, cat_params_search, lgb_params_search, hgb_params_search
from i import Features, columnTransformerSelector
from i import model_dir, data_dir, params_dir, model_pred_train_dir, model_pred_test_dir

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# Silence every Python warning for the rest of the session so that library
# notices do not clutter the cell outputs.
import warnings

warnings.simplefilter("ignore")

# Default seaborn colour palette for the plots in this notebook.
sns.set_palette("icefire")

# Shared 5-fold stratified splitter. Shuffling with a fixed seed means every
# call that receives `cv` sees the same fold assignment.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Read the raw tables; the first CSV column becomes the row index.
train = pd.read_csv(data_dir + "train.csv", index_col=0)
test = pd.read_csv(data_dir + "test.csv", index_col=0)

# Convert text columns to pandas categoricals. Test columns reuse the dtype
# built from train so both frames share the same category levels and hence
# the same integer codes. Casting each frame independently would produce
# different codes whenever the two level sets differ. Levels that occur only
# in test become NaN rather than silently shifting the encoding.
for col in train.select_dtypes(include="object").columns:
    train[col] = train[col].astype("category")
    if col in test.columns:
        test[col] = test[col].astype(train[col].dtype)
# Text columns that exist only in test are still made categorical.
for col in test.select_dtypes(include="object").columns:
    test[col] = test[col].astype("category")

# Positional split: every column except the last goes to X, the last one to y.
# NOTE(review): this assumes the label is the final column of train.csv - confirm.
X = train.iloc[:, :-1]
y = train.iloc[:, -1]
X_pred = test

# Fit the project feature builder on the training rows, then reuse the same
# fitted instance on the rows to be predicted.
featureTransformer = Features()
FeatureFrame = featureTransformer.fit_transform(X, y)
TestFeaFrame = featureTransformer.transform(X_pred)

# Column names split by dtype; `catFeatures` is passed to CatBoost further down.
catFeatures = list(FeatureFrame.select_dtypes(include="category").columns)
numFeatures = list(FeatureFrame.select_dtypes(exclude="category").columns)

# Cache the engineered tables (training features together with the label).
train_with_label = pd.concat([FeatureFrame, y], axis=1)
train_with_label.to_parquet(data_dir + "train_feature.parquet")
TestFeaFrame.to_parquet(data_dir + "test_feature.parquet")

FeatureFrame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 593994 entries, 0 to 593993
Data columns (total 18 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   annual_income               593994 non-null  float64 
 1   debt_to_income_ratio        593994 non-null  float64 
 2   credit_score                593994 non-null  int64   
 3   loan_amount                 593994 non-null  float64 
 4   interest_rate               593994 non-null  float64 
 5   employment_status           593994 non-null  category
 6   grade_subgrade              593994 non-null  category
 7   annual_incomeXcredit_score  593994 non-null  float64 
 8   loan_to_income              593994 non-null  float64 
 9   interest_burden             593994 non-null  float64 
 10  log_income                  593994 non-null  float64 
 11  log_loan_amount             593994 non-null  float64 
 12  debt_to_income_ratio_diff   593994 non-null  float64 
 13  debt

# Load and Overview


In [None]:
# Total number of missing cells in each frame, followed by a peek at the data.
n_missing_train = train.isnull().sum().sum()
n_missing_test = test.isnull().sum().sum()
print("Missing Train: ", n_missing_train, " - Missing Test: ", n_missing_test, end="\n\n")
train.head(3)

In [None]:
# Summary statistics for every column of the training frame (include="all").
train.describe(include="all")

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

# Distribution

In [None]:
# One count plot per categorical column, plus one for `loan_paid_back`.
count_columns = train.select_dtypes(include="category").columns.to_list()
count_columns.append("loan_paid_back")

for column_name in count_columns:
    sns.countplot(data=train, x=column_name, palette="icefire")
    plt.xlabel(column_name)
    plt.ylabel("Count")
    # Slanted tick labels keep longer category names readable.
    plt.xticks(rotation=45, ha="right")
    plt.show()

In [None]:
# XGBoost pipeline: project feature builder followed by a GPU gradient-boosted
# classifier that consumes pandas categoricals natively.
xgb = Pipeline([
    ("feature", Features()),
    ("xgb", XGBClassifier(
        random_state=42,
        n_estimators=1500,
        learning_rate=0.03,
        enable_categorical=True,
        # XGBoost's logging switch is `verbosity` (0 = silent). The previous
        # `verbose=False` is not a constructor argument and was forwarded to
        # the booster as an unused parameter.
        verbosity=0,
        device="cuda",
        # `n_jobs` is the thread-count argument; the misspelt `n_job` was
        # ignored, so the thread limit never took effect.
        n_jobs=7,
    )),
])

# CatBoost pipeline: project feature builder followed by a GPU CatBoost
# classifier. The categorical column names come from `catFeatures`, which is
# computed from the engineered training frame in the setup cell.
cat = Pipeline(
    steps=[
        ("feature", Features()),
        (
            "cat",
            CatBoostClassifier(
                n_estimators=1500,
                learning_rate=0.03,
                random_state=42,
                cat_features=catFeatures,
                task_type="GPU",
                thread_count=7,
                verbose=False,
            ),
        ),
    ]
)

# LightGBM pipeline: project feature builder followed by a GPU LightGBM
# classifier with logging disabled (verbose=-1).
lgb = Pipeline([
    ("feature", Features()),
    ("lgb", LGBMClassifier(
        random_state=42,
        n_estimators=1500,
        learning_rate=0.03,
        verbose=-1,
        device="gpu",
        # `n_jobs` is the scikit-learn style thread-count argument. The
        # misspelt `n_job` is not a recognised LightGBM alias and was ignored.
        n_jobs=7,
    )),
])

# HistGradientBoosting pipeline: project feature builder, then the project's
# column transformer, then scikit-learn's histogram gradient-boosted classifier.
hgb = Pipeline(
    steps=[
        ("feature", Features()),
        ("transformer", columnTransformerSelector),
        (
            "hgb",
            HistGradientBoostingClassifier(
                max_iter=500,
                learning_rate=0.03,
                random_state=42,
                verbose=0,
            ),
        ),
    ]
)

# Randomised hyper-parameter search for the XGBoost pipeline, scored by ROC AUC
# on the shared stratified folds.
# NOTE(review): seven parallel workers each train on device="cuda"; confirm the
# GPU has enough memory for that level of concurrency.
xgb_search = RandomizedSearchCV(
    xgb,
    param_distributions=xgb_params_search,
    n_iter=25,
    scoring="roc_auc",
    cv=cv,
    verbose=1,
    n_jobs=7,
    # Seed the candidate sampling so the 25 drawn configurations (and hence
    # the selected best_params_) are the same on every run.
    random_state=42,
).fit(X, y)

# Randomised hyper-parameter search for the CatBoost pipeline, scored by ROC AUC
# on the shared stratified folds. It runs sequentially (no n_jobs).
# NOTE(review): verbose=7 with no n_jobs differs from the other three searches
# (verbose=1, n_jobs=7) - confirm this is intentional and not a swapped value.
cat_search = RandomizedSearchCV(
    cat,
    param_distributions=cat_params_search,
    n_iter=25,
    scoring="roc_auc",
    cv=cv,
    verbose=7,
    # Seed the candidate sampling so the 25 drawn configurations (and hence
    # the selected best_params_) are the same on every run.
    random_state=42,
).fit(X, y)

# Randomised hyper-parameter search for the LightGBM pipeline, scored by ROC AUC
# on the shared stratified folds.
# NOTE(review): seven parallel workers each train on device="gpu"; confirm the
# GPU has enough memory for that level of concurrency.
lgb_search = RandomizedSearchCV(
    lgb,
    param_distributions=lgb_params_search,
    n_iter=25,
    scoring="roc_auc",
    cv=cv,
    verbose=1,
    n_jobs=7,
    # Seed the candidate sampling so the 25 drawn configurations (and hence
    # the selected best_params_) are the same on every run.
    random_state=42,
).fit(X, y)

# Randomised hyper-parameter search for the HistGradientBoosting pipeline,
# scored by ROC AUC on the shared stratified folds.
hgb_search = RandomizedSearchCV(
    hgb,
    param_distributions=hgb_params_search,
    n_iter=25,
    scoring="roc_auc",
    cv=cv,
    verbose=1,
    n_jobs=7,
    # Seed the candidate sampling so the 25 drawn configurations (and hence
    # the selected best_params_) are the same on every run.
    random_state=42,
).fit(X, y)

# Persist the best hyper-parameters of every search and apply them to the
# matching pipeline.
#
# The files were previously written as "xg_/ca_/lg_/hg_params.csv" but read
# back as "xgb_/cat_/lgb_/hgb_params.csv", so the reload either failed or
# picked up stale files from an earlier run. One filename pattern is now used
# (the one that was being read).
#
# The parameters are applied straight from `best_params_` rather than from
# the CSV: a Series holding mixed int/float/str values does not survive a CSV
# round trip with its original types (integers come back as floats or strings).
_tuned = {
    "xgb": (xgb, xgb_search),
    "cat": (cat, cat_search),
    "lgb": (lgb, lgb_search),
    "hgb": (hgb, hgb_search),
}
for _tag, (_pipeline, _search) in _tuned.items():
    pd.Series(_search.best_params_).to_csv(params_dir + f"{_tag}_params.csv")
    _pipeline.set_params(**_search.best_params_)


In [None]:

# Fit all four pipelines on the full training set first, then serialise each
# fitted pipeline to the model directory.
_final_models = {"xgb": xgb, "cat": cat, "lgb": lgb, "hgb": hgb}

for _pipeline in _final_models.values():
    _pipeline.fit(X, y)

for _tag, _pipeline in _final_models.items():
    joblib.dump(_pipeline, model_dir + f"{_tag}.joblib")

# Cross-validated ROC AUC for each pipeline on the shared folds; the per-fold
# scores and their mean are reported as soon as each model finishes.
cv_scores = {}
for _tag, _model in (("xgb", xgb), ("cat", cat), ("lgb", lgb), ("hgb", hgb)):
    _fold_auc = cross_val_score(_model, X, y, scoring="roc_auc", cv=cv)
    print(f"{_tag} scores across folds: {_fold_auc} - Mean Score: {_fold_auc.mean():0.4f}")
    cv_scores[_tag] = _fold_auc

# Keep the individual names that the following code relies on.
score_xgb = cv_scores["xgb"]
score_cat = cv_scores["cat"]
score_lgb = cv_scores["lgb"]
score_hgb = cv_scores["hgb"]


# Per-fold AUC averaged across the four models (one value per fold).
_fold_matrix = np.array([score_xgb, score_cat, score_lgb, score_hgb])
print(f"Overall Performance: {_fold_matrix.mean(axis=0)}")

# Blend weights: each model's mean AUC divided by the sum of all mean AUCs,
# so the four weights add up to one.
_mean_auc = [fold_scores.mean() for fold_scores in (score_xgb, score_cat, score_lgb, score_hgb)]
total = sum(_mean_auc)
xgb_weight, cat_weight, lgb_weight, hgb_weight = (mean_score / total for mean_score in _mean_auc)
print(xgb_weight, cat_weight, lgb_weight, hgb_weight)

# Feature importances of the three fitted boosters (xgb, cat, lgb), each
# rescaled so that its strongest feature equals 1. Rows are labelled with the
# engineered feature names from `FeatureFrame`.
_boosters = {"xgb": xgb, "cat": cat, "lgb": lgb}
_raw_importance = {
    tag: pipeline.named_steps[tag].feature_importances_
    for tag, pipeline in _boosters.items()
}
importantFeatures = pd.DataFrame(
    data={tag: values / values.max() for tag, values in _raw_importance.items()},
    index=FeatureFrame.columns,
)

# Average across models, rank by that average, and express as percentages.
importantFeatures = (
    importantFeatures
    .assign(Overall=lambda frame: frame.mean(axis=1))
    .sort_values("Overall", ascending=False)
    .mul(100)
    .round(2)
)
importantFeatures

In [None]:
# Out-of-fold positive-class probabilities for every training row, one column
# per model, produced with the shared `cv` splitter.
_oof_models = {"xgb": xgb, "cat": cat, "lgb": lgb, "hgb": hgb}
propability_train = pd.DataFrame(
    data={
        tag: cross_val_predict(model, X, y, cv=cv, method="predict_proba")[:, 1]
        for tag, model in _oof_models.items()
    },
    index=FeatureFrame.index,
)

# Positive-class probabilities for the rows to be predicted, one column per
# model, indexed like `X_pred`.
_test_models = {"xgb": xgb, "cat": cat, "lgb": lgb, "hgb": hgb}
propability_test = pd.DataFrame(
    data={
        tag: model.predict_proba(X_pred)[:, 1]
        for tag, model in _test_models.items()
    },
    index=X_pred.index,
)

# Write both probability tables (one column per model) to parquet files in
# their respective prediction directories.
propability_train.to_parquet(model_pred_train_dir + "train_prob_tree.parquet")
propability_test.to_parquet(model_pred_test_dir + "test_prob_tree.parquet")