In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import shap

In [2]:
app_train = pd.read_csv('../data/raw/application_tr.csv')
app_test = pd.read_csv('../data/raw/application_ts.csv')
bureau = pd.read_csv('../data/raw/bureau.csv')
bureau_balance = pd.read_csv('../data/raw/bureau_balance.csv')
previous = pd.read_csv('../data/raw/previous_application.csv')
pos_cash = pd.read_csv('../data/raw/POS_CASH_balance.csv')
credit_card = pd.read_csv('../data/raw/credit_card_balance.csv')
installments = pd.read_csv('../data/raw/installments_payments.csv')

In [3]:
bureau_full = bureau.merge(
    bureau_balance,
    on="SK_ID_BUREAU",
    how="left")


In [4]:
bureau_features = (
    bureau_full
    .groupby("SK_ID_CURR")
    .agg(
        bureau_loans=("SK_ID_BUREAU", "nunique"),
        bureau_days_credit_mean=("DAYS_CREDIT", "mean"),
        bureau_days_credit_min=("DAYS_CREDIT", "min"),
        bureau_credit_sum=("AMT_CREDIT_SUM", "sum"),
        bureau_credit_active=("CREDIT_ACTIVE", lambda x: (x == "Active").sum()),
        bureau_months_reported=("MONTHS_BALANCE", "count")
    )
    .reset_index()
)

bureau_features.to_parquet("../data/interim/bureau_features.parquet")


In [5]:

prev_features = (
    previous
    .groupby("SK_ID_CURR")
    .agg(
        prev_apps=("SK_ID_PREV", "nunique"),
        prev_amt_mean=("AMT_APPLICATION", "mean"),
        prev_amt_max=("AMT_APPLICATION", "max"),
        prev_refused=("NAME_CONTRACT_STATUS", lambda x: (x == "Refused").sum()),
        prev_approved=("NAME_CONTRACT_STATUS", lambda x: (x == "Approved").sum()),
        prev_days_decision_mean=("DAYS_DECISION", "mean")
    )
    .reset_index()
)

prev_features.to_parquet("../data/interim/prev_features.parquet")


In [6]:

installments["late"] = (
    installments["DAYS_ENTRY_PAYMENT"] > installments["DAYS_INSTALMENT"]
)

inst_features = (
    installments
    .groupby("SK_ID_CURR")
    .agg(
        inst_count=("SK_ID_PREV", "count"),
        inst_late_ratio=("late", "mean"),
        inst_payment_sum=("AMT_PAYMENT", "sum"),
        inst_instalment_sum=("AMT_INSTALMENT", "sum"),
        inst_payment_ratio=("AMT_PAYMENT", lambda x: x.sum())
    )
    .reset_index()
)

inst_features.to_parquet("../data/interim/installments_features.parquet")


In [7]:

pos_features = (
    pos_cash
    .groupby("SK_ID_CURR")
    .agg(
        pos_loans=("SK_ID_PREV", "nunique"),
        pos_months=("MONTHS_BALANCE", "count"),
        pos_dpd_mean=("SK_DPD", "mean"),
        pos_dpd_def_mean=("SK_DPD_DEF", "mean")
    )
    .reset_index()
)

pos_features.to_parquet("../data/interim/pos_features.parquet")


In [8]:

cc_features = (
    credit_card
    .groupby("SK_ID_CURR")
    .agg(
        cc_loans=("SK_ID_PREV", "nunique"),
        cc_balance_mean=("AMT_BALANCE", "mean"),
        cc_limit_mean=("AMT_CREDIT_LIMIT_ACTUAL", "mean"),
        cc_utilization=("AMT_BALANCE", "mean")
    )
    .reset_index()
)

cc_features.to_parquet("../data/interim/cc_features.parquet")


In [9]:
df_train = (
    app_train
    .merge(bureau_features, on="SK_ID_CURR", how="left")
    .merge(prev_features, on="SK_ID_CURR", how="left")
    .merge(inst_features, on="SK_ID_CURR", how="left")
    .merge(pos_features, on="SK_ID_CURR", how="left")
    .merge(cc_features, on="SK_ID_CURR", how="left")
)

df_train.to_parquet("../data/interim/train_final.parquet")


In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 147 entries, SK_ID_CURR to cc_utilization
dtypes: float64(90), int64(41), object(16)
memory usage: 344.9+ MB


In [11]:
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,inst_instalment_sum,inst_payment_ratio,pos_loans,pos_months,pos_dpd_mean,pos_dpd_def_mean,cc_loans,cc_balance_mean,cc_limit_mean,cc_utilization
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,219625.695,219625.695,1.0,19.0,0.0,0.0,,,,
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,1618864.65,1618864.65,3.0,28.0,0.0,0.0,,,,
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,21288.465,21288.465,1.0,4.0,0.0,0.0,,,,
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,1007153.415,1007153.415,3.0,21.0,0.0,0.0,1.0,0.0,270000.0,0.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,835985.34,806127.975,5.0,66.0,0.0,0.0,,,,


In [12]:
TARGET = "TARGET"
y = df_train[TARGET]
X = df_train.drop(columns=[TARGET])

In [13]:
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

print(f"Categóricas: {len(cat_cols)}")
print(f"Numéricas: {len(num_cols)}")

Categóricas: 16
Numéricas: 130


In [14]:
num_imputer = SimpleImputer(strategy="mean") #change median for mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

In [15]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [16]:
print(X.isna().sum().sum())  # must be 0
print(X.shape)

0
(307511, 260)


In [189]:
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42)

In [99]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_val = pd.DataFrame(
    scaler.transform(X_val),
    columns=X_val.columns,
    index=X_val.index
)


In [190]:
'''
This library needs sklearn 1.3.2
and imblearn 0.11.0
'''

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline


# Combine over and under sampling
over = SMOTE(sampling_strategy=0.3)
under = RandomUnderSampler(sampling_strategy=0.7)
steps = [('over', over), ('under', under)]
pipeline = ImbPipeline(steps=steps)
X_train, y_train = pipeline.fit_resample(X_train, y_train)

In [191]:
X_train.shape

(164764, 260)

In [164]:

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

'''
#F-score (lineal)
selector_f = SelectKBest(score_func=f_classif, k=150)
X_train_f = selector_f.fit_transform(X_train, y_train)
'''

#Mutual Information (no lineal)
selector_mi = SelectKBest(score_func=mutual_info_classif, k=150)
X_train_mi = selector_mi.fit_transform(X_train, y_train)

#Intersection
#features_f = X_train.columns[selector_f.get_support()].tolist()
features_mi = X_train.columns[selector_mi.get_support()].tolist()
#features_common = list(set(features_f) & set(features_mi))


X_train = X_train[features_mi]
X_val = X_val[features_mi]

print(f"Selected features: {len(features_mi)}")

Selected features: 150


In [165]:
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()

scale = neg / pos

In [182]:

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',

    # core
    'learning_rate': 0.03,
    'n_estimators': 5500,
    'max_depth': 6,
    'min_child_weight': 20,

    # sampling
    'subsample': 0.8,
    'colsample_bytree': 0.8,

    # imbalance
    'scale_pos_weight': 1,

    # regularization
    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'gamma': 1.0,
}


xgb_model = XGBClassifier(**params, n_jobs=-1, random_state=42,  early_stopping_rounds=300)

In [183]:
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True)


[0]	validation_0-auc:0.62459
[1]	validation_0-auc:0.67602
[2]	validation_0-auc:0.68206
[3]	validation_0-auc:0.68703
[4]	validation_0-auc:0.68993
[5]	validation_0-auc:0.69405
[6]	validation_0-auc:0.69456
[7]	validation_0-auc:0.70023
[8]	validation_0-auc:0.70373
[9]	validation_0-auc:0.70418
[10]	validation_0-auc:0.70551
[11]	validation_0-auc:0.70572
[12]	validation_0-auc:0.70738
[13]	validation_0-auc:0.70774
[14]	validation_0-auc:0.70810
[15]	validation_0-auc:0.70922
[16]	validation_0-auc:0.71058
[17]	validation_0-auc:0.71184
[18]	validation_0-auc:0.71236
[19]	validation_0-auc:0.71252
[20]	validation_0-auc:0.71273
[21]	validation_0-auc:0.71372
[22]	validation_0-auc:0.71410
[23]	validation_0-auc:0.71499
[24]	validation_0-auc:0.71464
[25]	validation_0-auc:0.71613
[26]	validation_0-auc:0.71647
[27]	validation_0-auc:0.71755
[28]	validation_0-auc:0.71794
[29]	validation_0-auc:0.71894
[30]	validation_0-auc:0.71970
[31]	validation_0-auc:0.72034
[32]	validation_0-auc:0.72097
[33]	validation_0-au

In [112]:
'''
#search optim threshold
from sklearn.metrics import roc_curve
import numpy as np

y_val_proba = xgb_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)

# finds threshold max (TPR - FPR)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.3f}")
'''

'\n#search optim threshold\nfrom sklearn.metrics import roc_curve\nimport numpy as np\n\ny_val_proba = xgb_model.predict_proba(X_val)[:, 1]\nfpr, tpr, thresholds = roc_curve(y_val, y_val_proba)\n\n# finds threshold max (TPR - FPR)\noptimal_idx = np.argmax(tpr - fpr)\noptimal_threshold = thresholds[optimal_idx]\nprint(f"Optimal Threshold: {optimal_threshold:.3f}")\n'

In [184]:
#optimal f1 threshold
from sklearn.metrics import f1_score
import numpy as np

y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

thresholds = np.linspace(0.05, 0.6, 100)
f1_scores = [f1_score(y_val, y_val_proba >= t) for t in thresholds]

optimal_threshold = thresholds[np.argmax(f1_scores)]

print(f"Optimal Threshold (F1): {optimal_threshold:.3f}")


Optimal Threshold (F1): 0.339


In [185]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve

y_val_pred = xgb_model.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_val_pred)

adj_threshold = optimal_threshold + 0.02

y_val_pred = (y_val_pred >= adj_threshold).astype(int)

classification_report = classification_report(y_val, y_val_pred)

print(f"AUC validación: {auc:.5f}")
print(classification_report)

AUC validación: 0.77639
              precision    recall  f1-score   support

           0       0.94      0.92      0.93     56538
           1       0.30      0.37      0.33      4965

    accuracy                           0.88     61503
   macro avg       0.62      0.65      0.63     61503
weighted avg       0.89      0.88      0.88     61503

