In [116]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from feature_engine.encoding import WoEEncoder
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.pipeline import Pipeline
from category_encoders.woe import WOEEncoder
import numpy as np

In [None]:
# Load data
df = pd.read_csv("./hcdr_application_train.csv")

In [118]:
def fill_nan(df):
    for col in df.columns:
        if df[col].isna().any():
            if pd.api.types.is_numeric_dtype(df[col]):
                # 数值列用中位数填充
                median = df[col].median()
                df[col] = df[col].fillna(median)
            else:
                # 类别列用'unknown'填充
                df[col] = df[col].fillna('unknown')
    return df

In [119]:
df_clean = fill_nan(df)

In [120]:
df_clean

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
y = df_clean["TARGET"]
X = df_clean.drop(columns=["TARGET", "SK_ID_CURR"])
#X = X.drop(columns=['CODE_GENDER', 'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS'])

In [122]:
print(X.dtypes)


NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
CNT_CHILDREN                    int64
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 120, dtype: object


In [123]:
# Variable split
'''
categorical_vars = X.select_dtypes(include=["object", "int64"]).columns.tolist()
numerical_vars = X.select_dtypes(include="float64").columns.tolist()

print("---Categorical Features---")
print(categorical_vars)
print(len(categorical_vars))

print("---Numerial Features---")
print(numerical_vars)
print(len(numerical_vars))
'''

categorical_vars = X.select_dtypes(include="object").columns.tolist()
# 加入部分二值 int 列
categorical_vars += [c for c in X.select_dtypes(include="int64").columns if X[c].nunique() < 10]
categorical_vars = list(set(categorical_vars))

numerical_vars = [c for c in X.columns if c not in categorical_vars]
X[categorical_vars] = X[categorical_vars].astype("category")

single_vars = [c for c in categorical_vars if X[c].nunique() <= 1]
print("! eq: dropping single value vars:", single_vars)
X = X.drop(columns=single_vars)


! eq: dropping single value vars: []


In [124]:
print(len(categorical_vars))
print(len(numerical_vars))

50
70


In [125]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [126]:
# WoE pipeline
pipe_woe = Pipeline([
    ("cat_imputer", CategoricalImputer(imputation_method='missing', variables=categorical_vars)),
    ("num_imputer", MeanMedianImputer(imputation_method='median', variables=numerical_vars)),
    ("discretiser", EqualFrequencyDiscretiser(q=5, variables=numerical_vars, return_object=True)),
    # ("woe_encoder", WoEEncoder(variables=categorical_vars + numerical_vars, ignore_format=True)),
    ("woe_encoder", WOEEncoder(cols=categorical_vars + numerical_vars, regularization=0.1)),
    ("model", LogisticRegression(max_iter=10, solver='saga', penalty='l2', class_weight='balanced', random_state=0))
])

In [127]:
pipe_woe.fit(X_train, y_train)
y_pred_woe = pipe_woe.predict(X_test)
y_prob_woe = pipe_woe.predict_proba(X_test)[:, 1]

print("=== With WoE Encoding ===")
print(classification_report(y_test, y_pred_woe))
print(f"ROC AUC: {roc_auc_score(y_test, y_prob_woe):.4f}")



=== With WoE Encoding ===
              precision    recall  f1-score   support

           0       0.96      0.66      0.79     84806
           1       0.15      0.68      0.25      7448

    accuracy                           0.67     92254
   macro avg       0.56      0.67      0.52     92254
weighted avg       0.89      0.67      0.74     92254

ROC AUC: 0.7370


In [128]:
# without WoE, using one-hot for categorical features
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_cat = pd.DataFrame(ohe.fit_transform(X[categorical_vars]), 
                     columns=ohe.get_feature_names_out(categorical_vars),
                     index=X.index)

# 保留数值变量
X_num = X[numerical_vars]

# 拼接完整特征
X_full = pd.concat([X_num, X_cat], axis=1)

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size=0.3, random_state=42, stratify=y)

In [130]:
X_train

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_unknown,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y
159703,2,405000.0,1971072.0,68643.0,1800000.0,0.006852,-13587,-1028,-7460.0,-1823,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
79269,0,337500.0,508495.5,38146.5,454500.0,0.010276,-17543,-1208,-4054.0,-1090,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
232615,1,112500.0,110146.5,13068.0,90000.0,0.005084,-11557,-593,-5554.0,-4130,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
33420,2,40500.0,66384.0,3519.0,45000.0,0.031329,-15750,-5376,-5285.0,-5290,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
88191,0,225000.0,298512.0,31801.5,270000.0,0.019101,-19912,-1195,-86.0,-3033,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170183,1,157500.0,846517.5,33700.5,684000.0,0.035792,-14742,-7799,-5732.0,-4088,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
31304,1,135000.0,405000.0,20250.0,405000.0,0.035792,-15374,-595,-6831.0,-4420,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
121193,0,157500.0,272520.0,21528.0,225000.0,0.018801,-19035,-4334,-8490.0,-2561,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
248504,0,90000.0,246357.0,24493.5,234000.0,0.025164,-23088,365243,-8975.0,-4636,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [None]:
lr = LogisticRegression(max_iter=50, solver='saga', penalty='l2', class_weight='balanced', random_state=0)
lr.fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.61      0.74     84806
           1       0.11      0.53      0.18      7448

    accuracy                           0.60     92254
   macro avg       0.52      0.57      0.46     92254
weighted avg       0.87      0.60      0.69     92254

ROC AUC Score: 0.590301731305972


In [None]:
# using lightGBM
from lightgbm import LGBMClassifier

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

cat_features = X.select_dtypes(include=["object"]).columns.tolist()

for col in cat_features:
    X_train[col] = X_train[col].astype("category")
    X_val[col] = X_val[col].astype("category")

model = LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    learning_rate=0.05,
    n_estimators=100,
    max_depth=5,          # 树的最大深度
    num_leaves=31, 
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

# 训练模型，支持 early stopping
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    categorical_feature=cat_features
)


In [None]:
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]

print("Classification Report:")
print(classification_report(y_val, y_pred))
print("ROC AUC Score:", roc_auc_score(y_val, y_prob))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84806
           1       0.78      0.01      0.01      7448

    accuracy                           0.92     92254
   macro avg       0.85      0.50      0.49     92254
weighted avg       0.91      0.92      0.88     92254

ROC AUC Score: 0.7518204253086079
