In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-credit-default-risk/sample_submission.csv
/kaggle/input/home-credit-default-risk/bureau_balance.csv
/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv
/kaggle/input/home-credit-default-risk/application_train.csv
/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv
/kaggle/input/home-credit-default-risk/application_test.csv
/kaggle/input/home-credit-default-risk/previous_application.csv
/kaggle/input/home-credit-default-risk/credit_card_balance.csv
/kaggle/input/home-credit-default-risk/installments_payments.csv
/kaggle/input/home-credit-default-risk/bureau.csv


环境配置

In [2]:
pip install pandas matplotlib numpy scikit-learn seaborn

Note: you may need to restart the kernel to use updated packages.


步骤 1：数据加载与基础检查

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# 加载数据（替换为你的数据集路径）
train = pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv')

# 关键检查：目标变量分布（确保1是少数类）
print("违约样本比例：")
print(train['TARGET'].value_counts(normalize=True))  # 正常应显示1占比5%-10%

# 分离特征和目标变量
X = train.drop('TARGET', axis=1)
y = train['TARGET']

违约样本比例：
TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64


步骤 2：数据预处理（修复关键问题）

In [4]:
# 1. 缺失值处理（按类型分别处理）
# 数值型特征：用中位数填充（抗异常值）
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# 类别型特征：用众数填充
cat_cols = X.select_dtypes(include=['object']).columns
X[cat_cols] = X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

# 2. 异常值处理（截断极端值）
def cap_outliers(df, col):
    """用1%和99%分位数截断异常值"""
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(lower, upper)
    return df

for col in num_cols:
    X = cap_outliers(X, col)

# 3. 类别特征编码（LabelEncoder）
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

步骤 3：添加风控核心衍生特征（关键提升点）

In [5]:
def add_risk_features(df):
    # 1. 还款能力：信贷金额/收入（越高风险越大）
    df['CREDIT_INCOME_RATIO'] = df['AMT_CREDIT'] / (df['AMT_INCOME_TOTAL'] + 1e-5)
    
    # 2. 负债压力：年金/收入（月供占收入比例）
    df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / (df['AMT_INCOME_TOTAL'] + 1e-5)
    
    # 3. 信用历史长度：工作天数/年龄（负数转为正数）
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)  # 处理异常值
    df['EMPLOYED_AGE_RATIO'] = (-df['DAYS_EMPLOYED']) / (-df['DAYS_BIRTH'])  # 转为正数比例
    
    # 4. 家庭负担：孩子数量/家庭人数
    df['CHILDREN_RATIO'] = df['CNT_CHILDREN'] / (df['CNT_FAM_MEMBERS'] + 1)
    
    # 5. 资产稳定性：有车且车龄合理（车龄>0且<30年）
    df['VALID_CAR_AGE'] = ((df['OWN_CAR_AGE'] > 0) & (df['OWN_CAR_AGE'] < 30)).astype(int)
    
    return df

# 应用特征衍生
X = add_risk_features(X)

  df['CREDIT_INCOME_RATIO'] = df['AMT_CREDIT'] / (df['AMT_INCOME_TOTAL'] + 1e-5)
  df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / (df['AMT_INCOME_TOTAL'] + 1e-5)
  df['EMPLOYED_AGE_RATIO'] = (-df['DAYS_EMPLOYED']) / (-df['DAYS_BIRTH'])  # 转为正数比例
  df['CHILDREN_RATIO'] = df['CNT_CHILDREN'] / (df['CNT_FAM_MEMBERS'] + 1)
  df['VALID_CAR_AGE'] = ((df['OWN_CAR_AGE'] > 0) & (df['OWN_CAR_AGE'] < 30)).astype(int)


步骤 4：特征筛选（保留高预测力特征）

In [6]:
def calculate_iv(df, feature, target):
    """计算特征的信息价值（IV）"""
    binned = pd.qcut(df[feature], 10, duplicates='drop')  # 分10箱
    grouped = df.groupby(binned)[target].agg(['count', 'sum'])
    grouped['bad'] = grouped['sum']
    grouped['good'] = grouped['count'] - grouped['sum']
    grouped['bad_rate'] = grouped['bad'] / grouped['bad'].sum()
    grouped['good_rate'] = grouped['good'] / grouped['good'].sum()
    grouped['iv'] = (grouped['bad_rate'] - grouped['good_rate']) * np.log((grouped['bad_rate'] + 1e-10) / (grouped['good_rate'] + 1e-10))
    return grouped['iv'].sum()
# 重新定义特征筛选函数（保留衍生特征）
def filter_features_with_derived(df, target, derived_features, min_iv=0.02):
    # 计算所有特征的IV值
    iv_values = {}
    for col in df.columns:
        iv = calculate_iv(pd.concat([df[col], target], axis=1), col, 'TARGET')
        iv_values[col] = iv
    
    # 筛选IV>0.02的特征 + 强制保留衍生特征
    valid_features = [col for col, iv in iv_values.items() if iv > min_iv]
    valid_features = list(set(valid_features + derived_features))  # 合并并去重
    return valid_features, iv_values

# 明确指定衍生特征列表（与add_risk_features中定义的一致）
derived_features = [
    'CREDIT_INCOME_RATIO', 
    'ANNUITY_INCOME_RATIO', 
    'EMPLOYED_AGE_RATIO',
    'CHILDREN_RATIO',
    'VALID_CAR_AGE'
]

# 筛选特征（强制保留衍生特征）
valid_features, iv_values = filter_features_with_derived(X, y, derived_features)
X = X[valid_features]
print(f"筛选后有效特征数：{len(valid_features)}")
print(f"保留的衍生特征：{[f for f in derived_features if f in valid_features]}")  # 确认CREDIT_INCOME_RATIO在列

  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(b

筛选后有效特征数：40
保留的衍生特征：['CREDIT_INCOME_RATIO', 'ANNUITY_INCOME_RATIO', 'EMPLOYED_AGE_RATIO', 'CHILDREN_RATIO', 'VALID_CAR_AGE']


步骤 5：处理样本不平衡 + 训练模型（XGBoost）

In [7]:
# 划分训练集和测试集（分层抽样，保持违约比例）
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 计算正负样本比例（用于平衡权重）
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()  # 正常样本数/违约样本数

# 训练XGBoost（参数针对风控场景优化）
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # 二分类
    scale_pos_weight=scale_pos_weight,  # 平衡正负样本
    learning_rate=0.1,
    max_depth=4,  # 控制复杂度，避免过拟合
    n_estimators=200,  # 树的数量
    subsample=0.8,  # 样本采样，增加随机性
    colsample_bytree=0.8,  # 特征采样
    random_state=42
)

xgb_model.fit(X_train, y_train)

# 评估AUC
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"优化后测试集AUC：{auc:.4f}")  # 正常会达到0.75-0.85

优化后测试集AUC：0.7551


In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, confusion_matrix

# 1. 新ROC曲线（优化后模型）
plt.figure(figsize=(8, 6))
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'XGBoost (AUC = {auc:.4f})', color='darkorange')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Optimized Model)')
plt.legend()
plt.savefig('/kaggle/working/optimized_roc_curve.png')
plt.close()

# 2. 特征重要性图（展示风控关键特征）
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(10))
plt.title('Top 10 Important Features (Risk Factors)')
plt.tight_layout()  # 避免标签截断
plt.savefig('/kaggle/working/feature_importance.png')
plt.close()

# 3. 违约率与关键特征的关系（以信贷收入比为例）
plt.figure(figsize=(10, 6))
# 分箱展示违约率
X_test_with_target = X_test.copy()
X_test_with_target['TARGET'] = y_test
X_test_with_target['CREDIT_INCOME_BIN'] = pd.qcut(
    X_test_with_target['CREDIT_INCOME_RATIO'], 5, duplicates='drop'
)
bin_default_rate = X_test_with_target.groupby('CREDIT_INCOME_BIN')['TARGET'].mean()
bin_default_rate.plot(kind='bar')
plt.title('Default Rate by Credit/Income Ratio Bins')
plt.ylabel('Default Rate')
plt.tight_layout()
plt.savefig('/kaggle/working/default_rate_by_credit_income.png')
plt.close()

# 确认图片已生成
!ls /kaggle/working/  # 应显示3张新图片

  bin_default_rate = X_test_with_target.groupby('CREDIT_INCOME_BIN')['TARGET'].mean()


default_rate_by_credit_income.png  __notebook__.ipynb
feature_importance.png		   optimized_roc_curve.png
