In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-credit-default-risk/sample_submission.csv
/kaggle/input/home-credit-default-risk/bureau_balance.csv
/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv
/kaggle/input/home-credit-default-risk/application_train.csv
/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv
/kaggle/input/home-credit-default-risk/application_test.csv
/kaggle/input/home-credit-default-risk/previous_application.csv
/kaggle/input/home-credit-default-risk/credit_card_balance.csv
/kaggle/input/home-credit-default-risk/installments_payments.csv
/kaggle/input/home-credit-default-risk/bureau.csv


环境配置

In [2]:
pip install pandas matplotlib numpy scikit-learn seaborn

Note: you may need to restart the kernel to use updated packages.


数据加载与初步清洗（Pandas 核心操作）

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 加载数据（以训练集为例）
train = pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv')  # 主表：借款人基本信息
print(f"数据集形状：{train.shape}")  # 查看行数和列数
print(train['TARGET'].value_counts(normalize=True))  # 查看违约比例（0=未违约，1=违约）

# 缺失值处理（金融数据常见操作）
missing_ratio = train.isnull().mean().sort_values(ascending=False)
# 可视化缺失值前20列
plt.figure(figsize=(12, 8))
missing_ratio[:20].plot(kind='barh')
plt.title('Missing Value Ratio (Top 20 Columns)')
plt.xlabel('Ratio')
plt.savefig('missing_values.png')  # 保存图片用于GitHub
plt.close()

# 处理策略：数值型用中位数填充，类别型用众数填充
num_cols = train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = train.select_dtypes(include=['object']).columns

train[num_cols] = train[num_cols].fillna(train[num_cols].median())
train[cat_cols] = train[cat_cols].fillna(train[cat_cols].mode().iloc[0])

数据集形状：(307511, 122)
TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64


探索性数据分析（EDA）与可视化

In [4]:
# 1. 目标变量分布（查看样本是否不平衡）
plt.figure(figsize=(6, 4))
sns.countplot(x='TARGET', data=train)
plt.title('Distribution of Default (1=Default)')
plt.savefig('target_distribution.png')
plt.close()

# 2. 数值特征与违约的关系（以收入为例）
plt.figure(figsize=(10, 6))
sns.boxplot(x='TARGET', y='AMT_INCOME_TOTAL', data=train)
plt.yscale('log')  # 收入数据长尾分布，用对数缩放
plt.title('Income vs Default')
plt.savefig('income_vs_default.png')
plt.close()

# 3. 类别特征与违约的关系（以婚姻状态为例）
marriage_default = train.groupby('NAME_FAMILY_STATUS')['TARGET'].mean().sort_values()
plt.figure(figsize=(10, 6))
marriage_default.plot(kind='bar')
plt.title('Default Rate by Marriage Status')
plt.ylabel('Default Rate')
plt.savefig('marriage_vs_default.png')
plt.close()

# 4. 特征相关性分析（识别多重共线性）
corr = train[num_cols].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr.iloc[:15, :15], annot=True, cmap='coolwarm', fmt='.2f')  # 取前15列可视化
plt.title('Correlation Matrix of Numeric Features')
plt.savefig('correlation_matrix.png')
plt.close()

特征工程（风控核心环节）

In [5]:
# 1. 类别特征编码（风控常用LabelEncoder）
from sklearn.preprocessing import LabelEncoder

for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))  # 避免空值影响

# 2. 衍生特征（体现业务理解）
# 例1：收入与信贷金额比（反映还款压力）
train['INCOME_CREDIT_RATIO'] = train['AMT_INCOME_TOTAL'] / train['AMT_CREDIT']
# 例2：家庭人均收入
train['PERSONAL_INCOME'] = train['AMT_INCOME_TOTAL'] / (train['CNT_FAM_MEMBERS'] + 1)

# 3. 特征筛选（基于IV值，风控中用于衡量特征预测能力）
def calculate_iv(df, feature, target):
    """计算特征的信息价值（IV）"""
    binned = pd.qcut(df[feature], 10, duplicates='drop')  # 分10箱
    grouped = df.groupby(binned)[target].agg(['count', 'sum'])
    grouped['bad'] = grouped['sum']
    grouped['good'] = grouped['count'] - grouped['sum']
    grouped['bad_rate'] = grouped['bad'] / grouped['bad'].sum()
    grouped['good_rate'] = grouped['good'] / grouped['good'].sum()
    grouped['iv'] = (grouped['bad_rate'] - grouped['good_rate']) * np.log((grouped['bad_rate'] + 1e-10) / (grouped['good_rate'] + 1e-10))
    return grouped['iv'].sum()

# 计算所有特征的IV值，筛选IV>0.1的特征（风控中IV>0.1为有效特征）
iv_values = {}
for col in num_cols:
    if col != 'TARGET':
        iv = calculate_iv(train, col, 'TARGET')
        iv_values[col] = iv

valid_features = [k for k, v in iv_values.items() if v > 0.1] + list(cat_cols)
print(f"有效特征数量：{len(valid_features)}")

  train['INCOME_CREDIT_RATIO'] = train['AMT_INCOME_TOTAL'] / train['AMT_CREDIT']
  train['PERSONAL_INCOME'] = train['AMT_INCOME_TOTAL'] / (train['CNT_FAM_MEMBERS'] + 1)
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target]

有效特征数量：20


  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])


模型训练与评估

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # 风控基础模型
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix

# 划分训练集和测试集
X = train[valid_features]
y = train['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 训练逻辑回归模型（风控中可解释性优先）
model = LogisticRegression(max_iter=1000, class_weight='balanced')  # 处理样本不平衡
model.fit(X_train, y_train)

# 评估AUC（风控核心指标）
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"测试集AUC：{auc:.4f}")

# 绘制ROC曲线
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Default Prediction')
plt.legend()
plt.savefig('roc_curve.png')
plt.close()

测试集AUC：0.4731
