In [2]:
import pandas as pd

# 读取数据集
loan_data = pd.read_csv('archive/loan_data.csv')

# 检查数据的基本信息
print(loan_data.info())

# 查看前几行数据
print(loan_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB
None
   credit.policy             purpose  int.rate  installmen

数据集列和定义：

credit.policy：如果客户满足 LendingClub.com 的信用承销标准，则为 1，否则为 0。

purpose：贷款的用途（取值 “credit_card”、“debt_consolidation”、“educational”、“major_purchase”、“small_business” 和 “all_other”）。

int.rate：贷款的利率，以比例表示（11% 的利率将存储为 0.11）。被 LendingClub.com 判断为风险更高的借款人被分配更高的利率。

installment：如果贷款有资金，则借款人每月欠下的分期付款。

log.annual.inc：借款人自我报告的年收入的自然对数。

dti：借款人的债务收入比（债务金额除以年收入）。

fico：借款人的 FICO 信用评分。

days.with.cr.line：借款人拥有信用额度的天数。

revol.bal：借款人的循环余额（信用卡账单周期结束时未支付的金额）。

revol.util：借款人的循环额度利用率（已使用的信用额度相对于可用信用总额的金额）。

inq.last.6mths： 借款人在过去 6 个月内被债权人查询的次数。

delinq.2yrs：借款人在过去 2 年中逾期 30+ 天的还款次数。

not.full.paid:借款人是否会得到全额付款。
    
pub.rec：借款人的贬损性公共记录（破产申请、税收留置权或判决）的数量。

In [None]:
# 先卸载NumPy和相关依赖
!pip uninstall numpy matplotlib pandas

# 然后重新安装
!pip install numpy matplotlib pandas

In [4]:
import pandas as pd

# 读取数据集
loan_data = pd.read_csv('archive/loan_data.csv')

# 检查数据的基本信息
print(loan_data.info())

# 查看前几行数据
print(loan_data.head())
"""
个人信用风险评估数据预处理完整流程
包含：数据清洗、特征工程、标准化、可视化验证
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings('ignore')
# 设置中文显示（解决中文乱码问题）
plt.rcParams['font.sans-serif'] = ['SimHei']  # Windows系统使用黑体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题
# 读取数据
loan_data = pd.read_csv('archive/loan_data.csv')

# 数据快照
print("\n🔍 前3行数据示例:")
display(loan_data.head(3))

# ======================
# 2. 数据清洗
# ======================
def clean_data(df):
    """数据清洗主函数"""
    # 2.1 处理缺失值（虽然原始数据无缺失，但作为安全措施）
    df_clean = df.dropna().copy()
    
    # 2.2 处理异常值
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    
    for col in numeric_cols:
        q1 = df[col].quantile(0.05)  # 使用5%分位数替代1%避免过度修剪
        q3 = df[col].quantile(0.95)
        df_clean[col] = df[col].clip(lower=q1, upper=q3)
    
    # 2.3 修正列名格式
    df_clean.columns = [col.replace('.', '_') for col in df_clean.columns]
    
    return df_clean

loan_data = clean_data(loan_data)
print("\n🧹 清洗后数据维度:", loan_data.shape)

# ======================
# 3. 特征工程
# ======================
def feature_engineering(df):
    """特征工程处理"""
    # 3.1 编码类别变量
    le = LabelEncoder()
    df['purpose_encoded'] = le.fit_transform(df['purpose'])
    
    # 3.2 创建衍生特征
    df['credit_utilization'] = df['revol_bal'] / (df['installment'] * 12 + 1e-6)  # 防止除零
    df['income_to_installment'] = np.exp(df['log_annual_inc']) / (df['installment'] * 12 + 1e-6)
    df['credit_history_years'] = df['days_with_cr_line'] / 365
    
    # 3.3 交互特征
    df['fico_dti_interaction'] = df['fico'] * df['dti']
    
    return df, le

loan_data, label_encoder = feature_engineering(loan_data)

# 显示新增特征
print("\n✨ 新增特征示例:")
display(loan_data[['credit_utilization', 'income_to_installment', 'fico_dti_interaction']].head(3))

# ======================
# 4. 特征选择
# ======================
# 定义最终特征集
selected_features = [
    'credit_policy', 'purpose_encoded', 'int_rate', 'installment',
    'log_annual_inc', 'dti', 'fico', 'credit_history_years', 'revol_bal',
    'revol_util', 'inq_last_6mths', 'delinq_2yrs', 'pub_rec',
    'credit_utilization', 'income_to_installment', 'fico_dti_interaction'
]

target = 'not_fully_paid'

# 特征重要性预分析
mi_scores = mutual_info_classif(loan_data[selected_features], loan_data[target])
mi_df = pd.DataFrame({'feature': selected_features, 'mi_score': mi_scores}) \
       .sort_values('mi_score', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x='mi_score', y='feature', data=mi_df)
plt.title('特征与目标变量的互信息得分')
plt.tight_layout()
plt.show()

# ======================
# 5. 数据标准化与分割
# ======================
# 5.1 划分训练测试集
X_train, X_test, y_train, y_test = train_test_split(
    loan_data[selected_features], 
    loan_data[target],
    test_size=0.3, 
    random_state=42,
    stratify=loan_data[target]  # 保持类别比例
)

# 5.2 标准化处理（保留原始数据副本）
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 转换为DataFrame保持列名
X_train_df = pd.DataFrame(X_train_scaled, columns=selected_features)
X_test_df = pd.DataFrame(X_test_scaled, columns=selected_features)

print(f"\n📊 训练集形状: {X_train_df.shape}, 测试集形状: {X_test_df.shape}")
print("标准化后训练集统计:")
display(X_train_df.describe().round(2))

# ======================
# 7. 预处理结果可视化
# ======================
# 7.1 数值特征分布对比
plt.figure(figsize=(15,10))
for i, col in enumerate(['fico', 'dti', 'credit_utilization'], 1):
    plt.subplot(2,2,i)
    sns.kdeplot(data=loan_data, x=col, hue='not_fully_paid', fill=True)
    plt.title(f'{col} 分布对比')
plt.tight_layout()
plt.show()

# 7.2 类别特征分析
plt.figure(figsize=(12,6))
sns.countplot(data=loan_data, x='purpose', hue='not_fully_paid')
plt.title('不同贷款用途的违约情况')
plt.xticks(rotation=45)
plt.show()

# ======================
# 8. 保存预处理结果
# ======================
preprocessed_data = {
    'X_train': X_train_df,
    'y_train': y_train,
    'X_test': X_test_df,
    'y_test': y_test,
    'scaler': scaler,
    'label_encoder': label_encoder,
    'selected_features': selected_features
}

import joblib
joblib.dump(preprocessed_data, 'preprocessed_loan_data.joblib')
print("\n💾 预处理结果已保存为 preprocessed_loan_data.joblib")

# 最终数据结构确认
print("\n🎉 预处理完成！最终数据结构:")
print(f"训练集: {X_train_df.shape}, 测试集: {X_test_df.shape}")
print(f"特征数: {len(selected_features)}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "D:\anaconda\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "D:\anaconda\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "D:\anaconda\Lib\site-packages\ipykernel\kernelapp.py", line 736, in start
    self.io_loop.start()
  File "D:\anaconda\Lib\site-packages\tornado\platform\asyncio.py", line 195, in start

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [17]:
# -*- coding: utf-8 -*-
"""
个人信用风险评估数据预处理完整流程
包含：数据清洗、特征工程、标准化、样本平衡、可视化验证
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings('ignore')

# ======================
# 1. 数据加载与初步检查
# ======================
def load_data(filepath):
    """安全加载数据函数"""
    try:
        df = pd.read_csv(filepath)
        print(f"✅ 数据加载成功，原始维度: {df.shape}")
        return df
    except Exception as e:
        print(f"❌ 加载失败: {str(e)}")
        return None

loan_data = load_data('archive/loan_data.csv')

# 数据快照
print("\n🔍 前3行数据示例:")
display(loan_data.head(3))

# ======================
# 2. 数据清洗
# ======================
def clean_data(df):
    """数据清洗主函数"""
    # 2.1 处理缺失值（虽然原始数据无缺失，但作为安全措施）
    df_clean = df.dropna().copy()
    
    # 2.2 处理异常值
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    
    for col in numeric_cols:
        q1 = df[col].quantile(0.05)  # 使用5%分位数替代1%避免过度修剪
        q3 = df[col].quantile(0.95)
        df_clean[col] = df[col].clip(lower=q1, upper=q3)
    
    # 2.3 修正列名格式
    df_clean.columns = [col.replace('.', '_') for col in df_clean.columns]
    
    return df_clean

loan_data = clean_data(loan_data)
print("\n🧹 清洗后数据维度:", loan_data.shape)

# ======================
# 3. 特征工程
# ======================
def feature_engineering(df):
    """特征工程处理"""
    # 3.1 编码类别变量
    le = LabelEncoder()
    df['purpose_encoded'] = le.fit_transform(df['purpose'])
    
    # 3.2 创建衍生特征
    df['credit_utilization'] = df['revol_bal'] / (df['installment'] * 12 + 1e-6)  # 防止除零
    df['income_to_installment'] = np.exp(df['log_annual_inc']) / (df['installment'] * 12 + 1e-6)
    df['credit_history_years'] = df['days_with_cr_line'] / 365
    
    # 3.3 交互特征
    df['fico_dti_interaction'] = df['fico'] * df['dti']
    
    return df, le

loan_data, label_encoder = feature_engineering(loan_data)

# 显示新增特征
print("\n✨ 新增特征示例:")
display(loan_data[['credit_utilization', 'income_to_installment', 'fico_dti_interaction']].head(3))

# ======================
# 4. 特征选择
# ======================
# 定义最终特征集
selected_features = [
    'credit_policy', 'purpose_encoded', 'int_rate', 'installment',
    'log_annual_inc', 'dti', 'fico', 'credit_history_years', 'revol_bal',
    'revol_util', 'inq_last_6mths', 'delinq_2yrs', 'pub_rec',
    'credit_utilization', 'income_to_installment', 'fico_dti_interaction'
]

target = 'not_fully_paid'

# 特征重要性预分析
mi_scores = mutual_info_classif(loan_data[selected_features], loan_data[target])
mi_df = pd.DataFrame({'feature': selected_features, 'mi_score': mi_scores}) \
       .sort_values('mi_score', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x='mi_score', y='feature', data=mi_df)
plt.title('特征与目标变量的互信息得分')
plt.tight_layout()
plt.show()

# ======================
# 5. 数据标准化与分割
# ======================
# 5.1 划分训练测试集
X_train, X_test, y_train, y_test = train_test_split(
    loan_data[selected_features], 
    loan_data[target],
    test_size=0.3, 
    random_state=42,
    stratify=loan_data[target]  # 保持类别比例
)

# 5.2 标准化处理（保留原始数据副本）
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 转换为DataFrame保持列名
X_train_df = pd.DataFrame(X_train_scaled, columns=selected_features)
X_test_df = pd.DataFrame(X_test_scaled, columns=selected_features)

print(f"\n📊 训练集形状: {X_train_df.shape}, 测试集形状: {X_test_df.shape}")
print("标准化后训练集统计:")
display(X_train_df.describe().round(2))

# ======================
# 6. 处理样本不平衡
# ======================
# 6.1 检查原始分布
print("\n⚖️ 原始类别分布:")
print(y_train.value_counts(normalize=True))

# 6.2 SMOTE过采样
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_df, y_train)

# 6.3 验证平衡效果
print("\n🔄 过采样后类别分布:")
print(pd.Series(y_resampled).value_counts(normalize=True))

# ======================
# 7. 预处理结果可视化
# ======================
# 7.1 数值特征分布对比
plt.figure(figsize=(15,10))
for i, col in enumerate(['fico', 'dti', 'credit_utilization'], 1):
    plt.subplot(2,2,i)
    sns.kdeplot(data=loan_data, x=col, hue='not_fully_paid', fill=True)
    plt.title(f'{col} 分布对比')
plt.tight_layout()
plt.show()

# 7.2 类别特征分析
plt.figure(figsize=(12,6))
sns.countplot(data=loan_data, x='purpose', hue='not_fully_paid')
plt.title('不同贷款用途的违约情况')
plt.xticks(rotation=45)
plt.show()

# ======================
# 8. 保存预处理结果
# ======================
preprocessed_data = {
    'X_train': X_resampled,
    'y_train': y_resampled,
    'X_test': X_test_df,
    'y_test': y_test,
    'scaler': scaler,
    'label_encoder': label_encoder,
    'selected_features': selected_features
}

import joblib
joblib.dump(preprocessed_data, 'preprocessed_loan_data.joblib')
print("\n💾 预处理结果已保存为 preprocessed_loan_data.joblib")

# 最终数据结构确认
print("\n🎉 预处理完成！最终数据结构:")
print(f"训练集: {X_resampled.shape}, 测试集: {X_test_df.shape}")
print(f"特征数: {len(selected_features)}")

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (D:\anaconda\Lib\site-packages\sklearn\utils\_param_validation.py)

In [18]:
import imblearn
import sklearn
print("imbalanced-learn 版本:", imblearn.__version__)
print("scikit-learn 版本:", sklearn.__version__)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (D:\anaconda\Lib\site-packages\sklearn\utils\_param_validation.py)

In [19]:
!pip install -U imbalanced-learn --user

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting imbalanced-learn


  ERROR: HTTP error 403 while getting https://pypi.tuna.tsinghua.edu.cn/packages/9d/41/721fec82606242a2072ee909086ff918dfad7d0199a9dfd4928df9c72494/imbalanced_learn-0.13.0-py3-none-any.whl (from https://pypi.tuna.tsinghua.edu.cn/simple/imbalanced-learn/) (requires-python:>=3.10)
ERROR: Could not install requirement imbalanced-learn from https://pypi.tuna.tsinghua.edu.cn/packages/9d/41/721fec82606242a2072ee909086ff918dfad7d0199a9dfd4928df9c72494/imbalanced_learn-0.13.0-py3-none-any.whl because of HTTP error 403 Client Error: Forbidden for url: https://pypi.tuna.tsinghua.edu.cn/packages/9d/41/721fec82606242a2072ee909086ff918dfad7d0199a9dfd4928df9c72494/imbalanced_learn-0.13.0-py3-none-any.whl for URL https://pypi.tuna.tsinghua.edu.cn/packages/9d/41/721fec82606242a2072ee909086ff918dfad7d0199a9dfd4928df9c72494/imbalanced_learn-0.13.0-py3-none-any.whl (from https://pypi.tuna.tsinghua.edu.cn/simple/imbalanced-learn/) (requires-python:>=3.10)


In [20]:
!pip install -U imbalanced-learn --user --no-cache-dir --index-url https://pypi.org/simple

Collecting imbalanced-learn
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/9d/41/721fec82606242a2072ee909086ff918dfad7d0199a9dfd4928df9c72494/imbalanced_learn-0.13.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting scikit-learn<2,>=1.3.2 (from imbalanced-learn)
  Obtaining dependency information for scikit-learn<2,>=1.3.2 from https://files.pythonhosted.org/packages/f4/5a/ba91b8c57aa37dbd80d5ff958576a9a8c14317b04b671ae7f0d09b00993a/scikit_learn-1.7.0-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.7.0-cp311-cp311-win_amd64.whl.metadata (14 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Obtaining dependency information for sklearn-compat<1,>=0.1 from https://files.pythonhosted.org/packages/f0/a8/ad69cf130fbd017660cdd64abbef3f28135d9e2e15fe3002e03c5be0ca38/sklearn_compat-0.1.3-py3-none-any.whl.metadata
  Downloading sklearn_compat-0.1.3-py3-n

In [21]:
import imblearn
print("imbalanced-learn 版本:", imblearn.__version__)  # 应显示 0.10.1+

ModuleNotFoundError: No module named 'imblearn.utils._sklearn_compat'

In [22]:
!pip uninstall -y imbalanced-learn scikit-learn sklearn-compat
!pip install imbalanced-learn==0.10.1 scikit-learn==1.2.2 --user

Found existing installation: imbalanced-learn 0.13.0
Uninstalling imbalanced-learn-0.13.0:
  Successfully uninstalled imbalanced-learn-0.13.0
Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Found existing installation: sklearn-compat 0.1.3
Uninstalling sklearn-compat-0.1.3:
  Successfully uninstalled sklearn-compat-0.1.3
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting scikit-learn==1.2.2


  ERROR: HTTP error 403 while getting https://pypi.tuna.tsinghua.edu.cn/packages/db/98/169b46a84b48f92df2b5e163fce75d471f4df933f8b3d925a61133210776/scikit_learn-1.2.2-cp311-cp311-win_amd64.whl (from https://pypi.tuna.tsinghua.edu.cn/simple/scikit-learn/) (requires-python:>=3.8)
ERROR: Could not install requirement scikit-learn==1.2.2 from https://pypi.tuna.tsinghua.edu.cn/packages/db/98/169b46a84b48f92df2b5e163fce75d471f4df933f8b3d925a61133210776/scikit_learn-1.2.2-cp311-cp311-win_amd64.whl because of HTTP error 403 Client Error: Forbidden for url: https://pypi.tuna.tsinghua.edu.cn/packages/db/98/169b46a84b48f92df2b5e163fce75d471f4df933f8b3d925a61133210776/scikit_learn-1.2.2-cp311-cp311-win_amd64.whl for URL https://pypi.tuna.tsinghua.edu.cn/packages/db/98/169b46a84b48f92df2b5e163fce75d471f4df933f8b3d925a61133210776/scikit_learn-1.2.2-cp311-cp311-win_amd64.whl (from https://pypi.tuna.tsinghua.edu.cn/simple/scikit-learn/) (requires-python:>=3.8)


In [23]:
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1 --user --no-cache-dir --index-url https://pypi.org/simple

Collecting scikit-learn==1.2.2
  Obtaining dependency information for scikit-learn==1.2.2 from https://files.pythonhosted.org/packages/db/98/169b46a84b48f92df2b5e163fce75d471f4df933f8b3d925a61133210776/scikit_learn-1.2.2-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.2.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.2-cp311-cp311-win_amd64.whl (8.3 MB)
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB 330.3 kB/s eta 0:00:25
   ---------------------------------------- 0.0/8.3 MB 393.8 kB/s eta 0:00:21
   ---------------------------------------- 0.1/8.3 MB 365.7 kB/s eta 0:00:23
   ---------------------------------------- 0.1/8.3 MB 403.5 kB/s eta 0:00:21
   ---------------------------------------- 0.1/8.3 MB 368.6 kB/s eta 0:00:23
    --------------------------------------- 0.1/8.3 MB 400.9 kB/s eta 0:00

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlxtend 0.23.4 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.2 which is incompatible.
