# 1.初步处理

In [1]:
 import pandas as pd

german_credit_data = pd.read_csv('add1.csv')
australian_credit_data = pd.read_csv('add2.csv')

german_credit_data, australian_credit_data

(     X1  X2  X3  X4  X5  X6  X7  X8  X9  X10  ...  X16  X17  X18  X19  X20  \
 0     1   6   4  12   5   5   3   4   1   67  ...    0    0    1    0    0   
 1     2  48   2  60   1   3   2   2   1   22  ...    0    0    1    0    0   
 2     4  12   4  21   1   4   3   3   1   49  ...    0    0    1    0    0   
 3     1  42   2  79   1   4   3   4   2   45  ...    0    0    0    0    0   
 4     1  24   3  49   1   3   3   4   4   53  ...    1    0    1    0    0   
 ..   ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...  ...  ...  ...   
 995   4  12   2  17   1   4   2   4   1   31  ...    0    0    1    0    0   
 996   1  30   2  39   1   3   1   4   2   40  ...    0    1    1    0    0   
 997   4  12   2   8   1   5   3   4   3   38  ...    0    0    1    0    0   
 998   1  45   2  18   1   3   3   4   4   23  ...    0    0    1    0    0   
 999   2  45   4  46   2   1   3   4   3   27  ...    0    1    1    0    0   
 
      X21  X22  X23  X24  Y(1=default, 0=non-defau

# 2.现有模型

## 2.1 DT

In [191]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# 评估模型并返回所需指标的函数
def evaluate_model(data, model):
    X = data.drop(columns=['Y(1=default, 0=non-default)']).values
    y = data['Y(1=default, 0=non-default)'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    type1_error = conf_matrix[0, 1] / conf_matrix.sum()
    type2_error = conf_matrix[1, 0] / conf_matrix.sum()
    return accuracy, auc, type1_error, type2_error

# 初始化分类器
dt_classifier = DecisionTreeClassifier(random_state=42)

# 德国数据集评估
german_results = evaluate_model(german_credit_data, dt_classifier)

# 澳大利亚数据集评估
australian_results = evaluate_model(australian_credit_data, dt_classifier)

# 创建结果表格
german_table = pd.DataFrame({
    "模型": ["DT"],
    "Accuracy": [german_results[0]],
    "AUC": [german_results[1]],
    "Type1-error": [german_results[2]],
    "Type2-error": [german_results[3]]
})

australian_table = pd.DataFrame({
    "模型": ["DT"],
    "Accuracy": [australian_results[0]],
    "AUC": [australian_results[1]],
    "Type1-error": [australian_results[2]],
    "Type2-error": [australian_results[3]]
})

# 保存到CSV
german_table_path = '德国.csv'
australian_table_path = '澳大利亚.csv'
german_table.to_csv(german_table_path, index=False, encoding='utf-8-sig')
australian_table.to_csv(australian_table_path, index=False, encoding='utf-8-sig')

## 2.2 KNN

In [192]:
from sklearn.neighbors import KNeighborsClassifier

# 初始化
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# 德国
german_knn_results = evaluate_model(german_credit_data, knn_classifier)

# 澳大利亚
australian_knn_results = evaluate_model(australian_credit_data, knn_classifier)

# 更新结果表格
german_table = pd.concat([german_table, pd.DataFrame({
    "模型": ["KNN"],
    "Accuracy": [german_knn_results[0]],
    "AUC": [german_knn_results[1]],
    "Type1-error": [german_knn_results[2]],
    "Type2-error": [german_knn_results[3]]
})], ignore_index=True)

australian_table = pd.concat([australian_table, pd.DataFrame({
    "模型": ["KNN"],
    "Accuracy": [australian_knn_results[0]],
    "AUC": [australian_knn_results[1]],
    "Type1-error": [australian_knn_results[2]],
    "Type2-error": [australian_knn_results[3]]
})], ignore_index=True)

# 保存
german_table.to_csv(german_table_path, index=False, encoding='utf-8-sig')
australian_table.to_csv(australian_table_path, index=False, encoding='utf-8-sig')

## 2.3 RF

In [193]:
from sklearn.ensemble import RandomForestClassifier

# 初始化
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# 德国
german_rf_results = evaluate_model(german_credit_data, rf_classifier)

# 澳大利亚
australian_rf_results = evaluate_model(australian_credit_data, rf_classifier)

# 更新
german_table = pd.concat([german_table, pd.DataFrame({
    "模型": ["RF"],
    "Accuracy": [german_rf_results[0]],
    "AUC": [german_rf_results[1]],
    "Type1-error": [german_rf_results[2]],
    "Type2-error": [german_rf_results[3]]
})], ignore_index=True)

australian_table = pd.concat([australian_table, pd.DataFrame({
    "模型": ["RF"],
    "Accuracy": [australian_rf_results[0]],
    "AUC": [australian_rf_results[1]],
    "Type1-error": [australian_rf_results[2]],
    "Type2-error": [australian_rf_results[3]]
})], ignore_index=True)

# 保存
german_table.to_csv(german_table_path, index=False, encoding='utf-8-sig')
australian_table.to_csv(australian_table_path, index=False, encoding='utf-8-sig')

## 2.4 SVC

In [196]:
from sklearn.svm import SVC

# 初始化
svm_classifier = SVC(kernel='linear', random_state=42)

# 德国
german_svm_results = evaluate_model(german_credit_data, svm_classifier)

# 澳大利亚
australian_svm_results = evaluate_model(australian_credit_data, svm_classifier)

# 更新
german_table = pd.concat([german_table, pd.DataFrame({
    "模型": ["SVM"],
    "Accuracy": [german_svm_results[0]],
    "AUC": [german_svm_results[1]],
    "Type1-error": [german_svm_results[2]],
    "Type2-error": [german_svm_results[3]]
})], ignore_index=True)

australian_table = pd.concat([australian_table, pd.DataFrame({
    "模型": ["SVM"],
    "Accuracy": [australian_svm_results[0]],
    "AUC": [australian_svm_results[1]],
    "Type1-error": [australian_svm_results[2]],
    "Type2-error": [australian_svm_results[3]]
})], ignore_index=True)

# 保存
german_table.to_csv(german_table_path, index=False, encoding='utf-8-sig')
australian_table.to_csv(australian_table_path, index=False, encoding='utf-8-sig')

# 3.我们的模型

## 3.1 同第一问，对附件2进行IV值计算

In [28]:
import pandas as pd
import numpy as np

data = pd.read_csv('add2.csv')

# 定义划分区间的函数
def binning_feature(data, feature):
    bins = np.quantile(data[feature].dropna(), [0, 0.2, 0.4, 0.6, 0.8, 1])
    binned_feature = pd.cut(data[feature], bins, labels=False, duplicates='drop')
    return binned_feature

# 定义计算IV的函数
def calculate_iv(data, feature, target):
    total_bad = data[target].sum()
    total_good = data[target].count() - total_bad
    grouped = data.groupby(feature)[target].agg(['sum', 'count'])
    grouped.columns = ['bad', 'total']
    grouped['good'] = grouped['total'] - grouped['bad']

    # 避免分母为零的情况
    grouped['Bi'] = np.where(grouped['bad'] == 0, 0.00001, grouped['bad'] / total_bad)
    grouped['Gi'] = np.where(grouped['good'] == 0, 0.00001, grouped['good'] / total_good)

    grouped['WOE'] = np.log(grouped['Bi'] / grouped['Gi'])
    grouped['IV'] = (grouped['Bi'] - grouped['Gi']) * grouped['WOE']

    feature_iv = grouped['IV'].sum()
    return feature_iv

# 为特定的连续特征划分区间
for feature in ['X2', 'X3', 'X7', 'X13', 'X14']:
    data[feature] = binning_feature(data, feature)

# 计算所有特征的IV值
iv_results = {}
for col in data.columns:
    if col != 'Y(1=default, 0=non-default)': 
        iv_results[col] = calculate_iv(data, col, 'Y(1=default, 0=non-default)')

iv_df = pd.DataFrame.from_dict(iv_results, orient='index', columns=['IV Value'])
iv_df.to_csv('IV值2.csv', index=True)

iv_df

Unnamed: 0,IV Value
X1,0.000781
X2,0.124104
X3,0.231719
X4,0.193455
X5,0.659985
X6,0.302602
X7,0.640154
X8,2.818864
X9,0.910995
X10,2.121855


In [24]:
def categorize_iv(value):
    if value < 0.02:
        return '无用特征'
    elif 0.02 <= value < 0.1:
        return '弱价值特征'
    elif 0.1 <= value < 0.3:
        return '中价值特征'
    elif 0.3 <= value < 0.5:
        return '强价值特征'
    else:
        return '价值过高，不真实'

# 存储标签
iv_df['价值分类'] = iv_df['IV Value'].apply(categorize_iv)

# 筛选
filtered_iv_df = iv_df[iv_df['价值分类'].isin(['中价值特征', '强价值特征'])]

filtered_data_path = '筛选2.csv'
filtered_iv_df.to_csv(filtered_data_path, index=True)

filtered_iv_df

Unnamed: 0,IV Value,价值分类
X2,0.124104,中价值特征
X3,0.231719,中价值特征
X4,0.193455,中价值特征
X6,0.302602,强价值特征
X13,0.164355,中价值特征


In [27]:
data = pd.read_csv('add2.csv')

#从源文件中提取筛选后的数据
selected_features = [
    'X2', 'X3', 'X4', 'X6', 'X13', 'Y(1=default, 0=non-default)'
]
filtered_data = data[selected_features]

output_path = '筛选add2.csv'
filtered_data.to_csv(output_path, index=False)

filtered_data

Unnamed: 0,X2,X3,X4,X6,X13,"Y(1=default, 0=non-default)"
0,22.08,11.460,2,4,100,1
1,22.67,7.000,2,4,160,1
2,29.58,1.750,1,4,280,1
3,21.67,11.500,1,3,0,0
4,20.17,8.170,2,4,60,0
...,...,...,...,...,...,...
685,31.57,10.500,2,4,0,0
686,20.67,0.415,2,4,0,1
687,18.83,9.540,2,4,100,0
688,27.42,14.500,2,8,120,0


## 3.2 模型构建与运行（在阿里云上运行）

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

german_data_path = '筛选add1.csv' 
australian_data_path = '筛选add2.csv' 
german_data = pd.read_csv(german_data_path)
australian_data = pd.read_csv(australian_data_path)

# 数据预处理函数
def preprocess_data(data):
    scaler = MinMaxScaler()
    X = scaler.fit_transform(data.drop(columns=['Y(1=default, 0=non-default)']))
    y = data['Y(1=default, 0=non-default)'].values
    return X, y

# 构建模型函数
def build_model(input_dim):
    model = Sequential([
        Dense(128, input_dim=input_dim, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 评估模型函数
def evaluate_model(X, y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=0)
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    type1_error = conf_matrix[0, 1] / conf_matrix.sum()
    type2_error = conf_matrix[1, 0] / conf_matrix.sum()
    return accuracy, auc, type1_error, type2_error

X_german, y_german = preprocess_data(german_data)
X_australian, y_australian = preprocess_data(australian_data)

german_model = build_model(X_german.shape[1])
australian_model = build_model(X_australian.shape[1])

german_results = evaluate_model(X_german, y_german, german_model)
australian_results = evaluate_model(X_australian, y_australian, australian_model)

# 读取结果表格
german_table = pd.read_csv('德国.csv')  
australian_table = pd.read_csv('澳大利亚.csv') 

# 保存到CSV
german_table = german_table.append({'模型': '我们的模型', 
                                    'Accuracy': german_results[0],
                                    'AUC': german_results[1],
                                    'Type1-error': german_results[2],
                                    'Type2-error': german_results[3]}, 
                                   ignore_index=True)

australian_table = australian_table.append({'模型': '我们的模型', 
                                            'Accuracy': australian_results[0],
                                            'AUC': australian_results[1],
                                            'Type1-error': australian_results[2],
                                            'Type2-error': australian_results[3]}, 
                                           ignore_index=True)

german_table.to_csv(german_table_path, index=False, encoding='utf-8-sig')
australian_table.to_csv(australian_table_path, index=False, encoding='utf-8-sig')