In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore') 

%matplotlib inline

#加载数据
test=pd.read_csv('data/cs-test.csv')
train=pd.read_csv('data/cs-training.csv')

In [2]:
#评价指标是AUC，这里定义计算AUC函数
def computeAUC(X,Y):
    auc = roc_auc_score(X,Y)
    print("auc=",auc)
    return auc

In [4]:
#查看数据情况
train.info()
train.describe()
#统计空值
train.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
Unnamed: 0                              150000 non-null int64
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           120269 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      146076 non-null float64
dtypes: float64(4), int64(8)
memory usage: 13.7 MB


Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [3]:
#处理异常值
train[train['age']<18]
train.loc[train['age']==0,'age']=train['age'].median()
      
#吧样本分为退休与工作两类
age_working=train.loc[(train['age']>=18) & (train['age']<60)]
age_senior=train.loc[train['age']>=60]

#用收入的平均值填充缺失值
age_working_income=age_working['MonthlyIncome'].mean()
age_senior_income=age_senior['MonthlyIncome'].mean()
train["MonthlyIncome"] = train["MonthlyIncome"].fillna(99999)
train.loc[((train['age']>=18) & (train['age']<60))&(train["MonthlyIncome"]==99999),"MonthlyIncome"]=age_working_income#用对应平均数填充
train.loc[(train['age']>=60)&(train["MonthlyIncome"]==99999),"MonthlyIncome"]=age_senior_income
train["MonthlyIncome"] = train["MonthlyIncome"].astype('int64')

#简单处理抚养人数的缺失值
train["NumberOfDependents"] = train["NumberOfDependents"].fillna(0)
train["NumberOfDependents"] = train["NumberOfDependents"].astype('int64')
train["NumberOfDependents"].value_counts()

0     90826
1     26316
2     19522
3      9483
4      2862
5       746
6       158
7        51
8        24
10        5
9         5
20        1
13        1
Name: NumberOfDependents, dtype: int64

In [4]:
#把三类违约集合处理
train["CombinedDefaulted"]=train['NumberOfTime30-59DaysPastDueNotWorse']+train['NumberOfTime60-89DaysPastDueNotWorse']+train['NumberOfTimes90DaysLate']
train.loc[train["CombinedDefaulted"]>=1,"CombinedDefaulted"]=1
train["CombinedCreditLoans"] = train["NumberOfOpenCreditLinesAndLoans"] +train["NumberRealEstateLoansOrLines"]
train.loc[(train["CombinedCreditLoans"] <= 5), "CombinedCreditLoans"] = 0
train.loc[(train["CombinedCreditLoans"] > 5), "CombinedCreditLoans"] = 1
train["CombinedCreditLoans"].value_counts()

#处理扶养人数特征
train["WithDependents"] = train["NumberOfDependents"]
train.loc[(train["WithDependents"] >= 1), "WithDependents"] = 1
train["WithDependents"].value_counts()

#每月债务支出
train["MonthlyDebtPayments"] = train["DebtRatio"] * train["MonthlyIncome"]
train["MonthlyDebtPayments"] = np.absolute(train["MonthlyDebtPayments"])
train["MonthlyDebtPayments"] = train["MonthlyDebtPayments"].astype('int64')
train['age'].astype('int64')
train["MonthlyDebtPayments"].astype('int64')

train["age_map"] = train["age"]
train.loc[(train["age"] >= 18) & (train["age"] < 60), "age_map"] = 1
train.loc[(train["age"] >= 60), "age_map"] = 0

#替换为分类特征，然后获得虚拟变量
train["age_map"] = train["age_map"].replace(0, "working")
train["age_map"] = train["age_map"].replace(1, "senior")
train= pd.concat([train, pd.get_dummies(train.age_map,prefix='is')], axis=1)

In [None]:
#查看关联系数，决定保留哪些特征
corr = train.corr()
corr['SeriousDlqin2yrs'].sort_values(ascending=False)

In [5]:
#查看过关联系数后，删除关联系小的特征
train.drop(["Unnamed: 0","NumberOfOpenCreditLinesAndLoans","NumberOfTimes90DaysLate","NumberRealEstateLoansOrLines","NumberOfTime60-89DaysPastDueNotWorse","WithDependents","age_map","is_senior","is_working", "MonthlyDebtPayments"], axis=1, inplace=True)

In [6]:
#同理处理测试集
def cleaned_dataset(dataset):
    dataset.loc[dataset["age"] <= 18, "age"] = dataset.age.median()
    age_working = dataset.loc[(dataset["age"] >= 18) & (dataset["age"] < 60)]
    age_senior = dataset.loc[(dataset["age"] >= 60)]

    age_working_impute = age_working.MonthlyIncome.mean()
    age_senior_impute = age_senior.MonthlyIncome.mean()

    dataset["MonthlyIncome"] = np.absolute(dataset["MonthlyIncome"])
    dataset["MonthlyIncome"] = dataset["MonthlyIncome"].fillna(99999)
    dataset["MonthlyIncome"] = dataset["MonthlyIncome"].astype('int64')

    dataset.loc[((dataset["age"] >= 18) & (dataset["age"] < 60)) & (dataset["MonthlyIncome"] == 99999),"MonthlyIncome"] = age_working_impute
    dataset.loc[(train["age"] >= 60) & (dataset["MonthlyIncome"] == 99999), "MonthlyIncome"] = age_senior_impute
    dataset["NumberOfDependents"] = np.absolute(dataset["NumberOfDependents"])
    dataset["NumberOfDependents"] = dataset["NumberOfDependents"].fillna(0)
    dataset["NumberOfDependents"] = dataset["NumberOfDependents"].astype('int64')

    dataset["CombinedDefaulted"] = (dataset["NumberOfTimes90DaysLate"] + dataset["NumberOfTime60-89DaysPastDueNotWorse"]) + dataset["NumberOfTime30-59DaysPastDueNotWorse"]
    dataset.loc[(dataset["CombinedDefaulted"] >= 1), "CombinedDefaulted"] = 1

    dataset["CombinedCreditLoans"] = dataset["NumberOfOpenCreditLinesAndLoans"] + dataset["NumberRealEstateLoansOrLines"]
    dataset.loc[(dataset["CombinedCreditLoans"] <= 5), "CombinedCreditLoans"] = 0
    dataset.loc[(dataset["CombinedCreditLoans"] > 5), "CombinedCreditLoans"] = 1

    dataset.drop(["Unnamed: 0","NumberOfOpenCreditLinesAndLoans","NumberOfTimes90DaysLate","NumberRealEstateLoansOrLines","NumberOfTime60-89DaysPastDueNotWorse"], axis=1, inplace=True)

cleaned_dataset(test)

In [7]:
#分离
X = train.drop("SeriousDlqin2yrs", axis=1).copy()
y = train.SeriousDlqin2yrs
X_test = test.drop("SeriousDlqin2yrs", axis=1).copy()
y_test = test.SeriousDlqin2yrs

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import xgboost as xgb

In [9]:
#分离训练集，验证集
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=42)

#标准化处理
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [11]:
#模型比较
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
predicted_probs_train = lr.predict_proba(X_train_scaled)
predicted_probs_train = [x[1] for  x in predicted_probs_train]
computeAUC(y_train, predicted_probs_train)

auc= 0.8033929109526973


0.8033929109526973

In [12]:
predicted_probs_test_new = lr.predict_proba(X_val_scaled)
predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
computeAUC(y_val, predicted_probs_test_new)

auc= 0.8014449476924851


0.8014449476924851

In [16]:
model = tree.DecisionTreeClassifier()    
model.fit(X_train_scaled, y_train)
predicted_probs_train = model.predict_proba(X_train_scaled)
predicted_probs_train = [x[1] for  x in predicted_probs_train]
computeAUC(y_train, predicted_probs_train)

auc= 0.9999972112505437


0.9999972112505437

In [17]:
predicted_probs_test_new = lr.predict_proba(X_val_scaled)
predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
computeAUC(y_val, predicted_probs_test_new)

auc= 0.8014449476924851


0.8014449476924851

In [19]:
#用网格搜索调整参数
rf = RandomForestClassifier(n_estimators=100,
                            oob_score= True,
                            min_samples_split=2,
                            min_samples_leaf=50,
                            n_jobs=-1,
                            class_weight='balanced_subsample',
                            bootstrap=True)
param_grid = {"max_features": [2, 3, 4], "min_samples_leaf":[50]}
grid_search = GridSearchCV(rf, cv=10, scoring='roc_auc', param_grid=param_grid, iid=False)
grid_search.fit(X_train_scaled, y_train)
print("the best parameter:", grid_search.best_params_)
print("the best score:", grid_search.best_score_)

the best parameter: {'max_features': 3, 'min_samples_leaf': 50}
the best score: 0.8534636245352856


In [26]:
使用训练的模型来预测X_train数据
predicted_probs_train = grid_search.predict_proba(X_train_scaled)
predicted_probs_train = [x[1] for  x in predicted_probs_train]
computeAUC(y_train, predicted_probs_train)

In [25]:
#使用训练的模型来预测X_val数据
predicted_probs_val = grid_search.predict_proba(X_val)
predicted_probs_val = [x[1] for x in predicted_probs_val]
computeAUC(y_val, predicted_probs_val)

In [24]:
#使用该模型预测test
predicted_probs_test = grid_search.predict_proba(X_test)
predicted_probs_test = ["%.9f" % x[1] for x in predicted_probs_test]
submission = pd.DataFrame({'Id':range(1,101504), 'Probability':predicted_probs_test})
submission.to_csv("predict_proba.csv", index=False)