# *Load Google Drive*

# 数据分析

## 查看数据

In [None]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/train.csv')
testA = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/testA.csv')

print(data.head())
print(testA.head())

   id  loanAmnt  term  interestRate  installment  ...   n10  n11  n12  n13  n14
0   0   35000.0     5         19.52       917.97  ...   7.0  0.0  0.0  0.0  2.0
1   1   18000.0     5         18.49       461.90  ...  13.0  NaN  NaN  NaN  NaN
2   2   12000.0     5         16.99       298.17  ...  11.0  0.0  0.0  0.0  4.0
3   3   11000.0     3          7.26       340.96  ...   9.0  0.0  0.0  0.0  1.0
4   4    3000.0     3         12.99       101.07  ...  12.0  0.0  0.0  0.0  4.0

[5 rows x 47 columns]
       id  loanAmnt  term  interestRate  installment  ...   n10  n11  n12  n13  n14
0  800000   14000.0     3         10.99       458.28  ...  17.0  0.0  0.0  1.0  3.0
1  800001   20000.0     5         14.65       472.14  ...   5.0  0.0  0.0  2.0  2.0
2  800002   12000.0     3         19.99       445.91  ...  12.0  0.0  0.0  0.0  7.0
3  800003   17500.0     5         14.31       410.02  ...  10.0  0.0  0.0  0.0  3.0
4  800004   35000.0     3         17.09      1249.42  ...  19.0  0.0  0.0  0.

In [None]:
print(data.columns)

print(testA.columns)

for i in data.columns:
  if i not in testA.columns:
    print(i)  


Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
       'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'purpose',
       'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcie

## 数据处理

# 特征工程

## 通用流程

数据预处理
  * 缺失值填充
  * 时间格式处理
  * 对象类型特征转换为数值

异常值处理
  * 基于3σ原则
  * 基于箱型图

数据分箱
  * 固定宽度分箱
  * 分位数分箱
    * 离散数值型数据分箱
    * 连续数值型数据分箱
  * 卡方分箱

特征交互
  * 特征和特征之间组合
  * 特征和特征之间衍生
  * 其他特征衍生的尝试

特征编码
  * one-hot编码
  * label-encode编码

特征选择
  * Filter
  * Wrapper
  * Embedded





## 代码实践

### 导入包并读取数据

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
from tqdm import tqdm 
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

warnings.filterwarnings('ignore')


In [3]:
!sudo pip3 install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 89kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1


In [5]:
data_train = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/train.csv')
data_test = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/testA.csv')


### 特征预处理

基于EDA分析数据存在的问题，并解决。包括：缺失值填充，时间格式特征转化，某些对象类别特征的处理


In [11]:
# 查出对象的类别特征和数值特征
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea= list(filter(lambda x:x not in numerical_fea, list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

#### 缺失值填充
（数值型特征处理）

In [8]:
# 缺失值填充
data_train = data_train.fillna(axis=0, method='ffill', limit=2)  

# 查看缺失值情况
print(data_train.isnull().sum()) 

id                      0
loanAmnt                0
term                    0
interestRate            0
installment             0
grade                   0
subGrade                0
employmentTitle         0
employmentLength      159
homeOwnership           0
annualIncome            0
verificationStatus      0
issueDate               0
isDefault               0
purpose                 0
postCode                0
regionCode              0
dti                     0
delinquency_2years      0
ficoRangeLow            0
ficoRangeHigh           0
openAcc                 0
pubRec                  0
pubRecBankruptcies      0
revolBal                0
revolUtil               0
totalAcc                0
initialListStatus       0
applicationType         0
earliesCreditLine       0
title                   0
policyCode              0
n0                    118
n1                    118
n2                    118
n3                    118
n4                     70
n5                    118
n6          

In [13]:
# 按照中位数填充数值型特征
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test[numerical_fea] = data_test[numerical_fea].fillna(data_test[numerical_fea].median())

# 按照众数填充类别型特征
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_test[category_fea] = data_test[category_fea].fillna(data_test[category_fea].mode())


#### 时间特征处理

In [None]:
# 时间格式处理
for data in [data_train,data_test]:
  data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
  startdate = datetime.datetime.strptime('2007-06-01','%Y-%m-%d')
  # 构造时间特征
  data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

data_train['employmentLength'].value_counts(dropna=False).sort_index()


#### 数值型文本特征处理

In [24]:
# 对象类型特征转换到数值
#①
def employmentLength_to_int(s):
  if pd.isnull(s):
    return s
  else:
    return np.int8(s.split()[0])

for data in [data_train, data_test]:
  data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
  data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
  data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)


In [26]:

data['employmentLength'].value_counts(dropna=False).sort_index()


0.0     15989
1.0     13182
2.0     18207
3.0     16011
4.0     11833
5.0     12543
6.0      9328
7.0      8823
8.0      8976
9.0      7594
10.0    65772
NaN     11742
Name: employmentLength, dtype: int64

In [28]:
#②
data_train['earliesCreditLine'].sample(5)

for data in [data_train, data_test]:
  data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))


#### 类别特征处理


In [30]:
# 部分类别特征
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \
                 'applicationType', 'initialListStatus', 'title', 'policyCode']

for f in cate_features:
  print(f,'类型数：', data[f].nunique())


grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 79282
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 889
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 12058
policyCode 类型数： 1


In [None]:
# 等级类别特征，使用labelencode或者自映射
for data in [data_train, data_test]:
  data['grade'] = data['grade'].map({'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7})
  
# 类别数在2以上，又不是高维稀疏的，且纯分类特征
for data in [data_train, data_test]:
    data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

### 异常值处理

分析异常原因并做对策
  * 不合理异常，和业务目标没关，删除
  * 合理异常，和业务目标有关，保留

Tips: 
  * 对于合理异常，能用监督用监督，不能用的考虑用异常检测算法
  * test的数据不能删除


In [None]:
# 检测异常方法一： 均方差
def find_outliers_by_3segma(data, fea):
  data_std = np.std(data[fea])
  data_mean = np.mean(data[fea])
  outliers_cut_off = data_std*3
  lower_rule = data_mean - outliers_cut_off
  upper_rule = data_mean + outliers_cut_off
  data[fea + '_outliers'] = data[fea].apply(lambda x: str('异常值') if x<lower_rule or x>upper_rule else '正常值')
  return data

for fea in numerical_fea:
  data_train = find_outliers_by_3segma(data_train, fea)
  print(data_train[fea + '_outliers'].value_counts())
  print(data_train.groupby(fea + '_outliers')['isDefault'].sum())
  print('*'*10)

# 删除异常值
for fea in numerical_fea:
  data_train = data_train[data_train[fea+'_outliers']=='正常值']
  data_train = data_train.reset_index(drop=True)


# 检测异常方法二：箱型图
'''
四分位数将数据分为三个点和四个区间，IQR=Q3-Q1，lower_cut_off=Q1-1.5*IQR, upper_cut_off=Q3+1.5*IQR.
'''


正常值    800000
Name: id_outliers, dtype: int64
id_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********
正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********
正常值    800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    799701
异常值       299
Name: homeOwnership_outliers, dtype: int64
homeOwnership_outliers
异常值        62
正常值    159548
Name: isDefault, dtype: int64
**********
正常值    793973
异常值      

'\n四分位数将数据分为三个点和四个区间，IQR=Q3-Q1，lower_cut_off=Q1-1.5*IQR, upper_cut_off=Q3+1.5*IQR.\n'

### 数据分桶

特征分箱的目的：
  * 降低变量的复杂性，减少变量噪音对模型的影响，提高自变量和因变量的相关度。

特征分桶的对象：
  * 将连续变量离散化
  * 将多状态离散变量合并成少状态

分箱的原因;
  * 数据特征的数值跨度大的数据，应用“距离度量”的方法会有大吃小的影响。

分箱的优点：
  * 处理缺失值：将缺失值单独作为一个分箱
  * 处理异常值：当数据中存在离群点，分箱离散化处理，可以提高变量的鲁棒性
  * 业务解释性：易于进行线性判断变量的作用

分箱的基本原则：
  * 最小分箱不能低于5%
  * 箱内不能全是好样本（好客户）
  * 连续箱单调




In [None]:
# 固定宽度分箱
'''当数据横跨多个数量级时，按照10的幂进行分组：0~9，10~99，100~999，。。。等等'''

## 通过除法映射到间隔均匀的分箱中，每个分箱的数值范围都是LoanAmnt/1000
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'],1000)

## 通过对数函数映射到指数宽度分箱
data['loanAmnt_bin2'] = np.floor_divide(np.log10(data['loanAmnt']))

# 分位数分箱  
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'],10,labels=False)  # 分十个不同的数据箱

# 卡方分箱及其他分箱方法的尝试


### 特征交互

使用交互特征的代价是训练时间和评分时间从0(n)增加到0(n2)


In [3]:
for col in ['grade', 'subGrade']:
  temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean':col+'_target_mean'})
  temp_dict.index = temp_dict[col].values
  temp_dict = temp_dict[col + '_target_mean'].to_dict()

  data_train[col+'_target_mean'] = data_train[col].map(temp_dict)
  data_test[col+'_target_mean'] = data_test[col].map(temp_dict)

# 其他衍生变量mean和std
for df in [data_train data_test]:
  for item in ['n0','n1','n2','n2.1','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
    df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')  #
    df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')

1.0


In [19]:
import pandas as pd
 
df = pd.DataFrame({'Country':['China','China', 'India', 'India', 'America', 'Japan', 'China', 'India'], 
                   'Income':[10000, 10000, 5000, 5002, 40000, 50000, 8000, 5000],
                    'Age':[5000, 4321, 1234, 4010, 250, 250, 4500, 4321]})
print(df)

df_0 = df.groupby(['Country'])['Age'].agg(['mean']).reset_index().rename(columns={'mean':'Country' + '_target_mean'})

print(df_0)

df_1 = df_0.rename(columns={'Country_target_maen':'MEAN'})

print(df_1)

df_2 = df.groupby(['Country'])['Age'].transform('mean')

print(df_2)

   Country  Income   Age
0    China   10000  5000
1    China   10000  4321
2    India    5000  1234
3    India    5002  4010
4  America   40000   250
5    Japan   50000   250
6    China    8000  4500
7    India    5000  4321
   Country  Country_target_mean
0  America           250.000000
1    China          4607.000000
2    India          3188.333333
3    Japan           250.000000
   Country  Country_target_mean
0  America           250.000000
1    China          4607.000000
2    India          3188.333333
3    Japan           250.000000
0    4607.000000
1    4607.000000
2    3188.333333
3    3188.333333
4     250.000000
5     250.000000
6    4607.000000
7    3188.333333
Name: Age, dtype: float64


### 特征编码

In [None]:
# labelEbcode直接放入树模型中
# 高维类别特征需要进行转换
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
  le = LabelEncoder()
  le.fit(list(data_train[col].astype(str).values) + list(data_test[col].astype(str).values))
  data_train[col] = le.transform(list(data_train[col].astype(str).values))
  data_test[col] = le.transform(list(data_test[col].astype(str).values))
print('Label Encoding Finish!')


In [None]:
# 逻辑回归等模型要单独增加的特征工程
'''
① 对特征做归一化，让训练收敛更快，避免大吃小，
② 去除相关性高的特征，可以增加模型可解释性，加快预测过程
'''
# 举例归一化过程
#伪代码
for fea in [要归一化的特征列表]：
    data[fea] = ((data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea])))
'''
也可以直接用sklearn的类
归一化：sklearn.preprocessing.MinMaxScaler()
标准化：sklearn.preprocessing.StandardScaler()
正则化：sklearn.preprocessing.Normalize()
'''

### 特征选择

目的：
  * 得到简约的模型

方式：
  * 精简特征，降低模型复杂度

原则：
  * 在不降低准确率的情况下提高计算速度
  * 并非为了减少训练时间，而是为了减少模型评分时间

方法：
  * filter
    * 方差选择法
    * 相关系数法（person相关系数）
    * 卡方检验
    * 互信息法
  * Wrapper（RFE）
    * 递归特征小出发
  * Embedded
    * 基于惩罚项的特征选择法
    * 基于树模型的特征选择


#### filter

基于特征间的关系进行筛选

In [None]:
# 方差选择法：
from sklearn.feature_extraction import VarianceThreshold
VarianceThreshold(threshold=3).fit_transform(train,target_train)

# 相关系数法（pearson相关系数)
from sklearn.feature_slelction import SelectKBest   # 选择K个最好的特征
from scipy.stats import pearsonr

SelectKBest(k=5).fit_transform(train, target_train)

# 卡方检验
'''
卡方检验用于检验变量对因变量的相关性。
假设自变量有N种取值，因变量有M种取值，考虑自变量等于i且因变量等于j的样本频数的观察值与期望的差距。
'''
SelectKBest(chi2,k=5).fit_transform(train, target_train)

# 互信息法
'''评价自变量对因变量的相关性'''
from sklearn.feature_selection import SelectKBest
from minepy import MINE

def mic(x,y):
  m = MINE()
  m.compute_score(x,y)
  return (m.mic(), 0.5)

SelectKBest(lambda X, Y: array(map(lambda x:mic(x,y), X.T)),k=2).fit_transform(train, target_train


#### Wrapper

递归特征消除法：使用一个基模型来进行多轮训练，消除若干权值系数的特征，再基于特征集进行下一轮训练。

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train, target_train)


#### Embedded

基于惩罚项的特征选择法，使用带有惩罚项的基模型，除了筛选出特征外，同时还进行了降维。

In [None]:
# 基于惩罚项的特征选择法，使用带有惩罚项的基模型，除了筛选出特征外，同时还进行了降维。
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

SelectFromModel(LogisticRegression(penalty='l1',c=0.1)).fit_transform(train, target_train)

# 基于树模型的特征选择，树模型中GBDT也可用来作为基模型进行特征选择
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

SelectFromModel(GradientBoostingClassifier()).fit_transform(train, target_train)

### 实战

本数据集中，我们删除非入模特征，对缺失值填充，计算协方差以查看特征间的相关性

In [None]:
# 删除不需要的数据
for data in [data_train, data_test]:
  data.drop(['issueDate','id'],axis=1,inplace=True)

# 纵向用缺失值上面的值替换缺失值
data_train = data_train.fillna(axis=0,method='ffill')

In [None]:
# 计算特征间协方差
x_train = data_train.drop(['isDefault','id'],axis=1)
data_corr = x_train.corrwith(data_train.isDefault)
result = pd.DataFrame(columns=['features','corr'])
result['features'] = data_corr.index
result['corr'] = data_corr.values

# 特征间关系可视化
data_numeric = data_train[numerical_fea]
correlation = data_numeric.corr()

f, ax = plt.subplots(figsize=(7,7))
plt.title('Correlation of Numeric Features with Price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)

In [None]:
features = [f for f in data_train.columns if f not in ['id','isDefault','issueDate'] and '_outliers' not in f]
x_train = data_train[features]
x_test = data_test[features]
y_train = data_train['isDefault']


In [None]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_x , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [None]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")

In [None]:
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

In [None]:
testA_result = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Result_Submission/testA_result.csv')

In [None]:
roc_auc_score(testA_result['isDefault'].values, lgb_test)

# 易明建模

## 准备工作

## 易明建模

In [None]:
# 使用易明软件进行建模和预测

## 结果形式调整

In [None]:
# 使用易明数据建模软件预测的结果提交
import pandas as pd
import numpy as np
result_proba_ym = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Result_Submission/submit_ym.csv')
print(result_proba_ym)
# result_ym = np.argmax(result_proba_ym.values, axis=1)
print(result_proba_ym.shape)

percentage = result_proba_ym['isDefault_1_percentage']

# 生成可提交的结果
result=pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/sample_submit.csv')
print(result)
result['isDefault'] = percentage
print(result)
result.to_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Result_Submission/submit.csv',index=False)

        isDefault_0_percentage  isDefault_1_percentage
0                     0.876752                0.123248
1                     0.639313                0.360687
2                     0.571715                0.428285
3                     0.718251                0.281749
4                     0.470049                0.529951
...                        ...                     ...
199995                0.856485                0.143515
199996                0.922308                0.077692
199997                0.791130                0.208870
199998                0.825138                0.174862
199999                0.944601                0.055399

[200000 rows x 2 columns]
(200000, 2)
            id  isDefault
0       800000        0.5
1       800001        0.5
2       800002        0.5
3       800003        0.5
4       800004        0.5
...        ...        ...
199995  999995        0.5
199996  999996        0.5
199997  999997        0.5
199998  999998        0.5
199999  999999 

In [None]:
percentage = list(result_proba_ym['isDefault_1_percentage'])
percentage_ = []
for i in range(len(percentage)):
  if percentage[i] > 0.5:
    percentage_.append(1)
  elif percentage[i] == 0.5:
    percentage_.append(1)
  else:
    percentage_.append(0)
print(percentage_)   # 生成稀疏标签分数不行



# 生成可提交的结果
result_=pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/sample_submit.csv')
print(result_)
result_['isDefault'] = percentage_
print(result_)
result_.to_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Result_Submission/submit.csv',index=False)


[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# lightGBM

## 数据准备

In [None]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/train.csv')
testA = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/testA.csv')

print(data.head())
print(testA.head())

   id  loanAmnt  term  interestRate  installment  ...   n10  n11  n12  n13  n14
0   0   35000.0     5         19.52       917.97  ...   7.0  0.0  0.0  0.0  2.0
1   1   18000.0     5         18.49       461.90  ...  13.0  NaN  NaN  NaN  NaN
2   2   12000.0     5         16.99       298.17  ...  11.0  0.0  0.0  0.0  4.0
3   3   11000.0     3          7.26       340.96  ...   9.0  0.0  0.0  0.0  1.0
4   4    3000.0     3         12.99       101.07  ...  12.0  0.0  0.0  0.0  4.0

[5 rows x 47 columns]
       id  loanAmnt  term  interestRate  installment  ...   n10  n11  n12  n13  n14
0  800000   14000.0     3         10.99       458.28  ...  17.0  0.0  0.0  1.0  3.0
1  800001   20000.0     5         14.65       472.14  ...   5.0  0.0  0.0  2.0  2.0
2  800002   12000.0     3         19.99       445.91  ...  12.0  0.0  0.0  0.0  7.0
3  800003   17500.0     5         14.31       410.02  ...  10.0  0.0  0.0  0.0  3.0
4  800004   35000.0     3         17.09      1249.42  ...  19.0  0.0  0.0  0.

In [None]:
from google.colab import data_table
data_table.DataTable(data)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# 逐一开展特征工程


In [None]:
# 划分features和labels
features = data.loc[:,testA.columns]
labels = data.loc[:,'isDefault']

# 划分X_train, X_val, y_train, y_val
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(features, labels)


In [None]:
print(X_train.isnull().any())

print(X_train['employmentTitle'])



id                    False
loanAmnt              False
term                  False
interestRate          False
installment           False
grade                 False
subGrade              False
employmentTitle        True
employmentLength       True
homeOwnership         False
annualIncome          False
verificationStatus    False
issueDate             False
purpose               False
postCode               True
regionCode            False
dti                    True
delinquency_2years    False
ficoRangeLow          False
ficoRangeHigh         False
openAcc               False
pubRec                False
pubRecBankruptcies     True
revolBal              False
revolUtil              True
totalAcc              False
initialListStatus     False
applicationType       False
earliesCreditLine     False
title                  True
policyCode            False
n0                     True
n1                     True
n2                     True
n3                     True
n4                  

## 建模验证

In [None]:
import lightgbm as lgb
from sklearn.metrics import f1_score

# 自定义函数


# 用lightGBM建立Dataset
train_matrix = lgb.Dataset(X_train, label=y_train)
valid_matrix = lgb.Dataset(X_val, label=y_val)

# 设置模型参数
params = {
    'learning_rate': 0.1,
    'boosting': 'gbdt',
    'lambda_l2': 0.1,
    'max_depth': 7,
    'num_leaves': 128,
    'bagging_fraction': 0.4,
    'feature_fraction': 0.6,
    'bagging_freq': 40,
    'min_data_in_lea': 45,
    'min_child_weight': 0.001,
    'metric': None,
    'objective': 'multiclass',
    'min_split_gain': 0.1,
    'num_class': 4,
    'nthread': 10,
    'verbose': -1,
}


# 使用lightGBM进行训练
model = lgb.train(
    params,
    train_set = train_matrix,
    valid_sets = valid_matrix,
    num_boost_round = 2000,
    verbose_eval = 50,
    early_stopping_rounds = 50,
    feval = f1_score
)


ValueError: ignored

In [None]:
# 保存模型
# model.save_model("/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Result_Submission/Model/LightGBM_FLD.txt")

# 加载已有模型
# model = lgb.Booster(model_file="/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Result_Submission/Model/LightGBM_FLD.txt")

## 调参

# AutoML


## autogluon

In [None]:
!sudo pip3 install setuptools wheel



In [None]:
!sudo pip3 install autogluon

Collecting autogluon
  Downloading https://files.pythonhosted.org/packages/89/38/1669479ce6a4760cd99d2ef7ffd450213bbce65c91202339cf716917bcf8/autogluon-0.2.0-py3-none-any.whl
Collecting autogluon.extra==0.2.0
  Downloading https://files.pythonhosted.org/packages/ca/a4/1fd328c8187f15897e9b4cb7ef193bfe58aeca03a9621216291ff4f9a008/autogluon.extra-0.2.0-py3-none-any.whl
Collecting autogluon.tabular[all]==0.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/10/9c/e879ef67abf232b8f582591d4ff774e60d41833332e49dc1a2d1b22e2d38/autogluon.tabular-0.2.0-py3-none-any.whl (250kB)
[K     |████████████████████████████████| 256kB 4.3MB/s 
[?25hCollecting autogluon.text==0.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/fa/9b/b226cbb1574cb87149727b6b84baefd99e4b12210cecffdeb9c6a55d1e8b/autogluon.text-0.2.0-py3-none-any.whl (48kB)
[K     |████████████████████████████████| 51kB 5.9MB/s 
[?25hCollecting autogluon.features==0.2.0
[?25l  Downloading https://files.pythonhoste

In [None]:
# !pip install -U pip
# !pip install -U setuptools wheel
# python3 -m pip install -U "mxnet<2.0.0"
# !pip install autogluon  # autogluon==0.2.0
!sudo pip3 install -U "mxnet<2.0.0"
# !bash

Collecting mxnet<2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/30/07/66174e78c12a3048db9039aaa09553e35035ef3a008ba3e0ed8d2aa3c47b/mxnet-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (46.9MB)
[K     |████████████████████████████████| 46.9MB 154kB/s 
Installing collected packages: mxnet
Successfully installed mxnet-1.8.0.post0


In [None]:
!pip show autogluon



In [None]:
# from dask.utils import stringify
!sudo pip install dask --upgrade

Collecting dask
[?25l  Downloading https://files.pythonhosted.org/packages/b9/a0/0905a1112dc3801304348ac0af0e641a2fbe12fe163ab5c3a43b2e88092d/dask-2021.5.0-py3-none-any.whl (960kB)
[K     |████████████████████████████████| 962kB 4.0MB/s 
[?25hCollecting partd>=0.3.10
  Downloading https://files.pythonhosted.org/packages/41/94/360258a68b55f47859d72b2d0b2b3cfe0ca4fbbcb81b78812bd00ae86b7c/partd-1.2.0-py3-none-any.whl
Collecting fsspec>=0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/bc/52/816d1a3a599176057bf29dfacb1f8fadb61d35fbd96cb1bab4aaa7df83c0/fsspec-2021.5.0-py3-none-any.whl (111kB)
[K     |████████████████████████████████| 112kB 29.6MB/s 
Collecting locket
  Downloading https://files.pythonhosted.org/packages/50/b8/e789e45b9b9c2db75e9d9e6ceb022c8d1d7e49b2c085ce8c05600f90a96b/locket-0.2.1-py2.py3-none-any.whl
Installing collected packages: locket, partd, fsspec, dask
  Found existing installation: dask 2.12.0
    Uninstalling dask-2.12.0:
      Successfully uni

In [2]:
%cd "/content/drive/Shareddrives/xucc1993.HK(CRN.NGO)/Forecast_of_Loan_Default/AutoML-AutoDL/agModels-predictClass"

/content/drive/Shareddrives/xucc1993.HK(CRN.NGO)/Forecast_of_Loan_Default/AutoML-AutoDL/agModels-predictClass


In [None]:
!pwd

/content/drive/Shareddrives/xucc1993.HK(CRN.NGO)/Forecast_of_Loan_Default/AutoML-AutoDL/agModels-predictClass


In [5]:
from autogluon.tabular import TabularDataset, TabularPredictor
TRAIN_DATA_path = '/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/train.csv'
TEST_DATA_path = '/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/testA.csv'
train_data = TabularDataset(TRAIN_DATA_path)
test_data = TabularDataset(TEST_DATA_path)
id, label = 'id', 'isDefault' 
# save_path = '/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Result_Submission/Result_AutoGluon_y_val.csv'
# predictor = TabularPredictor(label=label).fit(train_data=train_data)

# predictions = predictor.predict(test_data)
# print(predictions)

In [6]:
# print(train_data.isnull().any())
print(train_data.shape)

(800000, 47)


In [7]:

Train_data = train_data[:650000].copy()
Val_data = train_data[650000:].copy()

X_Train_data = Train_data.drop(columns=[label])
y_Train_data = Train_data[label]
X_Val_data = Val_data.drop(columns=[label])
y_Val_data = Val_data[label]

metric = 'accuracy'

In [13]:
predictor = TabularPredictor(label=label).fit(train_data=Train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20210518_023824/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20210518_023824/"
AutoGluon Version:  0.2.0
Train Data Rows:    650000
Train Data Columns: 46
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12505.75 MB
	Train Data (Original)  Memory Usage: 415.87 MB (3.3% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtyp

In [None]:
import autogluon.core as ag

nn_options = {  # specifies non-default hyperparameter values for neural network models
    'num_epochs': 10,  # number of training epochs (controls training time of NN models)
    'learning_rate': ag.space.Real(1e-4, 1e-2, default=5e-4, log=True),  # learning rate used in training (real-valued hyperparameter searched on log-scale)
    'activation': ag.space.Categorical('relu', 'softrelu', 'tanh'),  # activation function used in NN (categorical hyperparameter, default = first entry)
    'layers': ag.space.Categorical([100], [1000], [200, 100], [300, 200, 100]),  # each choice for categorical hyperparameter 'layers' corresponds to list of sizes for each NN layer to use
    'dropout_prob': ag.space.Real(0.0, 0.5, default=0.1),  # dropout probability (real-valued hyperparameter)
}

gbm_options = {  # specifies non-default hyperparameter values for lightGBM gradient boosted trees
    'num_boost_round': 100,  # number of boosting rounds (controls training time of GBM models)
    'num_leaves': ag.space.Int(lower=26, upper=66, default=36),  # number of leaves in trees (integer hyperparameter)
}

hyperparameters = {  # hyperparameters of each model type
                   'GBM': gbm_options,
                   'NN': nn_options,  # NOTE: comment this line out if you get errors on Mac OSX
                  }  # When these keys are missing from hyperparameters dict, no models of that type are trained
 
time_limit = 2*60  # train various models for ~2 min
num_trials = 5  # try at most 5 different hyperparameter configurations for each type of model
search_strategy = 'auto'  # to tune hyperparameters using Bayesian optimization routine with a local scheduler

hyperparameter_tune_kwargs = {  # HPO is not performed unless hyperparameter_tune_kwargs is specified
    'num_trials': num_trials,
    'scheduler' : 'local',
    'searcher': search_strategy,
}

predictor = TabularPredictor(label=label, eval_metric=metric).fit(
    Train_data, tuning_data=Val_data, time_limit=time_limit,
    hyperparameters=hyperparameters, hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20210518_023222/"
Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "AutogluonModels/ag-20210518_023222/"
AutoGluon Version:  0.2.0
Train Data Rows:    650000
Train Data Columns: 46
Tuning Data Rows:    150000
Tuning Data Columns: 46
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
NumExpr defaulting to 2 threads.
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12766.18 MB
	Train Data (Original)  Memory Usage: 511.84 MB (4.0% of available memory)
	Inferring dat

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

	Time limit exceeded
Hyperparameter tuning model: NeuralNetMXNet ...





HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

	Time limit exceeded
No base models to train on, skipping weighted ensemble...


ValueError: ignored

In [28]:
from autogluon.tabular import TabularDataset, TabularPredictor
predictor = TabularPredictor.load("/content/drive/Shareddrives/xucc1993.HK(CRN.NGO)/Forecast_of_Loan_Default/AutoML-AutoDL/agModels-predictClass/AutogluonModels/ag-20210518_023824/")
predictions = predictor.predict_proba(test_data)
print(predictions)

               0         1
0       0.727644  0.272356
1       0.619737  0.380263
2       0.454637  0.545363
3       0.604883  0.395117
4       0.568309  0.431691
...          ...       ...
199995  0.679267  0.320733
199996  0.737980  0.262020
199997  0.639640  0.360360
199998  0.657937  0.342063
199999  0.743625  0.256375

[200000 rows x 2 columns]


In [26]:
# predictions.iloc[:,1]
y_Val_data.shape

(150000,)

In [None]:
# list(predictions)
list(y_Val_data)
import sklearn
Score_acc = sklearn.metrics.roc_auc_score(predictions.iloc[:,1], y_Val_data)
print(Score_acc)

In [29]:
import pandas as pd
# 生成可提交的结果
result=pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/sample_submit.csv')
print(result)
result['isDefault'] = list(predictions.iloc[:,1])
print(result)
result.to_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Result_Submission/submit_0518.csv',index=False)

            id  isDefault
0       800000        0.5
1       800001        0.5
2       800002        0.5
3       800003        0.5
4       800004        0.5
...        ...        ...
199995  999995        0.5
199996  999996        0.5
199997  999997        0.5
199998  999998        0.5
199999  999999        0.5

[200000 rows x 2 columns]
            id  isDefault
0       800000   0.272356
1       800001   0.380263
2       800002   0.545363
3       800003   0.395117
4       800004   0.431691
...        ...        ...
199995  999995   0.320733
199996  999996   0.262020
199997  999997   0.360360
199998  999998   0.342063
199999  999999   0.256375

[200000 rows x 2 columns]


## auto-sklearn

In [None]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/train.csv')
testA = pd.read_csv('/content/drive/MyDrive/Other_Projects/Forecast_of_loan_default/Data/testA.csv')

print(data.head())
print(testA.head())

   id  loanAmnt  term  interestRate  installment  ...   n10  n11  n12  n13  n14
0   0   35000.0     5         19.52       917.97  ...   7.0  0.0  0.0  0.0  2.0
1   1   18000.0     5         18.49       461.90  ...  13.0  NaN  NaN  NaN  NaN
2   2   12000.0     5         16.99       298.17  ...  11.0  0.0  0.0  0.0  4.0
3   3   11000.0     3          7.26       340.96  ...   9.0  0.0  0.0  0.0  1.0
4   4    3000.0     3         12.99       101.07  ...  12.0  0.0  0.0  0.0  4.0

[5 rows x 47 columns]
       id  loanAmnt  term  interestRate  installment  ...   n10  n11  n12  n13  n14
0  800000   14000.0     3         10.99       458.28  ...  17.0  0.0  0.0  1.0  3.0
1  800001   20000.0     5         14.65       472.14  ...   5.0  0.0  0.0  2.0  2.0
2  800002   12000.0     3         19.99       445.91  ...  12.0  0.0  0.0  0.0  7.0
3  800003   17500.0     5         14.31       410.02  ...  10.0  0.0  0.0  0.0  3.0
4  800004   35000.0     3         17.09      1249.42  ...  19.0  0.0  0.0  0.

In [None]:
features_train = data.drop(columns='isDefault', axis=1)
labels_train = data['isDefault']
X_test = testA

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(features_train, labels_train, test_size=0.2, random_state=511)


In [None]:
# import autosklearn.classification
from autosklearn.experimental.askl2 import AutoSklearn2Classifier

cls = AutoSklearn2Classifier()
cls.fit(X_train, y_train)
predictions = cls.predict(X_val)

from sklearn.metrics import auc
# score_auc = auc(x, y)

IncorrectPackageVersionError: ignored

In [None]:
!pip show scikit-learn

Name: scikit-learn
Version: 0.24.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /usr/local/lib/python3.7/dist-packages
Requires: threadpoolctl, scipy, joblib, numpy
Required-by: yellowbrick, textgenrnn, smac, sklearn, sklearn-pandas, mlxtend, lightgbm, librosa, imbalanced-learn, fancyimpute, auto-sklearn


In [None]:
!pip3 install auto-sklearn

## autokeras

In [None]:
!pip3 show autokeras

Name: autokeras
Version: 1.0.12
Summary: AutoML for deep learning
Home-page: http://autokeras.com
Author: Data Analytics at Texas A&M (DATA) Lab, Keras Team
Author-email: jhfjhfj1@gmail.com
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: keras-tuner, scikit-learn, packaging, tensorflow, pandas
Required-by: 
