## 五分类

In [1]:
import numpy as np
import pandas as pd

In [2]:
# np.loadtxt(fname='companies.csv', delimiter=',', skiprows=1)
# 以上是使用numpy进行数据导入代码，但是数据实际使用前需要重新编制数据指标，即重新形成数据features（在深度学习框架中，即inputs
# 使用pandas导入数据形成DataFrame能够更方便处理数据
# 数据导入

companies_data = pd.read_excel(io='companies.xls', sheet_name=0)

In [3]:
companies_data.head(10)

Unnamed: 0,ASSET,LIBITY,CASSET,CLIBITY,CASH,SREVENUE,SPROFIT,NPROFIT,RECEIVAB,MAINCOST,INVENTRY,LOAN,INTEREST,RATING
0,15473360000.0,10599250000.0,6835456000.0,3197114000.0,2039102000.0,16418980000.0,4895354000.0,745486500.0,88736800.0,11523630000.0,2860091000.0,3636876000.0,185283400.0,1
1,56806120000.0,39707480000.0,20854140000.0,16683310000.0,12897890000.0,3511169000.0,1814890000.0,759829600.0,2265957000.0,1696280000.0,869261500.0,1561541000.0,77004090.0,1
2,8345341000.0,5833393000.0,804786900.0,643829500.0,767512700.0,2031876000.0,1407197000.0,870933600.0,0.0,624678400.0,215753500.0,462385300.0,22801550.0,1
3,7926957000.0,4439096000.0,1679505000.0,2362389000.0,417219400.0,1187910000.0,553308900.0,128427900.0,1136382000.0,634601500.0,26678390.0,2764960000.0,136348100.0,1
4,95964280000.0,65735530000.0,32481510000.0,30897050000.0,3929240000.0,21662340000.0,2408890000.0,914591300.0,2869013000.0,19253450000.0,6586292000.0,17658770000.0,899639500.0,1
5,63298630000.0,43359560000.0,8042116000.0,14612580000.0,1901062000.0,26831850000.0,5915800000.0,3175083000.0,3269928000.0,20916050000.0,1540905000.0,23628760000.0,1203785000.0,1
6,7873401000.0,5393280000.0,569967500.0,2341210000.0,559345500.0,891033200.0,578046000.0,355575400.0,0.0,312987200.0,26165370.0,1211009000.0,61695770.0,1
7,24324240000.0,16662100000.0,10417380000.0,7579921000.0,2737957000.0,17533130000.0,1162659000.0,766749200.0,53071260.0,16370470000.0,4213021000.0,4756771000.0,242337300.0,1
8,813772900.0,290952400.0,478344800.0,290797200.0,84184960.0,596397900.0,41631510.0,7056628.0,61360650.0,244226400.0,144226400.0,70919700.0,2873910.0,3
9,17076520000.0,11697420000.0,2196964000.0,1447596000.0,708840400.0,1102394000.0,696729100.0,577633700.0,1136930000.0,405664900.0,23847500.0,829270700.0,40893730.0,1


In [4]:
# 使用深度学习算法放入进行分类，在这里尝试不再将按照五级分类法分成的结果在进行违约、非违约级的划分

data = companies_data.drop(labels='RATING', axis=1)
labels = companies_data['RATING']

In [5]:
# 跳过探索性分析，直接进入指标编制阶段

features = pd.DataFrame(columns=['x%s' % i for i in range(1,19)])
features['x1'] = data['LIBITY'] / data['ASSET'] # 总负债/总资产
features['x2'] = data['SREVENUE'] / data['INTEREST'] # 销售收入/利息费用
features['x3'] = data['CASSET'] / data['CLIBITY'] # 流动资产/流动负债
features['x4'] = data['NPROFIT'] / (data['ASSET'] - data['LIBITY']) # 净利润/净资产
features['x5'] = data['SREVENUE'] / data['CASH'] # 销售收入/现金
features['x6'] = np.log(data['ASSET']) # 总资产的对数
features['x7'] = data['SREVENUE'] / data['ASSET'] # 销售收入/总资产
features['x8'] = data['SPROFIT'] / data['ASSET'] # 销售利润/总资产
features['x9'] = (data['SREVENUE'] - data['SPROFIT']) / data['SPROFIT'] # 销售成本/销售收入
features['x10'] = (data['RECEIVAB'] + data['INVENTRY']) / (data['ASSET'] - data['LIBITY']) # (应收账款+存货)/净资产
features['x11'] = data['INVENTRY'] / (data['ASSET'] - data['LIBITY']) # 存货/净资产
features['x12'] = data['SREVENUE'] / data['LIBITY'] # 销售收入/总负债
features['x13'] = data['CASSET'] / (data['ASSET'] - data['LIBITY']) # 流动资产/净资产
features['x14'] = data['SPROFIT'] / data['INTEREST'] # 销售利润/利息费用
features['x15'] = data['SREVENUE'] / data['CASSET'] # 销售收入/流动资产
features['x16'] = data['SREVENUE'] / (data['ASSET'] - data['LIBITY']) # 销售收入/净资产
features['x17'] = data['CASSET'] / data['INTEREST'] # 流动资产/利息费用
features['x18'] = data['MAINCOST'] / data['SREVENUE'] # 主营业务成本/销售收入

In [6]:
features.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18
0,0.685,88.615529,2.138008,0.152948,8.052067,23.462385,1.061113,0.316373,2.353993,0.604998,0.586793,1.54907,1.402402,26.420905,2.402032,3.368613,36.891905,0.701848
1,0.699,45.597179,1.25,0.044438,0.272228,24.76291,0.06181,0.031949,0.934646,0.183361,0.050838,0.088426,1.219637,23.568743,0.168368,0.205348,270.818555,0.48311
2,0.699,89.111301,1.25,0.346716,2.647351,22.844969,0.243474,0.168621,0.443917,0.085891,0.085891,0.348318,0.320384,61.714986,2.524738,0.808885,35.295269,0.307439
3,0.56,8.712333,0.710935,0.036821,2.847208,22.793535,0.149857,0.069801,1.146921,0.333459,0.007649,0.267602,0.481528,4.05806,0.707298,0.340584,12.317767,0.534217
4,0.685,24.078909,1.051282,0.030256,5.513112,25.287242,0.225733,0.025102,7.992666,0.312792,0.217882,0.329538,1.074524,2.677616,0.666913,0.716614,36.10503,0.888798


In [7]:
# 使用sklearn库进行训练集、测试集分割

from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.4, random_state=23)

In [8]:
# 预备工作完成，下面使用深度学习方法进行分类模型训练和测试
# 使用mxnet作为深度学习框架

from mxnet import gluon, init, nd, autograd
from mxnet.gluon import loss as gloss, data as gdata, nn

In [9]:
batch_size = 180

train_set = gdata.ArrayDataset(nd.array(train_features), nd.array(train_labels))
test_set = gdata.ArrayDataset(nd.array(test_features), nd.array(test_labels))
train_iter = gdata.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [10]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0., 0
    for X, y in data_iter:
        y_hat = net(X)
        y = y.astype('float32')
        acc_sum += (y_hat.argmax(axis=1) == y-1).sum().asscalar()
        n += y.size
    return acc_sum / n
    
def train_display(net, train_iter, test_iter, loss, num_epochs, batch_size, trainer, params=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0., 0., 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y-1).sum()
            l.backward()
            trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y-1).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net)
    print('epoch %d, loss %.4f, train acc %.3f, test ass %.3f' % (epoch+1, train_l_sum / n, train_acc_sum / n, test_acc))       

In [11]:
# 模型初始化

net = nn.Sequential()
net.add(
       nn.Dense(3, activation='relu'),  # 隐藏层
       nn.Dense(5))                       # 输出层
net.initialize(init.Normal(sigma=0.01))

In [12]:
# 损失函数

loss = gloss.SoftmaxCrossEntropyLoss()   # 使用softmax -> cross entrophy来构建损失函数
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate':0.000003})   # 使用梯度下降法进行反向传播
num_epochs = 5000
train_display(net=net, train_iter=train_iter, test_iter=test_iter, loss=loss, num_epochs=num_epochs, batch_size=batch_size, trainer=trainer)

epoch 5000, loss 1.4848, train acc 0.424, test ass 0.429


In [13]:
# 由于数据量太小，分类太多，结果并不好。使用机器学习算法效果更好。

## 两分类

In [14]:
# 根据银保监会贷款风险分级，RATING共分为5级，现将标记为1、2的企业认定为非违约级，3、4、5认定为违约级

labels_2 = pd.Series([0 if any([i ==1, i ==2]) else 1 for i in labels])

In [15]:
train_features_2, test_features_2, train_labels_2, test_labels_2 = train_test_split(features, labels_2, test_size=0.4, random_state=23)

In [16]:
train_set_2 = gdata.ArrayDataset(nd.array(train_features_2), nd.array(train_labels_2))
test_set_2 = gdata.ArrayDataset(nd.array(test_features_2), nd.array(test_labels_2))
train_iter_2 = gdata.DataLoader(train_set_2, batch_size=batch_size, shuffle=True)
test_iter_2 = gdata.DataLoader(test_set_2, batch_size=batch_size, shuffle=True)

In [17]:
def evaluate_accuracy_2(data_iter, net):
    acc_sum, n = 0., 0
    for X, y in data_iter:
        y_hat = net(X)
        y = y.astype('float32')
        acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
        n += y.size
    return acc_sum / n
    
def train_display_2(net, train_iter, test_iter, loss, num_epochs, batch_size, trainer, params=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0., 0., 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy_2(test_iter, net)
    print('epoch %d, loss %.4f, train acc %.3f, test ass %.3f' % (epoch+1, train_l_sum / n, train_acc_sum / n, test_acc))   

In [18]:
net_2 = nn.Sequential()
net_2.add(
       nn.Dense(4, activation='relu'),  # 隐藏层
       nn.Dense(5))                       # 输出层
net_2.initialize(init.Normal(sigma=0.01))

In [19]:
loss_2 = gloss.SoftmaxCrossEntropyLoss()   # 使用softmax -> cross entrophy来构建损失函数
trainer_2 = gluon.Trainer(net_2.collect_params(), 'sgd', {'learning_rate':0.0003})   # 使用梯度下降法进行反向传播
train_display_2(net=net_2, train_iter=train_iter_2, test_iter=test_iter_2, loss=loss_2, num_epochs=10000, batch_size=150, trainer=trainer_2)

epoch 10000, loss 0.2267, train acc 0.927, test ass 0.903


In [20]:
# Done

In [21]:
# 参数
# 隐藏层参数

d0 = net_2[0]
W0 = d0.weight.data()
b0 = d0.bias.data()

In [22]:
print(W0, b0)


[[-8.2120271e-03  4.5904938e-02  2.1774584e-02 -4.1202935e-03
   3.8836718e-02 -3.6159895e-02  2.7353175e-02  1.2638944e-02
   2.3613335e-01  1.5638592e-03  6.9922456e-03  5.9595194e-02
  -7.2111674e-03 -2.7258791e-02  2.0816064e-02 -4.1226018e-02
   3.6391951e-02  1.0786835e-03]
 [-2.4935812e-02  2.5448151e-02 -6.1645498e-03  9.7552240e-03
  -1.4685456e-01 -1.3761872e-03  7.2115827e-03  1.8842973e-02
  -5.9175643e-04  3.6368889e-03  1.2858353e-02  8.5489126e-03
   1.3014262e-02  1.2571070e-01  1.2007159e-02 -2.7220491e-02
  -4.2778142e-02  2.5454203e-03]
 [-1.4670915e-02  4.8617937e-02  3.1142283e-02 -2.4586940e-02
   7.0433840e-02 -5.1853709e-02  9.2192115e-03 -5.2540964e-03
   3.1222349e-01  2.3328101e-03  3.5978624e-04  6.8734080e-02
   3.2953839e-03 -7.4401774e-02  4.2218577e-02 -7.5513572e-02
   5.5032920e-02 -5.5257785e-03]
 [ 4.5469955e-02  5.8949865e-02 -2.8883962e-02  3.5347531e-03
   1.2122229e-02  4.7161183e-01  6.4882934e-03 -5.1686545e-03
  -1.5377499e-01  2.0562587e-03 

In [23]:
# 输出层

d1 = net_2[1]
W1 = d1.weight.data()
b1 = d1.bias.data()

In [24]:
print(W1, b1)


[[-0.07778171 -0.1129759  -0.09522577  0.4023443 ]
 [ 0.10366192  0.11382196  0.15748496  0.15435159]
 [-0.01133155 -0.00135357 -0.00852243 -0.17531666]
 [ 0.00869439  0.00549644  0.00360114 -0.18186778]
 [-0.00501886 -0.00505365 -0.00097281 -0.17796955]]
<NDArray 5x4 @cpu(0)> 
[ 0.0926943  -0.0214239  -0.02455131 -0.02287426 -0.02384446]
<NDArray 5 @cpu(0)>


In [25]:
net_2.save_parameters('./parameters')