# 五、神经网络算法模型 

## 1. 引用包

In [219]:
# Let's import some basic libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Pytorch专用包
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from sklearn.model_selection import cross_val_score

## 2. 数据处理 

In [220]:
### （1）读数据和简单筛选 ###

# 1. 从⽂文件中读取原始数据
file_spread = '../数据集/债明细-生数据集.xlsx'           ###
df=pd.read_excel(file_spread)

# 2. 按照指标分别删除0行
titles = ['ROE', 'ROA', 'ROS', 'ROIC', '总资产周转率','流动资产周转率','存货周转率','应收账款周转率','资产负债率','流动比率','速动比率','现金比率','利息保障倍数','销售收入增长率','净利润增长率','总资产增长率','经营活动现金流净额占比','投资活动现金流净额占比','筹资活动现金流净额占比','国内生产总值','居民消费价格指数增长率','一般预算收入','spread','scoring']
titles2 = ['ROE', 'ROA', 'ROS', 'ROIC', '总资产周转率','流动资产周转率','存货周转率','应收账款周转率','资产负债率','流动比率','速动比率','现金比率','利息保障倍数','销售收入增长率','净利润增长率','总资产增长率','经营活动现金流净额占比','投资活动现金流净额占比','筹资活动现金流净额占比','国内生产总值','居民消费价格指数增长率','一般预算收入','spread']
for t in titles:
    if t != '居民消费价格指数增长率':
        df = df[df[t]!=0]

# 3. 删除不必要
df.drop(['发行人','发行规模','债券余额','发行期限','票面利率(发行参考)','上市日期','起息日','到期日','发行人省份','城市','起息日期', '债券简称', '债券类型', '债券简称', '债券类型', 'Unnamed: 38', '年度标识', '城市名称', '城市代码', '城市类别', '省份名称','国内生产总值（亿元）',
       '居民消费价格指数（上年＝100）', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 
       'Unnamed: 49', 'Unnamed: 50', 'spread平均值', 'Unnamed: 52', 'Unnamed: 53','Unnamed: 54','Platform Rating'],axis=1,inplace=True)

# 4. df乱序
df = df.reindex(np.random.permutation(df.index))
#df_ori = df  # 备份原始数据

In [221]:
# 5. 连续值Scaling
# 对连续变量进行缩放，即以零均值和标准偏差为1来标准化它们。这样我们就可以节省各种平均值和标准偏差，以便以后转换回来。
scaled_features = {}
for t in titles:
    mean, std = df[t].mean(), df[t].std()
    scaled_features[t] = [mean, std]
    df.loc[:, t] = (df[t] - mean)/std

print(scaled_features)

{'ROE': [2.146552662473788, 7.3336990077381605], 'ROA': [1.9186391334731043, 1.7969391942590844], 'ROS': [19.874540153738668, 18.588974838499702], 'ROIC': [1.897528134171907, 2.0545044911615005], '总资产周转率': [0.1182046680642905, 0.163816988668333], '流动资产周转率': [0.31141119496855324, 0.4486891511443836], '存货周转率': [23.409254325646323, 405.74917304064456], '应收账款周转率': [72.22729818308873, 1454.5876974870646], '资产负债率': [1.991765259818022, 1.944187100608632], '流动比率': [2.972594493361288, 2.6345328034135544], '速动比率': [1.3909536967155853, 1.0646160189941907], '现金比率': [0.4246323950518437, 0.35051438565864496], '利息保障倍数': [22.371953039832235, 338.8448301300983], '销售收入增长率': [81.96843127882543, 2935.731388163599], '净利润增长率': [36.365267742497885, 262.63413167058604], '总资产增长率': [12.271396242568613, 16.189351689966376], '经营活动现金流净额占比': [-299.35768549413166, 19021.0482256977], '投资活动现金流净额占比': [-205.71987177665002, 7107.248287720758], '筹资活动现金流净额占比': [605.0775572707787, 22156.73583971834], '国内生产总值': [9204.2786573

## 3. 数据集准备 

In [222]:
# x
factors = df[titles[:23]].values.tolist()
x = pd.DataFrame(factors)

# y
scoring = df[titles[23]].values.tolist()
y = pd.DataFrame(scoring)

# redirection dataset
re_features = x
re_targets = y

# split <train, test>
train_features, x_test, train_targets, y_test = train_test_split(x, y, test_size=0.11)

# 将test set再划分
validation_features, test_features, validation_targets, test_targets = train_test_split(x_test, y_test, test_size=0.25)

In [223]:
l_rate = 0.2
mse_loss = nn.MSELoss(reduction = 'mean')

In [224]:
df.columns
print(df[titles[23]])

158    -0.689268
6714    1.649515
8725    0.090326
1059    0.090326
5540   -0.689268
          ...   
2866   -0.689268
3584    0.090326
6436    0.090326
8589   -0.689268
2416   -0.689268
Name: scoring, Length: 7155, dtype: float64


In [225]:
class Regression(pl.LightningModule):
    
### The Model ### 

    # Question: what will your model architecture look like?
    # Initialize the layers
    # Here we have one input layer (size 23 as we have 23 features), one hidden layer (size 10), 
    # and one output layer (size 1 as we are predicting a single value)
    def __init__(self):
        super(Regression, self).__init__()
        self.fc1 = nn.Linear(23,10)
        self.fc2 = nn.Linear(10, 4)
        self.fc3 = nn.Linear(4, 2)
        self.fc4 = nn.Linear(2, 1)
    
    # Question: how should the forward pass be performed, and what will its ouputs be?
    # Perform the forward pass
    # We're using the sigmoid activation function on our hidden layer, but our output layer has no activation 
    # function as we're predicting a continuous variable so we want the actual number predicted
    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        x = self.fc4(x)
        return x

### The Data Loaders ###     
    
    # Define functions for data loading: train / validate / test / re-direction
    def train_dataloader(self):
        train_dataset = TensorDataset(torch.tensor(train_features.values).float(), torch.tensor(train_targets.values).float())
        train_loader = DataLoader(dataset = train_dataset, batch_size = 128)
        return train_loader
        
    def val_dataloader(self):
        validation_dataset = TensorDataset(torch.tensor(validation_features.values).float(), torch.tensor(validation_targets.values).float())
        validation_loader = DataLoader(dataset = validation_dataset, batch_size = 128)
        return validation_loader
    
    ### def test_dataloader(self):
    ### test_dataset = TensorDataset(torch.tensor(test_features.values).float(), torch.tensor(test_targets.values).float())
    ### test_loader = DataLoader(dataset = test_dataset, batch_size = 128)
    ### return test_loader
    
    def test_dataloader(self):
        re_dataset = TensorDataset(torch.tensor(re_features.values).float(), torch.tensor(re_targets.values).float())
        re_loader = DataLoader(dataset = re_dataset, batch_size = 128)
        return re_loader

### The Optimizer ### 

    # Question: what optimizer will I use?
    # Define optimizer function: here we are using Stochastic Gradient Descent
    def configure_optimizers(self):
        return optim.SGD(self.parameters(), lr=l_rate)

### Training ### 

    # Define training step
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = mse_loss(logits, y)
        # Add logging
        logs = {'loss': loss}
        return {'loss': loss, 'log': logs}

### Validation ### 
    
    # Define validation step
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = mse_loss(logits, y)
        return {'val_loss': loss}

    # Define validation epoch end
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

### Testing ###     

    # Define test step
    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = mse_loss(logits, y)
        correct = torch.sum(logits == y.data)
        
        # I want to visualize my predictions vs my actuals so here I'm going to add these lines to extract the data for plotting later on
        predictions_pred.append(logits)
        predictions_actual.append(y.data)
        return {'test_loss': loss, 'test_correct': correct, 'logits': logits}
    
    # Define test end
    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        logs = {'test_loss': avg_loss}      
        return {'avg_test_loss': avg_loss, 'log': logs, 'progress_bar': logs }
        

In [226]:
model = Regression()
trainer = Trainer(max_epochs = 50)  # Option to check if it's working at all: fast_dev_run=True 
                                    # Option to implement early stopping: early_stop_callback=True
trainer.fit(model)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores

  | Name | Type   | Params
--------------------------------
0 | fc1  | Linear | 240   
1 | fc2  | Linear | 44    
2 | fc3  | Linear | 10    
3 | fc4  | Linear | 3     
--------------------------------
297       Trainable params
0         Non-trainable params
297       Total params
0.001     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Please use self.log(...) inside the lightningModule instead.
# log on a step or aggregate epoch metric to the logger and/or progress bar (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [227]:
# 1. 查看我创建的神经网络结构
model.eval()

# 2. 两个list分别接收测试的【预测值】和【实际值】
predictions_pred = []
predictions_actual = []
predictions_pred2 = []
predictions_actual2 = []

# 3. 查看损失
trainer.test()



Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'avg_test_loss': 0.47027716040611267, 'test_loss': 0.47027716040611267}
--------------------------------------------------------------------------------


Please use self.log(...) inside the lightningModule instead.
# log on a step or aggregate epoch metric to the logger and/or progress bar (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


[{'test_loss': 0.47027716040611267, 'avg_test_loss': 0.47027716040611267}]

In [228]:
# 4. 验证准确度
# 将scoring测试数据转变回正常的纬度
mean = scaled_features['scoring'][0]
std = scaled_features['scoring'][1]

# Get predicted points (scaled back to their original size)
plot_pred = []
for i in range(len(predictions_pred)):
    plot_pred.extend(predictions_pred[i].T.numpy()[0] * std + mean)

# Get actual points (scaled back to their original size)
plot_actual = []
for i in range(len(predictions_actual)):
    plot_actual.extend(predictions_actual[i].T.numpy()[0] * std + mean)

# (1)对预测结果四舍五入，再与真实值进行比较
# (2)直接与真实值比较
correct = 0
correct2 = 0
for k in range(len(plot_pred)):
    if (round(plot_pred[k]) == plot_actual[k]) or (round(plot_pred[k]) == plot_actual[k]+1) or (round(plot_pred[k]) == plot_actual[k]-1):
        correct += 1
    if (round(plot_pred[k]) == plot_actual[k]):
        correct2 += 1
acc = correct / len(plot_pred)
acc2 = correct2 / len(plot_pred)

print("模糊准确率: ", acc)
print("精确准确率: ", acc2)

模糊准确率:  0.906638714185884
精确准确率:  0.46820405310971347


# 4. 层级反射

In [229]:
# 4. 验证准确度222
# 将scoring测试数据转变回正常的纬度
mean = scaled_features['scoring'][0]
std = scaled_features['scoring'][1]

# Get predicted points (scaled back to their original size)
plot_pred = []
for i in range(len(predictions_pred)):
    plot_pred.extend(predictions_pred[i].T.numpy()[0] * std + mean)

# Get actual points (scaled back to their original size)
plot_actual = []
for i in range(len(predictions_actual)):
    plot_actual.extend(predictions_actual[i].T.numpy()[0] * std + mean)

# 是预测结果的一级分类
first_layer = []
# 对预测结果四舍五入，再与真实值进行比较
correct = 0
for k in range(len(plot_pred)):
    if (round(plot_pred[k]) == plot_actual[k]) or (round(plot_pred[k]) == plot_actual[k]+1) or (round(plot_pred[k]) == plot_actual[k]-1):
        correct += 1
    if (round(plot_pred[k]) == 22) or (round(plot_pred[k]) == 21):
        first_layer.append(1)
    if (round(plot_pred[k]) == 20) or (round(plot_pred[k]) == 19) or (round(plot_pred[k]) == 18):
        first_layer.append(2)
    if (round(plot_pred[k]) == 17) or (round(plot_pred[k]) == 16) or (round(plot_pred[k]) == 15):
        first_layer.append(3)
    
acc = correct / len(plot_pred)
print(acc)

#print(plot_pred)
#print(first_layer)

0.906638714185884


In [230]:
x['result'] = df_ori[titles[23]].values.tolist()
x['first_layer'] = first_layer
print(x)

             0         1         2         3         4         5         6  \
0     0.336903  0.998120  1.629055  0.940505 -0.268621 -0.517755 -0.057535   
1    -0.190484  0.490924  0.097195  0.130334  0.180051  1.501460 -0.050065   
2    -0.267730 -0.220063  3.540112 -0.748126 -0.567735 -0.513075 -0.057657   
3     0.019233 -0.466370 -0.230612 -0.436761  0.104356 -0.301793 -0.057181   
4    -0.056595 -0.635714 -0.292746 -0.490205 -0.488378 -0.514858 -0.056966   
...        ...       ...       ...       ...       ...       ...       ...   
7150 -0.106925 -0.323182 -0.171771 -0.172561 -0.379720 -0.568124 -0.057534   
7151 -0.232564 -0.857981 -0.721102 -0.777525 -0.377279 -0.405428 -0.056433   
7152  0.586927  1.532083  2.110959  0.831476 -0.323560 -0.393616 -0.057505   
7153  0.035187 -0.193184  0.233206 -0.310064 -0.401086 -0.563890 -0.057526   
7154  0.063262 -0.440103 -0.546321 -0.287236 -0.502418 -0.610470 -0.057569   

             7         8         9  ...        15        16    

In [231]:
AAA = x.query('first_layer == 1')
AA = x.query('first_layer == 2')
A = x.query('first_layer == 3')
print( AA['result'])

0      -1.468863
2       1.649515
3      -0.689268
4      -0.689268
5       0.090326
          ...   
7150    0.090326
7151   -0.689268
7152   -0.689268
7153    1.649515
7154   -0.689268
Name: result, Length: 6766, dtype: float64


In [232]:
# 2. 分割数据集，一部分测试一部分训练，数据集的分割，分为训练集和测试集
tmp1 = AA[range(23)].values.tolist()
tmp2 = AA['result'].values.tolist()
x_train, x_test, y_train, y_test = train_test_split(pd.DataFrame(tmp1), pd.DataFrame(tmp2), test_size=0.25)
dict = DictVectorizer(sparse=False)                             # 进行特征工程的处理，将数据集变换成one-hot形式
x_train = dict.fit_transform(x_train.to_dict(orient="records")) # 将列表转换为一个个字典形式，因为DictVectorizer(）中接收的是字典形式
print("feature name: ",dict.get_feature_names())
x_test = dict.transform(x_test.to_dict(orient="records"))
tre=DecisionTreeClassifier()  # 用决策树预测
tre.fit(x_train.astype('int'),y_train.astype('int'))

# 3. 输出准确度
#cross_val_score(tre, x_test, y_test.astype('int'), cv=10)
tre.score(x_test, y_test.astype('int'))


feature name:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


0.6052009456264775