# 模型建立

## 介绍：思路解析

在本文中，我们收集了2013-2017年间新股发行的数据如公司上市前资产负债及经营情况、上市首日的交易情况等，再对数据进行了一系列的预处理，得到了完整的数据集。经过统计描述的分析，发现我国股市的抑价现象严重，近几年间几乎不存在无抑价现象的新股。因此将抑价率高于均值的新股标记为1，低于均值的新股标记为0，得到了新的二分类变量并作为了目标变量，研究哪些因素会导致新股发行的抑价率明显高于一般水平及各影响因素的重要性。本文中使用了多种机器学习的模型，在表现效果较好的模型中，大部分为决策树及决策树改进的模型。

本文中大部分的数据来自于CSMAR数据库，上证指数来自于tushare财经数据库
- Stkcd：股票代码
- Total assets：总资产（上市前最后一个会计统计期间）
- Total equity：总负债（上市前最后一个会计统计期间）
- Sales：总营业收入（上市前最后一个会计统计期间）
- Operating profit：营业利润（上市前最后一个会计统计期间）
- Net income：净利润（上市前最后一个会计统计期间）
- Cash flow from operations：现金流量（上市前最后一个会计统计期间）
- date：上市日期
- One day excess return：上市首日日回报率
- offer price：股票发行价格
- Sales method：股票的发行方式：
- Underwriting method：股票承销方式
- number of shares sold：股票销售数量
- inception date：公司上市距成立的日期
- US dollar buying rate：汇率

## 导入数据

In [1]:
#pandas和numpy用于数据的处理
import pandas as pd
import numpy as np
import re

#matplotlib用于数据的可视化
import matplotlib.pyplot as plt 
import matplotlib
%matplotlib inline

#忽略pandsa的警告
import warnings
warnings.filterwarnings('ignore')

#为了画图更好看
import seaborn as sns  
sns.set( palette="muted", color_codes=True)  

In [2]:
#导入已经集成好的数据
ipo=pd.read_csv("IPO.csv")

In [3]:
#查看数据
ipo.head()

Unnamed: 0,Stkcd,Total assets,Total debts,Cash flow from operations,Sales,Operating profit,Net income,One day excess return,Offer price,number of shares sold,ROA1,ROA2,Total assets turnover rate,Debt ratio,Firm age,Issue proceeds(USD),year,Stock market sentiment,Sales method
0,2705,3390104000.0,2068954000.0,559246300.0,5041712000.0,244715700.0,192272600.0,0.467994,10.5,7600.0,0.056716,0.072185,1.487185,0.389708,18.126027,13067.827823,2014,-0.022301,1
1,2706,612545400.0,235609500.0,69924910.0,683849200.0,87963540.0,81056810.0,0.425468,19.099998,2154.0,0.132328,0.143603,1.116406,0.61536,7.293151,6737.201547,2014,0.009827,1
2,2708,784532800.0,283159900.0,32878240.0,580556100.0,61945540.0,53170300.0,0.420506,11.88,3320.0,0.067773,0.078959,0.740002,0.639072,2.986301,6458.847594,2014,0.009827,1
3,300357,263521500.0,12239680.0,55586910.0,193609800.0,81635140.0,69910010.0,0.469495,20.050005,2525.0,0.265291,0.309785,0.734702,0.953553,11.347945,8290.417208,2014,-0.022301,1
4,300358,1076322000.0,620864300.0,138893800.0,797205700.0,140028000.0,135133500.0,0.446298,40.0,1824.9813,0.125551,0.130099,0.740676,0.423161,11.210959,11954.156486,2014,-0.022301,1


## 对目标变量进行离散化处理

In [4]:
#得到ipo抑价率的均值
mean=ipo['One day excess return'].mean()

#建立新的离散型的目标变量，抑价率大于均值的新股被标记为1，反之被标记为0
ipo['flag']=0
ipo['flag'][ipo['One day excess return']>0.4247]=1

In [5]:
#对目标变量的两种类型的数量进行统计
ipo['flag'].value_counts()

1    508
0    497
Name: flag, dtype: int64

In [6]:
#查看不同类别下的首日日回报率的均值
ipo.groupby(['flag'])['One day excess return'].mean()

flag
0    0.381081
1    0.466876
Name: One day excess return, dtype: float64

## 建模

In [7]:
#导入所需要的模型
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier  
from sklearn.ensemble import GradientBoostingClassifier  
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression  

#用于数据集的拆分
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

#用于模型的评价
from sklearn import metrics
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report 

### 方法

#### 得到混淆矩阵

In [8]:
def my_confusion_matrix(y_true, y_pred):
    #得到混淆矩阵
    #对验证集的结果进行评价，获得混淆矩阵的数据
    #参数：y_true验证集中目标变量的真实取值
    #      y_pred验证集中对目标变量的预测值
    #注：y_true和y_pred均为0、1二值序列
    #返回：TP,FN,FP,TN
    
        
    labels = list(set(y_true))  
    conf_mat = confusion_matrix(y_true, y_pred, labels = labels)
    return conf_mat[0][0],conf_mat[0][1],conf_mat[1][0],conf_mat[1][1]
  

#### 得到模型在验证集上的评估指标的数据

In [9]:
def getreport(test_y,y_pred):
    #得到模型在验证集上的评估指标的数据
    #得到准确率、召回率和f值
    #参数：y_true验证集中目标变量的真实取值
    #      y_pred验证集中对目标变量的预测值
    #注：y_true和y_pred均为0、1二值序列：
    #返回：precision、recall、f1
    
    
    report=classification_report(test_y, y_pred)
    report_list=re.findall("(.*?)\n",report.replace("\n\n","\n"))
    title=report_list[0].split()
    title.insert(0,"label")
    avg_total=report_list[3].split()[3:-1]
    return avg_total

#### 模型训练评估

In [10]:
def model_kflod(clf,X,y): 
    #十折交叉验证，进行模型的训练和预测，最终输出模型的各类评价指标，体现模型综合的性能
    #参数：clf：模型实例
    #      X：总的样本（不加上标签的）
    #      y:样本的标签
    #返回：'TP','TN','FP','FN','Accuracy','AUC','Precision','Recall','f1-score'系列值在十次交叉验证中的均值
    
    
    Report=[]
    kf = KFold(n_splits=10, random_state=41523033)
    train_test= kf.split(X,y)
    for train_index, test_index in train_test:
        train_x, test_x = X.loc[train_index], X.loc[test_index]
        train_y, test_y = y[train_index], y[test_index]        
        clf.fit(train_x, train_y)
        y_pred = clf.predict(test_x)
        test_auc =metrics.roc_auc_score(test_y,y_pred)
        TP,FN,FP,TN=my_confusion_matrix(test_y,y_pred)
        report=getreport(test_y,y_pred)
        Report.append([TP/(TP+FN),TN/(TN+FP),FP/(FP + TN),FN/(TP + FN),(TP+TN)/(TP+FN+FP+TN),test_auc]+report)
        
    REPORT=pd.DataFrame(Report,columns=['TP','TN','FP','FN','Accuracy','AUC','Precision','Recall','f1-score'])
    REPORT['Precision']=pd.to_numeric(REPORT['Precision'])
    REPORT['Recall']=pd.to_numeric(REPORT['Recall'])
    REPORT['f1-score']=pd.to_numeric(REPORT['f1-score'])
    return REPORT.mean()

#### 得到特征重要性

In [11]:
def model_feather(clf,X,y):
    #对于筛选出的模型，计算特征的重要性
    # 对于筛选出的模型，再次进行十折交叉验证，得到十次特征的重要性取值，并返回均值。
    
    Report=[]
    kf = KFold(n_splits=10, random_state=41523033)
    train_test= kf.split(X,y)
    for train_index, test_index in train_test:
        train_x, test_x = X.loc[train_index], X.loc[test_index]
        train_y, test_y = y[train_index], y[test_index]        
        clf.fit(train_x, train_y)
        Report.append(clf.feature_importances_)
    REPORT=pd.DataFrame(Report,columns=X.columns)
    return REPORT.mean()

### 模型

In [12]:
#数据准备，目标变量和解释变量
data=ipo.copy()
X=data.drop(['flag','One day excess return','Stkcd','year'],axis=1)
X.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))) 
y=data['flag']

####  决策树

In [13]:
clf1 = DecisionTreeClassifier(criterion='gini')

In [14]:
# 模型的综合评价
Report=model_kflod(clf1,X,y)
print(Report)

TP           0.663076
TN           0.701319
FP           0.298681
FN           0.336924
Accuracy     0.702436
AUC          0.682198
Precision    0.734000
Recall       0.701000
f1-score     0.713000
dtype: float64


In [15]:
# 特征重要性
Report1=model_feather(clf1,X,y)
print(Report1)

Total assets                  0.019358
Total debts                   0.020331
Cash flow from operations     0.039304
Sales                         0.029166
Operating profit              0.026326
Net income                    0.023330
Offer price                   0.039371
number of shares sold         0.045862
ROA1                          0.025657
ROA2                          0.030440
Total assets turnover rate    0.046664
Debt ratio                    0.034596
Firm age                      0.066830
Issue proceeds(USD)           0.025089
Stock market sentiment        0.526223
Sales method                  0.001454
dtype: float64


#### Lightgbm

In [16]:
# 十折交叉验证，这里训练集和交叉验证集比例为7：3
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1)

# 准备好适用于lgb模型的训练集和验证集
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
#设定参数
params = {
 'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'num_leaves': 41,
'num_threads':40,
'learning_rate': 0.1,
 'max_depth':6,
'verbose': 0
}

#模型训练
gbm = lgb.train(params,lgb_train,num_boost_round=10,valid_sets=lgb_eval, early_stopping_rounds=20)

#对验证集进行预测
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)  # 输出的是概率结果

[1]	valid_0's auc: 0.82607
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.828944
[3]	valid_0's auc: 0.830437
[4]	valid_0's auc: 0.835352
[5]	valid_0's auc: 0.835506
[6]	valid_0's auc: 0.835901
[7]	valid_0's auc: 0.839302
[8]	valid_0's auc: 0.835199
[9]	valid_0's auc: 0.83713
[10]	valid_0's auc: 0.836164
Did not meet early stopping. Best iteration is:
[7]	valid_0's auc: 0.839302


In [17]:
#模型的总体评价指标
y_pred1=pd.Series(preds).apply(lambda x:(x>=0.5)*1)
test_auc =metrics.roc_auc_score(y_test,y_pred1)
TP,FN,FP,TN=my_confusion_matrix(y_test,y_pred1)
report=getreport(y_test,y_pred1)
Report=([TP/(TP+FN),TN/(TN+FP),FP/(FP + TN),FN/(TP + FN),(TP+TN)/(TP+FN+FP+TN),test_auc]+report)
print(Report)

[0.6802721088435374, 0.896774193548387, 0.1032258064516129, 0.3197278911564626, 0.7913907284768212, 0.7885231511959623, '0.80', '0.79', '0.79']


In [18]:
#导出特征重要性
importance = gbm.feature_importance()
names = gbm.feature_name()
for index, im in enumerate(importance):
    feature = names[index] + ', ' + str(im) 
    print(feature)

Total_assets, 11
Total_debts, 6
Cash_flow_from_operations, 3
Sales, 3
Operating_profit, 5
Net_income, 6
Offer_price, 20
number_of_shares_sold, 4
ROA1, 1
ROA2, 0
Total_assets_turnover_rate, 10
Debt_ratio, 5
Firm_age, 13
Issue_proceeds(USD), 6
Stock_market_sentiment, 72
Sales_method, 0


#### GBDT

In [19]:
gbdt = GradientBoostingClassifier(  
    init=None,  
    learning_rate=0.01,  
    loss='deviance',  
    max_depth=6,  
    max_features=None,  
    max_leaf_nodes=None,  
    min_samples_leaf=1,  
    min_samples_split=2,  
    min_weight_fraction_leaf=0.0,  
    n_estimators=100,  
        random_state=None,  
    verbose=0,  
    warm_start=False)  

In [20]:
#模型的总体评价指标
Report=model_kflod(gbdt,X,y)
Report

TP           0.615439
TN           0.833385
FP           0.166615
FN           0.384561
Accuracy     0.749228
AUC          0.724412
Precision    0.775000
Recall       0.748000
f1-score     0.751000
dtype: float64

In [21]:
#特征的重要性
Report1=model_feather(gbdt,X,y)
Report1

Total assets                  0.010315
Total debts                   0.012524
Cash flow from operations     0.019966
Sales                         0.021060
Operating profit              0.013743
Net income                    0.017759
Offer price                   0.028029
number of shares sold         0.031858
ROA1                          0.016333
ROA2                          0.012326
Total assets turnover rate    0.034969
Debt ratio                    0.017089
Firm age                      0.041015
Issue proceeds(USD)           0.011081
Stock market sentiment        0.710458
Sales method                  0.001475
dtype: float64

#### 随机森林

In [22]:
clf = RandomForestClassifier(n_estimators=100,oob_score=True,criterion='gini')

In [23]:
#模型总体评价指标
Report=model_kflod(clf,X,y)
Report

TP           0.639249
TN           0.783783
FP           0.216217
FN           0.360751
Accuracy     0.722455
AUC          0.711516
Precision    0.761000
Recall       0.721000
f1-score     0.730000
dtype: float64

In [24]:
#特征的重要性
Report1=model_feather(clf,X,y)
Report1

Total assets                  0.042758
Total debts                   0.044920
Cash flow from operations     0.047608
Sales                         0.045517
Operating profit              0.041330
Net income                    0.041032
Offer price                   0.053496
number of shares sold         0.051824
ROA1                          0.045737
ROA2                          0.044173
Total assets turnover rate    0.049939
Debt ratio                    0.050949
Firm age                      0.063419
Issue proceeds(USD)           0.046455
Stock market sentiment        0.319748
Sales method                  0.011094
dtype: float64

#### 逻辑回归模型

In [25]:
clf = LogisticRegression()

In [26]:
#模型总体评价指标
Report=model_kflod(clf1,X,y)
Report

TP           0.643978
TN           0.691898
FP           0.308102
FN           0.356022
Accuracy     0.688446
AUC          0.667938
Precision    0.719000
Recall       0.687000
f1-score     0.699000
dtype: float64

#### 支持向量机

In [27]:
data1=ipo.copy()
data1['flag1']=-1
data1['flag1'][data1['One day excess return']>0.4247]=1
X=data1.drop(['flag','One day excess return','Stkcd','year','flag1'],axis=1)
y=data1['flag1']

In [28]:
clf = SVC(kernel='rbf')

In [29]:
#模型总体评价指标
Report=model_kflod(clf1,X,y)
Report

TP           0.678914
TN           0.677477
FP           0.322523
FN           0.321086
Accuracy     0.699416
AUC          0.678196
Precision    0.729000
Recall       0.698000
f1-score     0.709000
dtype: float64

In [30]:
#模型特征重要性
Report=model_feather(clf1,X,y)
Report

Total assets                  0.021722
Total debts                   0.017663
Cash flow from operations     0.035126
Sales                         0.035573
Operating profit              0.020506
Net income                    0.022244
Offer price                   0.039633
number of shares sold         0.050877
ROA1                          0.024990
ROA2                          0.027658
Total assets turnover rate    0.047068
Debt ratio                    0.028441
Firm age                      0.061599
Issue proceeds(USD)           0.031953
Stock market sentiment        0.533799
Sales method                  0.001147
dtype: float64