In [0]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import warnings
import json


import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("fivethirtyeight")
sns.set_style({'font.sans-serif': ['simsun', 'Arial']})
sns.set_style('darkgrid', {'font.sans-serif': ['simhei', 'Arial']})
%matplotlib inline


warnings.filterwarnings('ignore')


nf_data_path = r'./noFinacialFeatures.csv'
f_data_path = r'./FinacialFeatures.csv'
flevel = json.load(open(r'./feature_level.json'))

In [283]:
nf_df = pd.read_csv(nf_data_path)
f_df = pd.read_csv(f_data_path)
df = pd.DataFrame()
df = nf_df.copy()
f_df.drop(['企业总评分'], axis=1, inplace=True)
df = df.merge(f_df, how='left', on='企业编号')
# first_level = flevel['first_level']
# first_level.append('企业编号')
# first_level.append('企业总评分')
# df = df[first_level]
df.head(n=2)

Unnamed: 0,企业编号,企业总评分,软著数量,作品著作数量,项目数量,纳税A级年份_2014,纳税A级年份_2015,纳税A级年份_2016,纳税A级年份_2017,资质证书数量,...,应收账款周转天数(天)_mean,应收账款周转天数(天)_max,应收账款周转天数(天)_min,应收账款周转天数(天)_std,应收账款周转天数(天)滚动增长_mean,存货周转天数(天)_mean,存货周转天数(天)_max,存货周转天数(天)_min,存货周转天数(天)_std,存货周转天数(天)滚动增长_mean
0,1001,75.374276,1.0,1.0,1.0,1.0,2.0,1.0,1.0,9.0,...,107.58927,191.707773,63.791689,44.495607,0.151392,414.778035,1089.655763,176.283983,325.371499,1.562757
1,1002,79.830122,2.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,...,46.903333,56.59,39.83,6.234116,0.023916,6.506667,7.04,5.01,0.702335,0.04533


In [0]:
y = df[['企业编号', '企业总评分']]
x = df.drop(['企业总评分'], axis=1)
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=0)
ytrain_id = ytrain['企业编号']
ytrain = ytrain['企业总评分']
ytest_id = ytest['企业编号']
ytest = ytest['企业总评分']


id_train = xtrain['企业编号']
id_test = xtest['企业编号']
xtrain.drop(['企业编号'], axis=1, inplace=True)
xtest.drop(['企业编号'], axis=1, inplace=True)

In [285]:
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape


((2364, 306), (592, 306), (2364,), (592,))

In [286]:
print(xtrain.isnull().values.any())
print(ytrain.isnull().values.any())
print(xtest.isnull().values.any())
print(ytest.isnull().values.any())


False
False
False
False


# normal xgboost

In [0]:
xgb = XGBRegressor(n_estimators=150, learning_rate=0.05, gamma=0.6, subsample=0.9,
                           colsample_bytree=0.7, max_depth=10)

In [277]:
traindf, testdf = train_test_split(xtrain, test_size = 0.25)
xgb.fit(xtrain,ytrain)
pred = xgb.predict(xtest)
np.sqrt(mse(pred.astype(int), ytest.astype(int)))

3.144869266392914

# CV xgboost

In [0]:
nfolds = 10
folds = KFold(n_splits=nfolds, random_state=3228, shuffle=True)


In [289]:
feature_importance_df = np.zeros((xtrain.shape[1], nfolds))
mvalid = np.zeros(len(xtrain))
mfull = np.zeros(len(xtest))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(xtrain.values, ytrain.values)):
    print('----')
    print("fold n°{}".format(fold_))
    
    x0, y0 = xtrain.iloc[trn_idx], ytrain.iloc[trn_idx]
    x1, y1 = xtrain.iloc[val_idx], ytrain.iloc[val_idx]
    
    eval_s = [(x0, y0),(x1, y1)]
    model = XGBRegressor(n_estimators=150, 
                         #Booster Parameters
                         gamma=0.6, #minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
                         subsample=0.9,#用于训练模型的子样本占整个样本集合的比例，越小越不容易过拟合
                         colsample_bytree=0.7, #列采样率
                         max_depth=10,
                         min_child_weight=2,#bigger lead to the more conservative model
                         #Task Parameters
                         learning_rate=0.05, 
                         seed=0,#random number seed.
                         reg_alpha=0.05, #L1 加快收敛
                         reg_lambda=0.05,#L2 减少欠拟合
                         silent=0,
                         n_jobs=4,
               )
    
#     model.fit(x0, y0, eval_set=eval_s)
    model.fit(x0, y0)
    val_pred = model.predict(x1)
    
#     print(model.evals_result())
#     print("Training's rmse:", model.evals_result())
    print("validation's rmse:", np.sqrt(mse(val_pred.astype(int), y1.astype(int))))
    
    mvalid[val_idx] = val_pred
    feature_importance_df[:, fold_] = model.feature_importances_
    mfull += model.predict(xtest) / folds.n_splits

----
fold n°0
validation's rmse: 3.008426982891957
----
fold n°1
validation's rmse: 3.0119312671118355
----
fold n°2
validation's rmse: 3.583247472232051
----
fold n°3
validation's rmse: 3.1462253856351725
----
fold n°4
validation's rmse: 3.458592836223955
----
fold n°5
validation's rmse: 3.3830334397160913
----
fold n°6
validation's rmse: 3.0952396473579022
----
fold n°7
validation's rmse: 3.131307194409822
----
fold n°8
validation's rmse: 3.3185406226844325
----
fold n°9
validation's rmse: 3.058746835452361


In [290]:
np.sqrt(mse(mfull.astype(int), ytest.astype(int)))

3.0022514074463373

# 调参

# n_estimators

In [194]:
if __name__ == '__main__':
    cv_params = {'n_estimators': [130, 140, 150, 160, 200]}
    other_params = {'learning_rate': 0.08, 'n_estimators': 110, 'max_depth': 9, 'min_child_weight': 3, 'seed': 0,
                    'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.6, 'reg_alpha': 0.05, 'reg_lambda': 0.05}
#     other_params = {'learning_rate': 0.07, 'n_estimators': 110, 'max_depth': 9, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.7, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0, 'reg_lambda': 1}
    model = XGBRegressor(**other_params)
    optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
    optimized_GBM.fit(x0, y0)
    evalute_result = optimized_GBM.cv_results_
    print('每轮迭代运行结果:{0}'.format(evalute_result))
    print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
    print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:  2.3min finished


每轮迭代运行结果:{'mean_fit_time': array([17.56424332, 18.87256732, 20.36423516, 22.04060564, 23.7859684 ]), 'std_fit_time': array([0.2002459 , 0.03778668, 0.06950662, 0.19184203, 6.38827431]), 'mean_score_time': array([0.03501558, 0.03356137, 0.03444905, 0.03532848, 0.02332377]), 'std_score_time': array([0.00436023, 0.00359544, 0.00533983, 0.00365936, 0.00964787]), 'param_n_estimators': masked_array(data=[130, 140, 150, 160, 200],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 130}, {'n_estimators': 140}, {'n_estimators': 150}, {'n_estimators': 160}, {'n_estimators': 200}], 'split0_test_score': array([0.57621054, 0.57594076, 0.5759394 , 0.57576252, 0.57582863]), 'split1_test_score': array([0.54465649, 0.54502697, 0.54518926, 0.54511477, 0.54490472]), 'split2_test_score': array([0.56828119, 0.56842966, 0.56847902, 0.56845502, 0.56834584]), 'split3_test_score': array([0.60683492, 0.60674911, 0.6067486 , 0.6068

# max_depth and min_child_weight

In [195]:
if __name__ == '__main__':
    cv_params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_child_weight': [1, 2, 3, 4, 5, 6]}
    other_params = {'learning_rate': 0.08, 'n_estimators': 150, 'max_depth': 9, 'min_child_weight': 3, 'seed': 0,
                    'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.6, 'reg_alpha': 0.05, 'reg_lambda': 0.05}
#     other_params = {'learning_rate': 0.07, 'n_estimators': 110, 'max_depth': 9, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.7, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0, 'reg_lambda': 1}
    model = XGBRegressor(**other_params)
    optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
    optimized_GBM.fit(x0, y0)
    evalute_result = optimized_GBM.cv_results_
    print('每轮迭代运行结果:{0}'.format(evalute_result))
    print('max_depth：{0},min_child_weight:{0}'.format(optimized_GBM.best_params_))
    print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 10.9min
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed: 15.1min finished


每轮迭代运行结果:{'mean_fit_time': array([ 7.33323565,  7.34172034,  7.33722315,  7.37527423,  7.38466058,
        7.3447473 ,  9.56732731,  9.5601944 ,  9.56833911,  9.58485465,
        9.55017705,  9.50229411, 11.83558292, 11.83603787, 11.8510437 ,
       11.71834865, 11.74273438, 11.68965731, 14.28775902, 14.17074885,
       13.98038678, 13.9750639 , 13.92168508, 13.82806854, 16.38414764,
       16.37769084, 16.20836487, 16.17229252, 16.12688875, 15.99377122,
       18.75542045, 18.62781868, 18.52733736, 18.36606259, 18.38191996,
       18.10140471, 20.8496942 , 20.66397872, 20.57886796, 20.35616512,
       20.09561081, 19.83977056, 23.07948689, 22.87864523, 22.4667717 ,
       22.21467595, 21.97876868, 20.26660161]), 'std_fit_time': array([0.09825663, 0.0222067 , 0.02517359, 0.04563102, 0.0449517 ,
       0.06922516, 0.07778709, 0.05614836, 0.05301108, 0.06239906,
       0.0599641 , 0.04118426, 0.06459701, 0.03353321, 0.09099415,
       0.09641914, 0.07068686, 0.04752075, 0.18199112, 0.111

# gamma

In [226]:
if __name__ == '__main__':
    cv_params = {'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}
    other_params = {'learning_rate': 0.07, 'n_estimators': 150, 'max_depth': 10, 'min_child_weight': 2, 'seed': 0,
                    'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.6, 'reg_alpha': 0.05, 'reg_lambda': 0.05}
#     other_params = {'learning_rate': 0.07, 'n_estimators': 110, 'max_depth': 9, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.7, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0, 'reg_lambda': 1}
    model = XGBRegressor(**other_params)
    optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
    optimized_GBM.fit(x0, y0)
    evalute_result = optimized_GBM.cv_results_
    print('每轮迭代运行结果:{0}'.format(evalute_result))
    print('gamma：{0}'.format(optimized_GBM.best_params_))
    print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  4.0min finished


每轮迭代运行结果:{'mean_fit_time': array([31.96277976, 31.55468364, 31.63713984, 31.62883296, 31.6820189 ,
       25.3752059 ]), 'std_fit_time': array([0.1363462 , 0.15351636, 0.11222892, 0.11722323, 0.13633238,
       7.67446434]), 'mean_score_time': array([0.04524426, 0.03843417, 0.04507022, 0.03960347, 0.04141483,
       0.02984662]), 'std_score_time': array([0.00197618, 0.00346056, 0.00550068, 0.002842  , 0.00102647,
       0.01056791]), 'param_gamma': masked_array(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'gamma': 0.1}, {'gamma': 0.2}, {'gamma': 0.3}, {'gamma': 0.4}, {'gamma': 0.5}, {'gamma': 0.6}], 'split0_test_score': array([0.60039072, 0.60343671, 0.60975426, 0.60284804, 0.60881705,
       0.60842844]), 'split1_test_score': array([0.54828033, 0.54390742, 0.5494109 , 0.5595681 , 0.5672792 ,
       0.56091868]), 'split2_test_score': array([0.58823163, 0.59094944, 0.57005

# subsample and colsample_bytree

In [227]:
if __name__ == '__main__':
    cv_params = {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
    other_params = {'learning_rate': 0.07, 'n_estimators': 150, 'max_depth': 10, 'min_child_weight': 2, 'seed': 0,
                    'subsample': 0.9, 'colsample_bytree': 0.6, 'gamma': 0.6, 'reg_alpha': 0.05, 'reg_lambda': 0.05}
#     other_params = {'learning_rate': 0.07, 'n_estimators': 110, 'max_depth': 9, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.7, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0, 'reg_lambda': 1}
    model = XGBRegressor(**other_params)
    optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
    optimized_GBM.fit(x0, y0)
    evalute_result = optimized_GBM.cv_results_
    print('每轮迭代运行结果:{0}'.format(evalute_result))
    print('{0}'.format(optimized_GBM.best_params_))
    print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed: 10.7min finished


每轮迭代运行结果:{'mean_fit_time': array([23.78242092, 25.67630014, 26.67252574, 27.56512985, 27.986903  ,
       29.40303636, 30.6237556 , 31.75930157, 31.81347132, 33.21566505,
       34.51553679, 35.78695545, 35.14440069, 37.06083379, 38.54237328,
       36.82359896]), 'std_fit_time': array([0.28741622, 0.10859971, 0.07416588, 0.08045897, 0.12632599,
       0.10696252, 0.08962261, 0.23795924, 0.18993598, 0.09655119,
       0.05753449, 0.11539296, 0.13100526, 0.22912155, 0.2168005 ,
       3.36809262]), 'mean_score_time': array([0.04358253, 0.04231687, 0.04251099, 0.04080462, 0.04355788,
       0.04292064, 0.04326615, 0.04056525, 0.04438891, 0.03955889,
       0.04130726, 0.04213924, 0.0419158 , 0.04172258, 0.0414259 ,
       0.02244325]), 'std_score_time': array([0.00190018, 0.0035572 , 0.0024762 , 0.00324208, 0.00328437,
       0.00287318, 0.0059837 , 0.00568023, 0.00434104, 0.0028873 ,
       0.00160382, 0.00190768, 0.004543  , 0.00262743, 0.00265836,
       0.00889284]), 'param_colsample

# 正则 reg_alpha and reg_lambda

In [228]:
if __name__ == '__main__':
    cv_params = {'reg_alpha':  [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5], 'reg_lambda':  [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]}
    other_params = {'learning_rate': 0.07, 'n_estimators': 150, 'max_depth': 10, 'min_child_weight': 2, 'seed': 0,
                    'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.6, 'reg_alpha': 0.05, 'reg_lambda': 0.05}
#     other_params = {'learning_rate': 0.07, 'n_estimators': 110, 'max_depth': 9, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.7, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0, 'reg_lambda': 1}
    model = XGBRegressor(**other_params)
    optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
    optimized_GBM.fit(x0, y0)
    evalute_result = optimized_GBM.cv_results_
    print('每轮迭代运行结果:{0}'.format(evalute_result))
    print('{0}'.format(optimized_GBM.best_params_))
    print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 49 candidates, totalling 245 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  5.9min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 25.6min
[Parallel(n_jobs=4)]: Done 245 out of 245 | elapsed: 32.6min finished


每轮迭代运行结果:{'mean_fit_time': array([31.74635334, 32.68744974, 32.35825982, 32.3144978 , 31.70077744,
       30.54707174, 29.9909071 , 32.78620605, 32.75365443, 32.39127998,
       32.18095555, 31.83298364, 30.55723367, 29.89607196, 32.65318594,
       32.64464431, 32.55304632, 32.01894317, 31.48980231, 30.73541589,
       29.93264618, 32.48803935, 32.38079925, 32.44187436, 32.02948995,
       31.46555958, 30.41905293, 30.27556448, 32.42674761, 32.5166348 ,
       32.35944681, 31.96868734, 31.36340075, 30.44204345, 30.00032744,
       32.87184796, 32.57523446, 32.488486  , 32.13642836, 31.69837403,
       30.73108311, 30.31398211, 32.73294029, 32.96053123, 32.6097384 ,
       32.42431841, 31.80779476, 30.85275521, 26.34174571]), 'std_fit_time': array([0.56210664, 0.22780692, 0.17774728, 0.22982524, 0.10308531,
       0.17632833, 0.21796178, 0.16473184, 0.19314507, 0.12609506,
       0.18901487, 0.14481106, 0.20262197, 0.14381712, 0.1842557 ,
       0.32372791, 0.11114398, 0.14403201, 0.15

# learning_rat

In [229]:
if __name__ == '__main__':
    cv_params = {'learning_rate': [0.005,0.01,0.05, 0.07, 0.1, 0.2]}
    other_params = {'learning_rate': 0.07, 'n_estimators': 150, 'max_depth': 10, 'min_child_weight': 2, 'seed': 0,
                    'subsample': 0.9, 'colsample_bytree': 0.7, 'gamma': 0.6, 'reg_alpha': 0.03, 'reg_lambda': 0.03}
#     other_params = {'learning_rate': 0.07, 'n_estimators': 110, 'max_depth': 9, 'min_child_weight': 1, 'seed': 0,
#                     'subsample': 0.7, 'colsample_bytree': 0.9, 'gamma': 0.2, 'reg_alpha': 0, 'reg_lambda': 1}
    model = XGBRegressor(**other_params)
    optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
    optimized_GBM.fit(x0, y0)
    evalute_result = optimized_GBM.cv_results_
    print('每轮迭代运行结果:{0}'.format(evalute_result))
    print('{0}'.format(optimized_GBM.best_params_))
    print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  3.8min finished


每轮迭代运行结果:{'mean_fit_time': array([25.80720282, 28.30362964, 31.09535322, 31.88126473, 32.59619751,
       27.60748568]), 'std_fit_time': array([0.42770693, 0.25723501, 0.13083627, 0.1503857 , 0.10376498,
       6.8424069 ]), 'mean_score_time': array([0.03715777, 0.04055247, 0.04266539, 0.03888044, 0.03668323,
       0.0209456 ]), 'std_score_time': array([0.00381923, 0.00442107, 0.00108792, 0.00277953, 0.00265494,
       0.00856437]), 'param_learning_rate': masked_array(data=[0.005, 0.01, 0.05, 0.07, 0.1, 0.2],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'learning_rate': 0.005}, {'learning_rate': 0.01}, {'learning_rate': 0.05}, {'learning_rate': 0.07}, {'learning_rate': 0.1}, {'learning_rate': 0.2}], 'split0_test_score': array([-59.07117015, -12.73865776,   0.60044317,   0.58875466,
         0.59089138,   0.55519002]), 'split1_test_score': array([-58.87767054, -12.75342084,   0.54284798,   0.54047534,
      

# feature select