In [4]:
from IPython.display import display
import numpy as np
# import modin.pandas as pd
import pandas as pd
import datetime
import time
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE, f_regression
from sklearn.metrics import r2_score

import itertools

import warnings
import json

warnings.filterwarnings('ignore')

plt.style.use("fivethirtyeight")
sns.set_style({'font.sans-serif': ['simsun', 'Arial']})
sns.set_style('darkgrid', {'font.sans-serif': ['simhei', 'Arial']})
%matplotlib inline

# np.random.seed(4590)
data_path = r'../../../Data/train_data.csv'
df = pd.read_csv(data_path)
df.head(5)


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Unnamed: 0,企业编号,企业总评分,软著数量,作品著作数量,项目数量,纳税A级年份_2014,纳税A级年份_2015,纳税A级年份_2016,纳税A级年份_2017,纳税A级年份增长1,...,应收账款周转天数(天)_mean,应收账款周转天数(天)_max,应收账款周转天数(天)_min,应收账款周转天数(天)_std,应收账款周转天数(天)滚动增长_mean,存货周转天数(天)_mean,存货周转天数(天)_max,存货周转天数(天)_min,存货周转天数(天)_std,存货周转天数(天)滚动增长_mean
0,1001,75.374276,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,107.58927,191.707773,63.791689,44.495607,0.151392,414.778035,801.5525,176.283983,148.327022,0.284957
1,1002,79.830122,2.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,...,46.903333,56.59,39.83,6.234116,0.023916,6.506667,7.04,5.01,0.702335,0.04533
2,1003,78.318264,2.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,84.275556,139.91,56.02,33.143654,-0.040224,54.918889,75.54,38.01,11.089465,0.031792
3,1004,83.253376,0.0,6.0,1.0,0.0,0.0,2.0,1.0,0.0,...,26.72,35.36,17.29,6.024438,0.081857,6.954444,7.9,6.24,0.618448,0.021711
4,1005,83.291493,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,94.05,110.26,77.85,9.652235,0.012921,108.584444,357.19,44.16,101.728838,0.284957


In [5]:
y = df[['企业编号', '企业总评分']]
x = df.drop(['企业总评分'], axis=1)

xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=31)
ytrain_id = ytrain['企业编号']
ytrain = ytrain['企业总评分']
ytest_id = ytest['企业编号']
ytest = ytest['企业总评分']
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

id_train = xtrain['企业编号']
id_test = xtest['企业编号']
xtrain.drop(['企业编号'], axis=1, inplace=True)
xtest.drop(['企业编号'], axis=1, inplace=True)
feature_name=xtrain.columns.values

(2364, 335) (592, 335) (2364,) (592,)


# Scaling

In [7]:
sc_X = StandardScaler()
xtrain_sc = sc_X.fit_transform(xtrain)
xtest_sc = sc_X.transform(xtest)
xtrain_sc=pd.DataFrame(xtrain_sc)
xtest_sc=pd.DataFrame(xtest_sc)



# RFE

In [9]:
estimator = lgb.LGBMRegressor(objective='regression',
                              num_leaves=20,
                              learning_rate=0.005,
                              n_estimators=1500,
                              max_depth=7,
                              boosting='gbdt',
                              metric='rmse',
                              max_bin=35,
                              n_jobs=-1,
                              min_child_samples=100,
                              bagging_seed=11,
                              bagging_fraction=0.7,
                              bagging_freq=1,
                              min_data_in_leaf=20
                             )

In [11]:
for i in range(50,205, 5):
    selector c= RFE(estimator, i, step=1)
    selector = selector.fit(xtrain, ytrain)
    pre=selector.predict(xtest)
    print(i, np.sqrt(mean_squared_error(pre,ytest)))

50 3.2273654416859947
55 3.184101206673589
60 3.1910492763372065
65 3.1712941154571355
70 3.1609947276114183
75 3.164213017202633
80 3.167322947797274
85 3.166819816409272
90 3.1571167577548587
95 3.1534041260509857
100 3.149028256460868
105 3.108548825925692
110 3.1046150406430666
115 3.099830687222462
120 3.103991515339403


KeyboardInterrupt: 

# RFECV

In [21]:
selector_cv = RFECV(estimator, step=1,cv=5)
selector_cv = selector.fit(xtrain_sc, ytrain)
pre_cv=selector_cv.predict(xtest_sc)
np.sqrt(mean_squared_error(pre_cv,ytest))

3.1403841010185274

# GridSearchCV

In [6]:
params_test = {
#     'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
#     'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    
    'max_depth': range(3, 8, 2),
    'num_leaves': range(20, 120, 20),
    
    'max_bin':range(10,55,5),
    
#     'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
#     'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
    
#     'min_data_in_leaf': [18, 19, 20, 21, 22],
#     'min_sum_hessian_in_leaf': [0.001, 0.002],
    
#     'min_child_samples':[80,100,120],
    
#     'bagging_freq':range(1,4,1),
    
#     'boosting':['gbdt','dart']
}
model_lgb = lgb.LGBMRegressor(objective='regression',
                              boosting='dart',
                              metric='rmse',
                              learning_rate=0.005,
                              n_estimators=2000,
                              n_jobs=-1,
#                               device='gpu'
                              )
gsearch = GridSearchCV(estimator=model_lgb, param_grid=params_test,
                        scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
gsearch.fit(xtrain_sc, ytrain)

print(gsearch.cv_results_)
print(gsearch.best_params_)
print(gsearch.best_score_)
print('rmse:', np.sqrt(np.abs(gsearch.best_score_)))

Fitting 5 folds for each of 135 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 37.9min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 97.7min
[Parallel(n_jobs=-1)]: Done 675 out of 675 | elapsed: 163.7min finished


{'mean_fit_time': array([13.17973118, 12.91148062, 13.58159389, 12.99349608, 14.40589938,
       22.34678535, 24.00866547, 23.83677621, 24.19611282, 23.79118986,
       27.18077135, 32.88478084, 33.8711123 , 36.66905527, 33.52618651,
       13.22350926, 13.49392099, 13.65189557, 13.05314016, 14.393291  ,
       24.30047798, 24.89591727, 25.46241565, 25.24459343, 25.17213278,
       27.89172192, 35.57857137, 36.10699773, 37.02868967, 36.67167993,
       13.55826321, 12.70834718, 12.98611689, 13.42915049, 13.15704727,
       25.04496441, 25.52019706, 25.31683884, 24.94953494, 25.73486366,
       29.26428175, 38.3198617 , 36.16792984, 37.21980748, 37.05444584,
       13.60043707, 14.24620686, 13.54873543, 14.30582118, 13.38555059,
       25.52551203, 26.1645936 , 26.3082077 , 26.47092767, 26.21964746,
       28.94213095, 38.68588123, 39.95330911, 40.01283584, 39.21588869,
       14.34951253, 14.03442731, 14.45389533, 13.93092928, 14.43413367,
       25.99794312, 27.69044232, 28.32051182, 

In [8]:
params_test2 = {
#     'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
#     'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    
    'max_depth': range(4, 8, 1),
    'num_leaves': range(20, 120, 20),
    
    'max_bin':range(10,55,5),
    
#     'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
#     'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
    
    'min_data_in_leaf': [18, 19, 20, 21, 22],
    'min_sum_hessian_in_leaf': [0.001, 0.002],
    
#     'min_child_samples':[80,100,120],
    
#     'bagging_freq':range(1,4,1),
    
#     'boosting':['gbdt','dart']
}
model_lgb = lgb.LGBMRegressor(objective='regression',
                              boosting='gbdt',
                              metric='rmse',
                              learning_rate=0.005,
                              n_estimators=2000,
                              n_jobs=-1,
#                               device='gpu'
                              )
gsearch2 = GridSearchCV(estimator=model_lgb, param_grid=params_test2,
                        scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
gsearch2.fit(xtrain_sc, ytrain)

print(gsearch2.cv_results_)
print(gsearch2.best_params_)
print(gsearch2.best_score_)
print('rmse:', np.sqrt(np.abs(gsearch2.best_score_)))

Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   59.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 24.1min


KeyboardInterrupt: 