In [0]:
from IPython.display import display
import numpy as np
# import modin.pandas as pd
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE, f_regression

import warnings
import json

warnings.filterwarnings('ignore')

plt.style.use("fivethirtyeight")
sns.set_style({'font.sans-serif': ['simsun', 'Arial']})
sns.set_style('darkgrid', {'font.sans-serif': ['simhei', 'Arial']})
%matplotlib inline

# np.random.seed(4590)
nf_data_path = r'./noFinacialFeatures.csv'
f_data_path = r'./FinacialFeatures.csv'

flevel = json.load(open(r'./feature_level.json'))

In [0]:
nf_df = pd.read_csv(nf_data_path)
f_df = pd.read_csv(f_data_path)

In [0]:
df = pd.DataFrame()
df = nf_df.copy()
f_df.drop(['企业总评分'], axis=1, inplace=True)
df = df.merge(f_df, how='left', on='企业编号')

In [0]:
y = df[['企业编号', '企业总评分']]
x = df.drop(['企业总评分'], axis=1)

In [6]:
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=0)
ytrain_id = ytrain['企业编号']
ytrain = ytrain['企业总评分']
ytest_id = ytest['企业编号']
ytest = ytest['企业总评分']
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((2364, 307), (592, 307), (2364,), (592,))

In [0]:
id_train = xtrain['企业编号']
id_test = xtest['企业编号']
xtrain.drop(['企业编号'], axis=1, inplace=True)
xtest.drop(['企业编号'], axis=1, inplace=True)

In [0]:
def print_(gs):
    print(gs.cv_results_)
    print(gs.best_params_)
    print(gs.best_score_)
    print('rmse:', np.sqrt(np.abs(gs.best_score_)))

# 调参

## 学习率和估计器及其数目

In [0]:
params = {'boosting_type': 'gbdt',
          'objective': 'regression',

          'learning_rate': 0.1,
          'num_leaves': 50,
          'max_depth': 6,

          'subsample': 0.8,
          'colsample_bytree': 0.8,
          "metric": 'rmse', }

In [0]:
data_train = lgb.Dataset(xtrain, ytrain, silent=True)
cv_results = lgb.cv(
    params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='rmse',
    early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)

print('best n_estimators:', len(cv_results['rmse-mean']))
print('best cv score:', cv_results['rmse-mean'][-1])

[50]	cv_agg's rmse: 3.18014 + 0.126309
best n_estimators: 43
best cv score: 3.1787510439261384


## max_depth 和 num_leaves

In [0]:
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                              learning_rate=0.1, n_estimators=43, max_depth=6,
                              metric='rmse', bagging_fraction=0.8, feature_fraction=0.8)

params_test1 = {
    'max_depth': range(3, 8, 2),
    'num_leaves': range(20, 100, 30)
}
gsearch1 = GridSearchCV(estimator=model_lgb, param_grid=params_test1,
                        scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=2)
gsearch1.fit(xtrain, ytrain)
# print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)
# print('rmse:', np.sqrt(np.abs(gsearch1.best_score_)))
print_(gsearch1)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   32.9s
[Parallel(n_jobs=2)]: Done  90 out of  90 | elapsed:  1.4min finished


{'mean_fit_time': array([0.92085559, 0.95250058, 0.81514375, 1.7938025 , 1.6340225 ,
       1.4990643 , 1.79567962, 2.53424389, 2.49483378]), 'std_fit_time': array([0.00707319, 0.048489  , 0.1618288 , 0.20759863, 0.29997807,
       0.26141865, 0.29782629, 0.54207498, 0.4510752 ]), 'mean_score_time': array([0.10262785, 0.09872396, 0.07656128, 0.09280767, 0.07676151,
       0.0744283 , 0.078845  , 0.07643392, 0.07436543]), 'std_score_time': array([0.00187215, 0.01305794, 0.01944329, 0.01111545, 0.01643953,
       0.01687405, 0.02342747, 0.01855839, 0.01470031]), 'param_max_depth': masked_array(data=[3, 3, 3, 5, 5, 5, 7, 7, 7],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_num_leaves': masked_array(data=[20, 50, 80, 20, 50, 80, 20, 50, 80],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dt

In [0]:
params_test2 = {
    'max_depth': [4, 5, 6],
    'num_leaves': [14, 16, 18, 20, 22, 24]
}

gsearch2 = GridSearchCV(estimator=model_lgb, param_grid=params_test2,
                        scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=2)
gsearch2.fit(xtrain, ytrain)
print(gsearch2.cv_results_)
print(gsearch2.best_params_)
print(gsearch2.best_score_)
print('rmse:', np.sqrt(np.abs(gsearch2.best_score_)))

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   30.3s
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed:  2.6min finished


{'mean_fit_time': array([1.36653383, 1.09408345, 1.03487799, 1.03452284, 1.07209079,
       1.03925378, 1.26209984, 1.32340829, 1.38175254, 1.73965721,
       1.89382095, 1.86431124, 1.78585267, 1.89364927, 1.93473694,
       1.71859808, 1.77160425, 2.18911905]), 'std_fit_time': array([0.04607406, 0.21578606, 0.17471289, 0.18349589, 0.18418632,
       0.18045577, 0.22674089, 0.23940779, 0.25423431, 0.25700037,
       0.07969668, 0.04767033, 0.01752817, 0.03163411, 0.28548189,
       0.36152996, 0.31908205, 0.48044758]), 'mean_score_time': array([0.10029099, 0.07462955, 0.07196858, 0.07183502, 0.07426641,
       0.07207785, 0.07141411, 0.07167449, 0.07225733, 0.09043505,
       0.09697895, 0.09515953, 0.09257777, 0.09314201, 0.09124854,
       0.07639015, 0.0793395 , 0.08808327]), 'std_score_time': array([0.01312466, 0.01736638, 0.0132277 , 0.01328873, 0.01367796,
       0.01359791, 0.0135743 , 0.01354999, 0.01431537, 0.01352738,
       0.00437607, 0.00464146, 0.00053888, 0.00126583, 0.

## min_data_in_leaf 和 min_sum_hessian_in_leaf

In [24]:
params_test3 = {
    'min_data_in_leaf': [18, 19, 20, 21, 22],
    'min_sum_hessian_in_leaf': [0.001, 0.002]
}
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=100,
                              learning_rate=0.1, n_estimators=941, max_depth=7,
                              metric='rmse', bagging_fraction=0.7, feature_fraction=0.8)
gsearch3 = GridSearchCV(estimator=model_lgb, param_grid=params_test3,
                        scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=8)
gsearch3.fit(xtrain, ytrain)
print_(gsearch3)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: ignored

## feature_fraction 和 bagging_fraction

In [0]:
params_test4 = {
    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
}
model_lgb = lgb.LGBMRegressor(objective='regression',
                              num_leaves=16,
                              learning_rate=0.1,
                              n_estimators=43,
                              max_depth=6,
                              metric='rmse',
                              bagging_freq=5,
                              min_child_samples=20,
                              min_child_weight=0.001)
gsearch4 = GridSearchCV(estimator=model_lgb, param_grid=params_test4,
                        scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
gsearch4.fit(xtrain, ytrain)
print_(gsearch4)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   25.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  2.6min finished


{'mean_fit_time': array([1.52890165, 1.68892879, 1.99096341, 2.45279725, 2.96322303,
       1.53853116, 1.81281204, 1.9841588 , 2.50049465, 2.72619491,
       1.80894279, 2.02389319, 2.2905458 , 2.38861339, 2.62607818,
       1.72504992, 1.91050556, 2.28192964, 2.60278957, 2.64744437,
       1.66610479, 2.00411994, 2.25824735, 2.47829208, 2.62041569]), 'std_fit_time': array([0.1440184 , 0.13613726, 0.13121932, 0.31410123, 0.52506249,
       0.24742629, 0.16123335, 0.18808681, 0.20924647, 0.20986292,
       0.21038671, 0.2015985 , 0.21959279, 0.14541002, 0.16114324,
       0.12275598, 0.10543616, 0.1872898 , 0.1722875 , 0.07166341,
       0.15272759, 0.18114585, 0.16659856, 0.17745014, 0.22466804]), 'mean_score_time': array([0.12549422, 0.10039601, 0.10719349, 0.14675357, 0.14070678,
       0.1203696 , 0.13885729, 0.11928558, 0.13347023, 0.14235203,
       0.14960563, 0.13403726, 0.15306253, 0.12587166, 0.11638017,
       0.12471328, 0.12208233, 0.1329752 , 0.12285206, 0.12949545,
     

## 正则化参数

In [0]:
params_test6 = {
    'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
    'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]
}
model_lgb = lgb.LGBMRegressor(objective='regression',
                              num_leaves=16,
                              learning_rate=0.1,
                              n_estimators=43,
                              max_depth=6,
                              metric='rmse',
                              min_child_samples=20,
                              min_child_weight=0.001,
                              feature_fraction=0.8,
                              bagging_fraction=0.7)
gsearch6 = GridSearchCV(estimator=model_lgb, param_grid=params_test6,
                        scoring='neg_mean_squared_error', cv=10, verbose=1, n_jobs=4)
gsearch6.fit(xtrain, ytrain)
print_(gsearch6)

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   30.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  5.4min
[Parallel(n_jobs=4)]: Done 490 out of 490 | elapsed:  6.0min finished


{'mean_fit_time': array([2.54953973, 2.48352652, 2.44470265, 2.40663574, 2.44401784,
       3.07249022, 3.06396115, 2.58210669, 2.4684319 , 2.49551215,
       2.5604156 , 2.50873158, 2.49764013, 2.68728771, 2.49779704,
       2.57869885, 2.59264336, 2.55016713, 2.55847514, 2.60326304,
       2.5180795 , 2.59333334, 2.56451044, 2.54676473, 2.88733516,
       2.73384731, 2.60004342, 2.62316666, 2.47140055, 2.5543318 ,
       2.62194357, 2.62157247, 2.66752541, 2.6159936 , 2.75703018,
       2.63211706, 2.64107139, 2.85074286, 2.56648724, 2.63695562,
       2.6791122 , 2.55771363, 2.59988146, 2.67818027, 2.7400804 ,
       2.78853831, 2.69739177, 2.81976368, 2.52642329]), 'std_fit_time': array([0.15366366, 0.14704099, 0.11413499, 0.07396026, 0.11799336,
       0.58883204, 0.49783955, 0.21547983, 0.12424784, 0.13143344,
       0.18417971, 0.21139881, 0.16976621, 0.19555309, 0.1566916 ,
       0.19838069, 0.09192913, 0.17775724, 0.13742148, 0.16377159,
       0.18933214, 0.20228498, 0.33839

## 降低learning_rate

In [0]:
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'regression',

#     'learning_rate': 0.005,
#     'num_leaves': 80,
#     'max_depth': 7,
#     'min_data_in_leaf': 20,

#     'subsample': 1,
#     'colsample_bytree': 0.7,
# }

params = {
    'objective': 'regression',
    'num_leaves': 16,
    'learning_rate': 0.005,
#     'n_estimators': 43,
    'max_depth': 6,
    'metric': 'rmse',
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7
}

data_train = lgb.Dataset(xtrain, ytrain, silent=True)
cv_results = lgb.cv(
    params, data_train, num_boost_round=10000, nfold=10, stratified=False, shuffle=True, metrics='rmse',
    early_stopping_rounds=50, verbose_eval=100, show_stdv=True)

print('best n_estimators:', len(cv_results['rmse-mean']))
print('best cv score:', cv_results['rmse-mean'][-1])

[100]	cv_agg's rmse: 4.12491 + 0.260789
[200]	cv_agg's rmse: 3.64173 + 0.256248
[300]	cv_agg's rmse: 3.40297 + 0.244103
[400]	cv_agg's rmse: 3.28311 + 0.227424
[500]	cv_agg's rmse: 3.21311 + 0.211949
[600]	cv_agg's rmse: 3.17422 + 0.202855
[700]	cv_agg's rmse: 3.1572 + 0.198252
[800]	cv_agg's rmse: 3.14919 + 0.194758
[900]	cv_agg's rmse: 3.14666 + 0.190449
best n_estimators: 941
best cv score: 3.145967142110073


# Cross validation

## Sample

In [0]:
nfolds = 10
folds = KFold(n_splits=nfolds, shuffle=True, random_state=15)

In [0]:
# params = {'num_leaves': 50,
#           'min_data_in_leaf': 30,
#           'objective': 'regression',
#           'max_depth': 6,
#           'learning_rate': 0.1,
#           "min_child_samples": 100,
#           "boosting": "gbdt",
#           "feature_fraction": 0.8,
#           "bagging_freq": 1,
#           "bagging_fraction": 0.7,
#           "bagging_seed": 11,
#           "metric": 'rmse',
#           "lambda_l1": 0.1,
#           "verbosity": -1,
#           "device":'cpu',
#           "n_jobs":4,
#           "n_estimators":941}


params = {'num_leaves': 80,
         'min_data_in_leaf': 10,
         'objective': 'regression',
         'max_depth': 7,
         'learning_rate': 0.005,
         "min_child_samples": 100,
         "boosting": "gbdt",
         "feature_fraction": 0.8,
         "bagging_freq": 1,
         "bagging_fraction": 0.7,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "n_jobs":8,
         "n_estimators":941}


In [20]:
feature_importance_df = np.zeros((xtrain.shape[1], nfolds))
mvalid = np.zeros(len(xtrain))
mfull = np.zeros(len(xtest))


for fold_, (trn_idx, val_idx) in enumerate(folds.split(xtrain.values, ytrain.values)):
    print('----')
    print("fold n°{}".format(fold_))

    x0, y0 = xtrain.iloc[trn_idx], ytrain.iloc[trn_idx]
    x1, y1 = xtrain.iloc[val_idx], ytrain.iloc[val_idx]

    trn_data = lgb.Dataset(x0, label=y0)
    val_data = lgb.Dataset(x1, label=y1)

    num_round = 10000
    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=150)
    
    mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)

    feature_importance_df[:, fold_] = clf.feature_importance()

    mfull += clf.predict(xtest,
                         num_iteration=clf.best_iteration) / folds.n_splits
    
    
np.sqrt(mean_squared_error(mfull.astype(int), ytest.astype(int)))

----
fold n°0
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 1.97346	valid_1's rmse: 3.00919
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.40454	valid_1's rmse: 2.98727
----
fold n°1
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 1.96027	valid_1's rmse: 3.35494
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.36978	valid_1's rmse: 3.32063
----
fold n°2
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 1.96716	valid_1's rmse: 3.08877
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.39434	valid_1's rmse: 3.05962
----
fold n°3
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 1.95053	valid_1's rmse: 3.30398
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.37984	valid_1's rmse: 3.22539
----
fold n°4
Training until validation scores don't improve

3.0064682521775197

## Traing with feature level

In [0]:
params = {
    # objective and metric
    "objective": "regression",
    "metric": 'rmse',
    "boosting": "gbdt",
    
    # for the Leaf-wise (Best-first) Tree
    "num_leaves": 100, 
    # smaller than 2^(max_depth), This is the main parameter to control the complexity of the tree model. With larger can get higher accuracy 
    "min_data_in_leaf": 20, # Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting.
    "max_depth": 7, # limit the tree depth explicitly.
    
    # For Faster Speed
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
#     "max_bin": 5, # more small more faster
    "bagging_seed": 11,
    
    # For Better Accuracy
    "max_bin": 20, # lager but slower
    "learning_rate": 0.005,
    
    # deal with over fitting
      # Use small max_bin
      # Use small num_leaves
      # Use min_data_in_leaf and min_sum_hessian_in_leaf
      # Use bagging by set bagging_fraction and bagging_freq
      # Use feature sub-sampling by set feature_fraction
      # Use bigger training data
      # Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
      # Try max_depth to avoid growing deep tree
    "feature_fraction": 0.8,
    "lambda_l1": 0.1,
    
    "min_child_samples": 100,
    
    # other
    "n_estimators": 941,
    "verbosity": -1,
    "n_jobs":8,
}

In [0]:
def train_lgbm(xtrain, ytrain, xtest, ytest, params):
    feature_importance_df = np.zeros((xtrain.shape[1], nfolds))
    mvalid = np.zeros(len(xtrain))
    mfull = np.zeros(len(xtest))


    for fold_, (trn_idx, val_idx) in enumerate(folds.split(xtrain.values, ytrain.values)):
        print('----')
        print("fold n°{}".format(fold_))

        x0, y0 = xtrain.iloc[trn_idx], ytrain.iloc[trn_idx]
        x1, y1 = xtrain.iloc[val_idx], ytrain.iloc[val_idx]

        trn_data = lgb.Dataset(x0, label=y0)
        val_data = lgb.Dataset(x1, label=y1)

        num_round = 10000
        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=150)

        mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)

        feature_importance_df[:, fold_] = clf.feature_importance()

        mfull += clf.predict(xtest,
                             num_iteration=clf.best_iteration) / folds.n_splits


    print(np.sqrt(mean_squared_error(mfull.astype(int), ytest.astype(int))))

### Load feature level

In [0]:
def load_level(level):
    l = flevel[level]
    return xtrain[l], xtest[l]
  
def pre_rmse(pre):
    return np.sqrt(mean_squared_error(pre.astype(int), ytest.astype(int)))

### All features

In [29]:
train_lgbm(xtrain, ytrain, xtest, ytest, params)

----
fold n°0
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.28351	valid_1's rmse: 2.9736
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.73616	valid_1's rmse: 2.94198
----
fold n°1
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.26514	valid_1's rmse: 3.31948
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.71447	valid_1's rmse: 3.27979
----
fold n°2
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.28777	valid_1's rmse: 3.08126
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.74145	valid_1's rmse: 3.03255
----
fold n°3
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.26954	valid_1's rmse: 3.30447
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.73204	valid_1's rmse: 3.22435
----
fold n°4
Training until validation scores don't improve 

### First level

In [0]:
params1 = {
    # objective and metric
    "objective": "regression",
    "metric": 'rmse',
    "boosting": "gbdt",
    
    # for the Leaf-wise (Best-first) Tree
    "num_leaves": 100, 
    # smaller than 2^(max_depth), This is the main parameter to control the complexity of the tree model. With larger can get higher accuracy 
    "min_data_in_leaf": 20, # Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting.
    "max_depth": 7, # limit the tree depth explicitly.
    
    # For Faster Speed
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
#     "max_bin": 5, # more small more faster
    "bagging_seed": 11,
    
    # For Better Accuracy
    "max_bin": 20, # lager but slower
    "learning_rate": 0.005,
    
    # deal with over fitting
      # Use small max_bin
      # Use small num_leaves
      # Use min_data_in_leaf and min_sum_hessian_in_leaf
      # Use bagging by set bagging_fraction and bagging_freq
      # Use feature sub-sampling by set feature_fraction
      # Use bigger training data
      # Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
      # Try max_depth to avoid growing deep tree
    "feature_fraction": 0.8,
    "lambda_l1": 0.1,
    
    "min_child_samples": 100,
    
    # other
    "n_estimators": 941,
    "verbosity": -1,
    "n_jobs":8,
}

In [32]:
xtrain_1, xtest_1 = load_level('first_level')
train_lgbm(xtrain_1, ytrain, xtest_1, ytest, params1)

----
fold n°0
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.3675	valid_1's rmse: 2.92858
Early stopping, best iteration is:
[689]	training's rmse: 2.11552	valid_1's rmse: 2.89717
----
fold n°1
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.34513	valid_1's rmse: 3.34671
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.82795	valid_1's rmse: 3.30534
----
fold n°2
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.36314	valid_1's rmse: 3.12307
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.84369	valid_1's rmse: 3.0876
----
fold n°3
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.35267	valid_1's rmse: 3.31827
Did not meet early stopping. Best iteration is:
[941]	training's rmse: 1.84261	valid_1's rmse: 3.21337
----
fold n°4
Training until validation scores don't improve for 150 rounds

### Second level

In [0]:
params2 = {
    # objective and metric
    "objective": "regression",
    "metric": 'rmse',
    "boosting": "gbdt",
    
    # for the Leaf-wise (Best-first) Tree
    "num_leaves": 100, 
    # smaller than 2^(max_depth), This is the main parameter to control the complexity of the tree model. With larger can get higher accuracy 
    "min_data_in_leaf": 20, # Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting.
    "max_depth": 7, # limit the tree depth explicitly.
    
    # For Faster Speed
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
#     "max_bin": 5, # more small more faster
    "bagging_seed": 11,
    
    # For Better Accuracy
    "max_bin": 20, # lager but slower
    "learning_rate": 0.005,
    
    # deal with over fitting
      # Use small max_bin
      # Use small num_leaves
      # Use min_data_in_leaf and min_sum_hessian_in_leaf
      # Use bagging by set bagging_fraction and bagging_freq
      # Use feature sub-sampling by set feature_fraction
      # Use bigger training data
      # Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
      # Try max_depth to avoid growing deep tree
    "feature_fraction": 0.8,
    "lambda_l1": 0.1,
    
    "min_child_samples": 100,
    
    # other
    "n_estimators": 941,
    "verbosity": -1,
    "n_jobs":8,
}

In [33]:
xtrain_2, xtest_2 = load_level('second_level')
train_lgbm(xtrain_2, ytrain, xtest_2, ytest, params2)

KeyError: ignored

In [51]:
xtrain['权益:实收资本(或股本)(元)_std']

KeyError: ignored

### Third level

In [37]:
params3 = {
    # objective and metric
    "objective": "regression",
    "metric": 'rmse',
    "boosting": "gbdt",
    
    # for the Leaf-wise (Best-first) Tree
    "num_leaves": 200, 
    # smaller than 2^(max_depth), This is the main parameter to control the complexity of the tree model. With larger can get higher accuracy 
    "min_data_in_leaf": 20, # Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting.
    "max_depth": 8, # limit the tree depth explicitly.
    
    # For Faster Speed
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
#     "max_bin": 5, # more small more faster
    "bagging_seed": 11,
    
    # For Better Accuracy
    "max_bin": 20, # lager but slower
    "learning_rate": 0.005,
    
    # deal with over fitting
      # Use small max_bin
      # Use small num_leaves
      # Use min_data_in_leaf and min_sum_hessian_in_leaf
      # Use bagging by set bagging_fraction and bagging_freq
      # Use feature sub-sampling by set feature_fraction
      # Use bigger training data
      # Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
      # Try max_depth to avoid growing deep tree
    "feature_fraction": 0.8,
    "lambda_l1": 0.1,
    
    "min_child_samples": 100,
    
    # other
    "n_estimators": 941,
    "verbosity": -1,
    "n_jobs":8,
}
xtrain_3, xtest_3 = load_level('third_level')
train_lgbm(xtrain_3, ytrain, xtest_3, ytest, params3)

----
fold n°0
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[239]	training's rmse: 4.79583	valid_1's rmse: 4.86867
----
fold n°1
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[295]	training's rmse: 4.72448	valid_1's rmse: 5.2214
----
fold n°2
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 4.66476	valid_1's rmse: 4.94402
Early stopping, best iteration is:
[519]	training's rmse: 4.65681	valid_1's rmse: 4.94305
----
fold n°3
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[213]	training's rmse: 4.81047	valid_1's rmse: 4.99319
----
fold n°4
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 4.62643	valid_1's rmse: 5.16595
Early stopping, best iteration is:
[474]	training's rmse: 4.63691	valid_1's rmse: 5.16354
----
fold n°5
Training until validation scores don't imp

In [44]:
rf = RandomForestRegressor(n_jobs=8, n_estimators=500, verbose=3)
rf.fit(xtrain_3, ytrain)
rf_ypre = rf.predict(xtest_3)
pre_rmse(rf_ypre)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s


building tree 1 of 500building tree 2 of 500building tree 3 of 500building tree 4 of 500building tree 5 of 500building tree 6 of 500building tree 7 of 500building tree 8 of 500







building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500building tree 13 of 500
building tree 14 of 500

building tree 16 of 500building tree 15 of 500

building tree 18 of 500building tree 17 of 500building tree 19 of 500


building tree 20 of 500building tree 21 of 500building tree 22 of 500

building tree 23 of 500
building tree 24 of 500
building tree 25 of 500

building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 31 of 500building tree 30 of 500
building tree 29 of 500building tree 32 of 500

building tree 33 of 500building tree 34 of 500


building tree 35 of 500
building tree 36 of 500building tree 37 of 500

building tree 39 of 500building tree 40 of 500

building tree 38 of 500building tree 41 of 500

building tree 42 of 500
b

[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.6s


building tree 140 of 500
building tree 143 of 500building tree 141 of 500
building tree 142 of 500building tree 144 of 500


building tree 146 of 500building tree 147 of 500building tree 145 of 500

building tree 148 of 500
building tree 149 of 500

building tree 150 of 500
building tree 151 of 500building tree 153 of 500building tree 152 of 500

building tree 154 of 500
building tree 155 of 500
building tree 156 of 500

building tree 157 of 500
building tree 159 of 500building tree 160 of 500building tree 158 of 500

building tree 161 of 500
building tree 162 of 500

building tree 164 of 500
building tree 163 of 500building tree 166 of 500

building tree 165 of 500building tree 167 of 500

building tree 169 of 500
building tree 170 of 500
building tree 171 of 500building tree 168 of 500building tree 172 of 500


building tree 173 of 500building tree 174 of 500

building tree 176 of 500building tree 175 of 500
building tree 177 of 500

building tree 178 of 500
building tree 179 of 500b

[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:    1.3s


building tree 330 of 500
building tree 331 of 500
building tree 332 of 500
building tree 329 of 500
building tree 334 of 500
building tree 335 of 500
building tree 337 of 500
building tree 336 of 500
building tree 333 of 500
building tree 339 of 500
building tree 341 of 500building tree 340 of 500building tree 338 of 500
building tree 342 of 500


building tree 344 of 500
building tree 343 of 500building tree 345 of 500
building tree 346 of 500building tree 347 of 500
building tree 348 of 500


building tree 350 of 500building tree 349 of 500
building tree 352 of 500
building tree 353 of 500building tree 351 of 500

building tree 354 of 500building tree 355 of 500


building tree 356 of 500
building tree 360 of 500building tree 357 of 500
building tree 358 of 500

building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500building tree 359 of 500

building tree 366 of 500
building tree 368 of 500building tree 367 of 500

building tree 365 of 500


[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished


5.346917525736396

In [48]:
adar = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                         n_estimators=941, random_state=np.random.RandomState(100))
adar.fit(xtrain_3, ytrain)
adar_ypre = adar.predict(xtest_3)
pre_rmse(adar_ypre)

5.218696896476868

### Fourth level

In [0]:
params4 = {
    # objective and metric
    "objective": "regression",
    "metric": 'rmse',
    "boosting": "gbdt",
    
    # for the Leaf-wise (Best-first) Tree
    "num_leaves": 100, 
    # smaller than 2^(max_depth), This is the main parameter to control the complexity of the tree model. With larger can get higher accuracy 
    "min_data_in_leaf": 20, # Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting.
    "max_depth": 7, # limit the tree depth explicitly.
    
    # For Faster Speed
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
#     "max_bin": 5, # more small more faster
    "bagging_seed": 11,
    
    # For Better Accuracy
    "max_bin": 20, # lager but slower
    "learning_rate": 0.005,
    
    # deal with over fitting
      # Use small max_bin
      # Use small num_leaves
      # Use min_data_in_leaf and min_sum_hessian_in_leaf
      # Use bagging by set bagging_fraction and bagging_freq
      # Use feature sub-sampling by set feature_fraction
      # Use bigger training data
      # Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
      # Try max_depth to avoid growing deep tree
    "feature_fraction": 0.8,
    "lambda_l1": 0.1,
    
    "min_child_samples": 100,
    
    # other
    "n_estimators": 941,
    "verbosity": -1,
    "n_jobs":8,
}

In [53]:
xtrain_4, xtest_4 = load_level('fourth_level')
train_lgbm(xtrain_4, ytrain, xtest_4, ytest, params4)

----
fold n°0
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 5.0783	valid_1's rmse: 4.94084
Early stopping, best iteration is:
[722]	training's rmse: 5.07819	valid_1's rmse: 4.94024
----
fold n°1
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 5.0383	valid_1's rmse: 5.29781
Early stopping, best iteration is:
[501]	training's rmse: 5.0383	valid_1's rmse: 5.29783
----
fold n°2
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[26]	training's rmse: 5.07087	valid_1's rmse: 5.10368
----
fold n°3
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[48]	training's rmse: 5.07289	valid_1's rmse: 5.06709
----
fold n°4
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 5.03899	valid_1's rmse: 5.29337
Did not meet early stopping. Best iteration is:
[844]	training's rmse: 5.03886	valid_1's rmse: 