In [13]:
from IPython.display import display
import numpy as np
# import modin.pandas as pd
import pandas as pd
import datetime
import time
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE, f_regression

import itertools

import warnings
import json

# Silence every warning for the rest of the session (this also hides
# deprecation notices from the libraries imported above).
warnings.filterwarnings('ignore')

# Global plot styling.
# NOTE(review): the second set_style call supplies its own 'font.sans-serif'
# list, so the list passed by the first call looks superseded — confirm.
plt.style.use("fivethirtyeight")
sns.set_style({'font.sans-serif': ['simsun', 'Arial']})
sns.set_style('darkgrid', {'font.sans-serif': ['simhei', 'Arial']})
%matplotlib inline

# np.random.seed(4590)
# Input CSV locations, relative to the notebook's working directory
# (presumably the non-financial and financial feature tables — verify).
nf_data_path = r'../../../Data/noFinacialFeatures.csv'
f_data_path = r'../../../Data//FinacialFeatures.csv'

# flevel = json.load(open(r'./feature_level.json'))

In [14]:
# Read both feature tables into memory, in the same order as before.
nf_df, f_df = (
    pd.read_csv(csv_path) for csv_path in (nf_data_path, f_data_path)
)

In [15]:
# Build the modelling frame: every row of nf_df, left-joined with the columns
# of f_df on the company id ('企业编号').
#
# f_df carries its own copy of the target column ('企业总评分'); remove it
# before merging so the target only comes from nf_df. Rebinding the name with
# errors='ignore' (instead of an in-place drop) keeps this cell safe to re-run:
# a second execution no longer raises KeyError on the already-removed column.
f_df = f_df.drop(columns=['企业总评分'], errors='ignore')

# merge() returns a new frame, so nf_df is left untouched and no explicit
# copy is needed.
df = nf_df.merge(f_df, how='left', on='企业编号')

In [16]:
# Features: every column except the score. The company id stays in for now so
# rows remain traceable through the train/test split.
x = df.drop(columns=['企业总评分'])
# Target frame: the score together with the company id.
y = df.loc[:, ['企业编号', '企业总评分']]

In [17]:
# 80/20 hold-out split with a fixed seed; x and y are split together so their
# rows stay aligned.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=31)

# Separate the company id from the score on the target side, leaving plain
# score Series in ytrain / ytest.
ytrain_id, ytrain = ytrain['企业编号'], ytrain['企业总评分']
ytest_id, ytest = ytest['企业编号'], ytest['企业总评分']
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

# Same on the feature side: remember the ids, then remove the id column from
# the model inputs.
id_train, id_test = xtrain['企业编号'], xtest['企业编号']
xtrain = xtrain.drop(columns=['企业编号'])
xtest = xtest.drop(columns=['企业编号'])

(2364, 307) (592, 307) (2364,) (592,)


In [18]:
# Shuffled K-fold splitter used by train_lgbm(); the fixed random_state keeps
# the fold assignment identical between runs.
nfolds = 10
folds = KFold(
    n_splits=nfolds,
    shuffle=True,
    random_state=15,
)

In [19]:
def train_lgbm(xtrain, ytrain, xtest, ytest, params):
    """Fit one LightGBM model per CV fold and score the fold-averaged test prediction.

    The folds come from the module-level ``folds`` splitter. For every fold a
    booster is trained on the training part with early stopping on the
    validation part; its prediction for ``xtest`` contributes ``1 / n_splits``
    to the final test prediction.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Training features (indexed positionally via ``.iloc``).
    ytrain : pandas.Series
        Training targets aligned with ``xtrain``.
    xtest : pandas.DataFrame
        Hold-out features.
    ytest : pandas.Series
        Hold-out targets.
    params : dict
        Parameters forwarded unchanged to ``lgb.train``.

    Returns
    -------
    models : list
        The trained boosters, one per fold, in fold order.
    test_error : float
        RMSE between the integer-cast averaged test prediction and the
        integer-cast ``ytest``.
    feature_importance_df : numpy.ndarray
        Array of shape ``(n_features, n_splits)``; column ``k`` holds
        ``feature_importance()`` of the fold-``k`` booster.
    """
    # Size everything from the splitter that is actually iterated, so the
    # importance matrix cannot get out of sync with a separate fold counter.
    n_splits = folds.n_splits
    feature_importance_df = np.zeros((xtrain.shape[1], n_splits))
    mvalid = np.zeros(len(xtrain))  # out-of-fold predictions
    mfull = np.zeros(len(xtest))    # fold-averaged test predictions
    models = []

    # Upper bound on boosting rounds handed to lgb.train.
    # NOTE(review): LightGBM documents n_estimators as an alias of
    # num_iterations; if `params` contains it, that value is expected to take
    # precedence over this bound — confirm against the installed version.
    num_round = 10000

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(xtrain.values, ytrain.values)):
        print('----')
        print("fold n°{}".format(fold_))

        x0, y0 = xtrain.iloc[trn_idx], ytrain.iloc[trn_idx]
        x1, y1 = xtrain.iloc[val_idx], ytrain.iloc[val_idx]

        trn_data = lgb.Dataset(x0, label=y0)
        val_data = lgb.Dataset(x1, label=y1)

        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=150)

        # Each training row is predicted exactly once, by the model that did
        # not see it during fitting.
        mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)

        feature_importance_df[:, fold_] = clf.feature_importance()

        mfull += clf.predict(xtest,
                             num_iteration=clf.best_iteration) / n_splits

        models.append(clf)

    # Out-of-fold error: an estimate that does not touch the hold-out set.
    cv_error = np.sqrt(mean_squared_error(ytrain, mvalid))

    # NOTE(review): astype(int) truncates toward zero instead of rounding to
    # the nearest integer — kept as-is so the reported number is unchanged;
    # confirm this is the intended scoring rule.
    test_error = np.sqrt(mean_squared_error(
        ytest.astype(int), mfull.astype(int)))
    print()
    print('cv rmse (out-of-fold):', cv_error)
    print('rmse:', test_error)
    return models, test_error, feature_importance_df

In [20]:
# Base LightGBM parameters. search_params() combines each grid point with
# these values before calling train_lgbm().
params_all = {
    # objective and metric
    "objective": "regression",
    "metric": 'rmse',
    "boosting": "gbdt",

    # for the Leaf-wise (Best-first) Tree
    # num_leaves is the main parameter controlling tree complexity: larger
    # values can give higher accuracy; keep it smaller than 2^(max_depth).
    "num_leaves": 100,
    # Setting min_data_in_leaf to a large value can avoid growing too deep a
    # tree, but may cause under-fitting.
    "min_data_in_leaf": 30,
    "max_depth": 7,  # limit the tree depth explicitly.

    # For Faster Speed
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    #     "max_bin": 5, # smaller is faster
    "bagging_seed": 11,

    # For Better Accuracy
    "max_bin": 20,  # larger but slower
    "learning_rate": 0.005,

    # Ways to deal with over-fitting:
    # Use small max_bin
    # Use small num_leaves
    # Use min_data_in_leaf and min_sum_hessian_in_leaf
    # Use bagging by setting bagging_fraction and bagging_freq
    # Use feature sub-sampling by setting feature_fraction
    # Use bigger training data
    # Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
    # Try max_depth to avoid growing deep trees
    "feature_fraction": 0.8,
    "lambda_l1": 0.9,

    # NOTE(review): LightGBM documents min_child_samples as an alias of
    # min_data_in_leaf, which is also set above with a different value —
    # confirm which of the two is intended.
    "min_child_samples": 100,

    # other
    # NOTE(review): n_estimators is documented as an alias of num_iterations;
    # the logged runs stop at 1500 rounds, so this appears to cap training
    # rather than the num_round set inside train_lgbm — confirm.
    "n_estimators": 1500,
    "verbosity": -1,
    "n_jobs": -1,
}

In [21]:
# Search grid for search_params(): every combination of the listed values is
# tried. With the three active ranges below that is 15 * 9 * 10 = 1350
# combinations. Commented-out entries are further candidates that are
# currently switched off.
params_range_dict = {  # for the Leaf-wise (Best-first) Tree
    # Main complexity control; larger can give higher accuracy, keep it
    # smaller than 2^(max_depth).
    "num_leaves": np.arange(10, 160, 10, dtype=int),
    #     "min_data_in_leaf": np.arange(10,55,5), # Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting.
    # limit the tree depth explicitly.
    "max_depth": np.arange(4, 13, 1, dtype=int),

    #     # For Faster Speed
    #     "bagging_fraction": np.arange(0.5, 1,0.1),

    #     # For Better Accuracy
    "max_bin": np.arange(10, 60, 5, dtype=int),  # larger but slower

    #     "feature_fraction": np.arange(0.3, 1, 0.1),
    #     "lambda_l1": np.arange(0.01, 1, 0.05),

    #     "min_child_samples": np.arange(70, 110,10),
}

# params_range_dict ={"num_leaves": np.arange(10,30,10, dtype=int)}

In [22]:
def search_params(params_range_dict, xtrain, ytrain, xtest, ytest):
    """Exhaustive grid search over ``params_range_dict`` using ``train_lgbm``.

    Every combination of the supplied value ranges is laid over a copy of the
    module-level ``params_all`` and trained with ``train_lgbm``. The
    combination with the lowest returned test RMSE is kept.

    NOTE(review): the winner is chosen by the error on ``xtest``/``ytest``,
    so that error is no longer an unbiased estimate for the selected
    parameters — consider selecting on a cross-validation score instead.

    Parameters
    ----------
    params_range_dict : dict
        Maps a parameter name to an iterable of candidate values.
    xtrain, ytrain, xtest, ytest
        Forwarded unchanged to ``train_lgbm``.

    Returns
    -------
    dict
        ``{'models', 'rmse', 'feature_importance', 'params'}`` for the best
        combination, or an empty dict if no run produced an error below
        infinity (for example when every error is NaN).
    """
    best_rmse = np.inf
    train_dict = {}
    params_name = list(params_range_dict.keys())

    # Cartesian product of all candidate values, in the key order of the dict.
    c_params = list(itertools.product(
        *(params_range_dict[name] for name in params_name)))
    print('Iteration number:', len(c_params))

    for count, combo in enumerate(c_params):
        print('*********************************Iteration: %s****************************************************' % str(count))
        starttime = time.time()

        # Build the grid point straight from the tuple. Going through a
        # DataFrame row would upcast integer parameters to float as soon as
        # any range holds floats. numpy scalars are unwrapped to Python
        # numbers.
        each_params = {
            name: (value.item() if isinstance(value, np.generic) else value)
            for name, value in zip(params_name, combo)
        }
        print('Current params:', each_params)

        # Overlay on a copy so the shared base parameters are not left
        # holding whichever combination happened to run last.
        run_params = dict(params_all)
        run_params.update(each_params)

        models, test_error, feature_importance_df = train_lgbm(
            xtrain, ytrain, xtest, ytest, run_params)

        print("Used %s seconds" % (time.time() - starttime))

        if test_error < best_rmse:
            best_rmse = test_error
            train_dict = {'models': models, 'rmse': test_error,
                          'feature_importance': feature_importance_df, 'params': each_params}
        # .get(): train_dict is still empty if no run has beaten infinity yet.
        print('Current best:', best_rmse, 'Params:', train_dict.get('params'))
        print()

    return train_dict

In [None]:
# Run the full grid search. The captured log reports 1350 parameter
# combinations, each of which trains one model per CV fold, so this cell is
# very slow.
train_dict = search_params(
    params_range_dict=params_range_dict,
    xtrain=xtrain,
    ytrain=ytrain,
    xtest=xtest,
    ytest=ytest,
)
train_dict

Iteration number: 1350
*********************************Iteration: 0****************************************************
Current params: {'num_leaves': 10, 'max_depth': 4, 'max_bin': 10}
----
fold n°0
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.95334	valid_1's rmse: 3.62696
[1000]	training's rmse: 2.62557	valid_1's rmse: 3.53733
[1500]	training's rmse: 2.41282	valid_1's rmse: 3.51127
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 2.41282	valid_1's rmse: 3.51127
----
fold n°1
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 3.01238	valid_1's rmse: 2.98539
[1000]	training's rmse: 2.67513	valid_1's rmse: 2.88633
Early stopping, best iteration is:
[1293]	training's rmse: 2.53672	valid_1's rmse: 2.87914
----
fold n°2
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.98382	valid_1's rmse: 3.33365
[1000]	training's rmse: 2.64734	valid_1's rmse: 3.2679

[1000]	training's rmse: 2.62396	valid_1's rmse: 3.04137
Early stopping, best iteration is:
[1188]	training's rmse: 2.53514	valid_1's rmse: 3.03545
----
fold n°6
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.96068	valid_1's rmse: 3.32344
[1000]	training's rmse: 2.62003	valid_1's rmse: 3.17666
[1500]	training's rmse: 2.39609	valid_1's rmse: 3.15827
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 2.39609	valid_1's rmse: 3.15827
----
fold n°7
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.94637	valid_1's rmse: 3.42669
[1000]	training's rmse: 2.60136	valid_1's rmse: 3.29838
Early stopping, best iteration is:
[1239]	training's rmse: 2.48692	valid_1's rmse: 3.28793
----
fold n°8
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.95951	valid_1's rmse: 3.13541
[1000]	training's rmse: 2.61093	valid_1's rmse: 3.05667
Early stopping, best iteration is:
[10