In [1]:
from IPython.display import display
import numpy as np
# import modin.pandas as pd
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb


from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE, f_regression

import warnings
import json

# Suppress every Python warning for the rest of the session. Note that this
# also hides library warnings (e.g. about deprecated or conflicting arguments).
warnings.filterwarnings('ignore')

# Global plot styling. The calls below all mutate matplotlib's global rc state,
# so their order matters.
plt.style.use("fivethirtyeight")
# Font fallbacks for sans-serif text; 'simsun' / 'simhei' are presumably there
# so that the Chinese column names render in figures -- TODO confirm the fonts
# are installed on the machine running the notebook.
# NOTE(review): the second call sets 'font.sans-serif' again, so it looks like
# it overrides the font list from the first call -- confirm whether the first
# call is still needed.
sns.set_style({'font.sans-serif': ['simsun', 'Arial']})
sns.set_style('darkgrid', {'font.sans-serif': ['simhei', 'Arial']})
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# np.random.seed(4590)
# CSV inputs, given relative to the notebook's working directory.
nf_data_path = r'../../../Data/noFinacialFeatures.csv'
f_data_path = r'../../../Data/FinacialFeatures.csv'

# Load the feature-level JSON. The context manager closes the file handle
# deterministically instead of leaving an open handle to the garbage collector.
with open(r'../../FeatureEngineering/feature_level.json') as feature_level_file:
    flevel = json.load(feature_level_file)

In [2]:
# Read both feature tables.
nf_df = pd.read_csv(nf_data_path)
# The target column is removed from the second table before merging, so the
# merged frame carries a single '企业总评分' column (the one from `nf_df`).
f_df = pd.read_csv(f_data_path).drop(columns=['企业总评分'])

# Left-join on the company ID: every row of `nf_df` is kept, and `merge`
# already returns a new frame, so no defensive copy is required.
df = nf_df.merge(f_df, how='left', on='企业编号')

In [3]:
# The target frame keeps the company ID beside the score so that both can be
# recovered, row-aligned, after the split.
y = df[['企业编号', '企业总评分']]
x = df.drop(columns=['企业总评分'])

# 80/20 hold-out split with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, random_state=0, test_size=0.2)

# Break each target frame into its ID series and its score series.
ytrain_id, ytrain = ytrain['企业编号'], ytrain['企业总评分']
ytest_id, ytest = ytest['企业编号'], ytest['企业总评分']

# Displayed as the cell output: shapes of the four pieces.
tuple(part.shape for part in (xtrain, xtest, ytrain, ytest))

((2364, 307), (592, 307), (2364,), (592,))

In [4]:
# Keep the company IDs of each split, then remove the ID column from the
# model inputs.
#
# The IDs are looked up in the unmodified feature frame `x` via the split's row
# index, and the drop rebinds the name instead of mutating in place, so this
# cell can be re-run without raising KeyError on the already-removed column.
# Assumes `x` has a unique row index (true for the frame produced by `merge`).
id_train = x.loc[xtrain.index, '企业编号']
id_test = x.loc[xtest.index, '企业编号']
xtrain = xtrain.drop(columns=['企业编号'], errors='ignore')
xtest = xtest.drop(columns=['企业编号'], errors='ignore')

In [10]:
# Cross-validation setup used by the training routine: the number of folds and
# a shuffled, seeded splitter built from it.
nfolds = 10
folds = KFold(
    n_splits=nfolds,
    random_state=15,
    shuffle=True,
)

In [5]:
def train_lgbm(xtrain, ytrain, xtest, ytest, params, cv=None):
    """Train one LightGBM model per CV fold and score the averaged hold-out prediction.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    ytrain : pandas.Series
        Training target, aligned row-for-row with ``xtrain``.
    xtest : pandas.DataFrame
        Hold-out features; every fold model predicts on it.
    ytest : pandas.Series or numpy.ndarray
        Hold-out target, used only for the final printed score.
    params : dict
        Parameters forwarded unchanged to ``lgb.train``.
    cv : splitter, optional
        Object exposing ``split(X, y)`` and ``n_splits``. When omitted, the
        module-level ``folds`` splitter is used (the original behaviour). The
        splitter should partition the rows (as ``KFold`` does) for the printed
        out-of-fold score to be meaningful.

    Returns
    -------
    models : list
        The fitted boosters, one per fold, in fold order.
    feature_importance_df : numpy.ndarray
        Array of shape ``(n_features, n_splits)``; column ``k`` holds
        ``clf.feature_importance()`` of fold ``k``. Despite the name this is a
        plain array, not a DataFrame.

    Side effects
    ------------
    Prints per-fold progress, the out-of-fold RMSE on the training rows and the
    RMSE of the fold-averaged prediction on the hold-out rows.
    """
    splitter = folds if cv is None else cv
    n_splits = splitter.n_splits

    # Sized from the splitter itself (not a separate global counter) so the
    # matrix width can never disagree with the number of folds iterated.
    feature_importance_df = np.zeros((xtrain.shape[1], n_splits))
    mvalid = np.zeros(len(xtrain))  # out-of-fold predictions for the training rows
    mfull = np.zeros(len(xtest))    # fold-averaged predictions for the hold-out rows
    models = []

    # Upper bound on boosting rounds passed to lgb.train.
    # NOTE(review): per the LightGBM docs an iteration-count alias inside
    # `params` (e.g. "n_estimators") takes precedence over this argument.
    num_round = 10000

    for fold_, (trn_idx, val_idx) in enumerate(
            splitter.split(xtrain.values, ytrain.values)):
        print('----')
        print("fold n°{}".format(fold_))

        x0, y0 = xtrain.iloc[trn_idx], ytrain.iloc[trn_idx]
        x1, y1 = xtrain.iloc[val_idx], ytrain.iloc[val_idx]

        trn_data = lgb.Dataset(x0, label=y0)
        val_data = lgb.Dataset(x1, label=y1)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are accepted by
        # the LightGBM version that produced this notebook's output; LightGBM
        # 4.x expects the equivalent callbacks instead -- confirm before
        # upgrading the dependency.
        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=150)

        # Predict with the early-stopped iteration count for both the
        # validation fold and the hold-out set.
        mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)

        feature_importance_df[:, fold_] = clf.feature_importance()

        mfull += clf.predict(xtest,
                             num_iteration=clf.best_iteration) / n_splits

        models.append(clf)

    # Cross-validated score from the out-of-fold predictions collected above.
    print("CV (out-of-fold) RMSE: {:.5f}".format(
        np.sqrt(mean_squared_error(ytrain, mvalid))))

    # Hold-out score on integer-cast values, unchanged from the original.
    # NOTE(review): astype(int) truncates the predictions toward zero rather
    # than rounding them -- confirm this matches the intended metric.
    print(np.sqrt(mean_squared_error(mfull.astype(int), ytest.astype(int))))
    return models, feature_importance_df

In [14]:
# LightGBM training parameters passed to the training routine.
params = {
    # objective and metric
    "objective": "regression",
    "metric": 'rmse',
    "boosting": "gbdt",

    # for the Leaf-wise (Best-first) Tree
    # num_leaves is the main control on tree complexity: larger values can give
    # higher accuracy. Keep it smaller than 2^(max_depth) (= 128 here).
    "num_leaves": 100,
    # A large value can avoid growing too deep a tree, but may cause
    # under-fitting.
    # NOTE(review): the original dict also set "min_child_samples": 100, which
    # the LightGBM docs list as an alias of this same parameter. LightGBM
    # appears to resolve such a conflict in favour of the canonical name, so 20
    # should be the value that was actually in effect; the conflicting alias
    # was removed. Confirm which value was intended.
    "min_data_in_leaf": 20,
    "max_depth": 7,  # limit the tree depth explicitly.

    # For Faster Speed
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    "bagging_seed": 11,

    # For Better Accuracy
    "max_bin": 40,  # larger is more accurate but slower
    "learning_rate": 0.005,

    # Ways to deal with over-fitting:
    # - use a small max_bin
    # - use a small num_leaves
    # - use min_data_in_leaf and min_sum_hessian_in_leaf
    # - use bagging by setting bagging_fraction and bagging_freq
    # - use feature sub-sampling by setting feature_fraction
    # - use more training data
    # - try lambda_l1, lambda_l2 and min_gain_to_split for regularization
    # - try max_depth to avoid growing deep trees
    "feature_fraction": 0.8,
    "lambda_l1": 0.1,

    # other
    # Alias of num_iterations (number of boosting rounds / trees, default 100).
    # NOTE(review): when present in params this is expected to take precedence
    # over the round count passed as an argument to lgb.train -- confirm.
    "n_estimators": 1500,
    "verbosity": -1,
    "n_jobs": 8,
}

In [15]:
# Run the cross-validated training on the prepared split; keeps the per-fold
# boosters and the per-fold feature-importance matrix for later inspection.
models, feature_importance_df = train_lgbm(xtrain, ytrain, xtest, ytest, params)

----
fold n°0
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.31482	valid_1's rmse: 2.97517
[1000]	training's rmse: 1.71141	valid_1's rmse: 2.93083
Early stopping, best iteration is:
[954]	training's rmse: 1.75365	valid_1's rmse: 2.92937
----
fold n°1
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.29952	valid_1's rmse: 3.31636
[1000]	training's rmse: 1.69656	valid_1's rmse: 3.25221
Early stopping, best iteration is:
[1015]	training's rmse: 1.68349	valid_1's rmse: 3.2514
----
fold n°2
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.31977	valid_1's rmse: 3.09026
[1000]	training's rmse: 1.70887	valid_1's rmse: 3.04538
Early stopping, best iteration is:
[863]	training's rmse: 1.84514	valid_1's rmse: 3.04178
----
fold n°3
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.31227	valid_1's rmse: 3.2901
[1000]	training's rmse: 1.70596	val