In [98]:
from IPython.display import display
import numpy as np
# import modin.pandas as pd
import pandas as pd
import datetime
import time
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE, f_regression

import itertools

import warnings
import json

warnings.filterwarnings('ignore')

plt.style.use("fivethirtyeight")
sns.set_style({'font.sans-serif': ['simsun', 'Arial']})
sns.set_style('darkgrid', {'font.sans-serif': ['simhei', 'Arial']})
%matplotlib inline

# np.random.seed(4590)
data_path = r'./train_data.csv'
df = pd.read_csv(data_path)
df.head(5)

Unnamed: 0,企业编号,企业总评分,软著数量,作品著作数量,项目数量,纳税A级年份_2014,纳税A级年份_2015,纳税A级年份_2016,纳税A级年份_2017,资质证书数量,...,应收账款周转天数(天)_mean,应收账款周转天数(天)_max,应收账款周转天数(天)_min,应收账款周转天数(天)_std,应收账款周转天数(天)滚动增长_mean,存货周转天数(天)_mean,存货周转天数(天)_max,存货周转天数(天)_min,存货周转天数(天)_std,存货周转天数(天)滚动增长_mean
0,1001,75.374276,1.0,1.0,1.0,1.0,2.0,1.0,1.0,9.0,...,107.58927,191.707773,63.791689,44.495607,0.151392,414.778035,1089.655763,176.283983,325.371499,1.562757
1,1002,79.830122,2.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,...,46.903333,56.59,39.83,6.234116,0.023916,6.506667,7.04,5.01,0.702335,0.04533
2,1003,78.318264,2.0,0.0,1.0,1.0,1.0,0.0,1.0,442.0,...,84.275556,139.91,56.02,33.143654,-0.040224,54.918889,75.54,38.01,11.089465,0.031792
3,1004,83.253376,0.0,6.0,1.0,0.0,0.0,2.0,1.0,1.0,...,26.72,35.36,17.29,6.024438,0.081857,6.954444,7.9,6.24,0.618448,0.021711
4,1005,83.291493,6.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,...,94.05,110.26,77.85,9.652235,0.012921,108.584444,357.19,44.16,101.728838,0.344086


In [99]:
y = df[['企业编号', '企业总评分']]
x = df.drop(['企业总评分'], axis=1)

xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=31)
ytrain_id = ytrain['企业编号']
ytrain = ytrain['企业总评分']
ytest_id = ytest['企业编号']
ytest = ytest['企业总评分']
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

id_train = xtrain['企业编号']
id_test = xtest['企业编号']
xtrain.drop(['企业编号'], axis=1, inplace=True)
xtest.drop(['企业编号'], axis=1, inplace=True)
feature_name=xtrain.columns.values

(2364, 307) (592, 307) (2364,) (592,)


In [0]:
nfolds = 10
folds = KFold(n_splits=nfolds, shuffle=True, random_state=15)

In [0]:
params = {
    # objective and metric
    "objective": "regression",
    "metric": 'rmse',
    "boosting": "gbdt",

    # for the Leaf-wise (Best-first) Tree
    "num_leaves": 100, 
    # smaller than 2^(max_depth), This is the main parameter to control the complexity of the tree model. With larger can get higher accuracy 
    "min_data_in_leaf": 20, # Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting.
    "max_depth": 7, # limit the tree depth explicitly.

    # For Faster Speed
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
#     "max_bin": 5, # more small more faster
    "bagging_seed": 11,

    # For Better Accuracy
    "max_bin": 20, # lager but slower
    "learning_rate": 0.005,

    # deal with over fitting
      # Use small max_bin
      # Use small num_leaves
      # Use min_data_in_leaf and min_sum_hessian_in_leaf
      # Use bagging by set bagging_fraction and bagging_freq
      # Use feature sub-sampling by set feature_fraction
      # Use bigger training data
      # Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
      # Try max_depth to avoid growing deep tree
    "feature_fraction": 0.8,
    "lambda_l1": 0.1,

    "min_child_samples": 100,

    # other
    "n_estimators": 1500,
    "verbosity": -1,
    "n_jobs":-1,
}


In [0]:
def train_lgbm(xtrain, ytrain, xtest, ytest, params):
    feature_importance_df = np.zeros((xtrain.shape[1], nfolds))
    mvalid = np.zeros(len(xtrain))
    mfull = np.zeros(len(xtest))
    models = []


    for fold_, (trn_idx, val_idx) in enumerate(folds.split(xtrain.values, ytrain.values)):
        print('----')
        print("fold n°{}".format(fold_))

        x0, y0 = xtrain.iloc[trn_idx], ytrain.iloc[trn_idx]
        x1, y1 = xtrain.iloc[val_idx], ytrain.iloc[val_idx]

        trn_data = lgb.Dataset(x0, label=y0)
        val_data = lgb.Dataset(x1, label=y1)

        num_round = 10000
        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=150)

        mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)

        feature_importance_df[:, fold_] = clf.feature_importance()

        mfull += clf.predict(xtest,
                             num_iteration=clf.best_iteration) / folds.n_splits
        
        models.append(clf)


    test_error=np.sqrt(mean_squared_error(mfull.astype(int), ytest.astype(int)))
    print()
    print('rmse:', test_error)
    return models, test_error, feature_importance_df

In [35]:
models, test_error, feature_importance_df=train_lgbm(xtrain, ytrain, xtest, ytest, params)

----
fold n°0
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.22361	valid_1's rmse: 3.51007
[1000]	training's rmse: 1.64857	valid_1's rmse: 3.46088
[1500]	training's rmse: 1.27279	valid_1's rmse: 3.44803
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 1.27279	valid_1's rmse: 3.44803
----
fold n°1
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.27432	valid_1's rmse: 2.90939
Early stopping, best iteration is:
[806]	training's rmse: 1.86382	valid_1's rmse: 2.86939
----
fold n°2
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.27339	valid_1's rmse: 3.2435
[1000]	training's rmse: 1.67923	valid_1's rmse: 3.20472
Early stopping, best iteration is:
[861]	training's rmse: 1.81384	valid_1's rmse: 3.20347
----
fold n°3
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.28578	valid_1's rmse: 3.10941
Early stopping, best 

# Scalling

In [0]:
sc_X = StandardScaler()
xtrain_sc = sc_X.fit_transform(xtrain)
xtest_sc = sc_X.transform(xtest)
xtrain_sc=pd.DataFrame(xtrain_sc)
xtest_sc=pd.DataFrame(xtest_sc)

In [0]:
sc_Y = StandardScaler()
ytrain_sc = ytrain.values.reshape(-1, 1)
ytest_sc = ytest.values.reshape(-1, 1)
ytrain_sc = sc_Y.fit_transform(ytrain_sc)
# ytrain_sc
ytest_sc = sc_Y.transform(ytest_sc)
ytrain_sc=pd.DataFrame(ytrain_sc)
ytest_sc=pd.DataFrame(ytest_sc)

In [102]:
feature_importance_df = np.zeros((xtrain.shape[1], nfolds))
mvalid = np.zeros(len(xtrain))
mfull = np.zeros(len(xtest))


for fold_, (trn_idx, val_idx) in enumerate(folds.split(xtrain_sc.values, ytrain.values)):
    print('----')
    print("fold n°{}".format(fold_))

    x0, y0 = xtrain_sc.iloc[trn_idx], ytrain.iloc[trn_idx]
    x1, y1 = xtrain_sc.iloc[val_idx], ytrain.iloc[val_idx]

    trn_data = lgb.Dataset(x0, label=y0)
    val_data = lgb.Dataset(x1, label=y1)

    num_round = 10000
    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=150)
    
    mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)

    feature_importance_df[:, fold_] = clf.feature_importance()

    mfull += clf.predict(xtest_sc,
                         num_iteration=clf.best_iteration) / folds.n_splits
    
    
np.sqrt(mean_squared_error(mfull.astype(int), ytest.astype(int)))

----
fold n°0
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.22266	valid_1's rmse: 3.52962
[1000]	training's rmse: 1.64963	valid_1's rmse: 3.47491
Early stopping, best iteration is:
[1150]	training's rmse: 1.52106	valid_1's rmse: 3.46925
----
fold n°1
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.27162	valid_1's rmse: 2.91861
[1000]	training's rmse: 1.66565	valid_1's rmse: 2.87745
Early stopping, best iteration is:
[850]	training's rmse: 1.81013	valid_1's rmse: 2.87358
----
fold n°2
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.26765	valid_1's rmse: 3.25502
Early stopping, best iteration is:
[797]	training's rmse: 1.87535	valid_1's rmse: 3.21849
----
fold n°3
Training until validation scores don't improve for 150 rounds.
[500]	training's rmse: 2.28073	valid_1's rmse: 3.09527
[1000]	training's rmse: 1.70106	valid_1's rmse: 3.05292
Early stopping, best iteration is

3.1108463261409853

In [0]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data

In [0]:
xtrain_th=torch.from_numpy(xtrain_sc.values).float()
xtest_th=torch.from_numpy(xtest_sc.values).float()


# ytrain_sc=ytrain.values.reshape(-1,1)
ytrain_th=torch.from_numpy(ytrain_sc.values).float()
ytest_th=torch.from_numpy(ytest_sc.values).float()

In [0]:
xtrain_th, ytrain_th = Variable(xtrain_th), Variable(ytrain_th)
xtest_th, ytest_th = Variable(xtest_th), Variable(ytest_th)

In [0]:
net = torch.nn.Sequential(
        torch.nn.Linear(xtrain_th.shape[1], 200),
        torch.nn.LeakyReLU(),
        torch.nn.Linear(200, 500),
        torch.nn.LeakyReLU(),
        torch.nn.Linear(500, ytrain_th.shape[1]),
    )

In [0]:
optimizer = torch.optim.Adam(net.parameters(), lr=0.05)
# optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

In [108]:
for t in range(2000):
  
    prediction = net(xtrain_th)     # input x and predict based on x

    loss = loss_func(prediction, ytrain_th)     # must be (1. nn output, 2. target)
    optimizer.zero_grad()   # clear gradients for next train
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()
    test_prediction = net(xtest_th)
    test_loss = loss_func(test_prediction, ytest_th)
    if t % 100 ==0:
      print('Iter:',t,'Loss:',np.sqrt(test_loss.data.numpy()))

Iter: 0 Loss: 199.81769
Iter: 100 Loss: 1.7156724
Iter: 200 Loss: 2.2657738
Iter: 300 Loss: 1.9196215
Iter: 400 Loss: 1.4814303
Iter: 500 Loss: 2.9830425
Iter: 600 Loss: 1.7475812
Iter: 700 Loss: 1.8111961
Iter: 800 Loss: 3.2180133
Iter: 900 Loss: 3.6163108
Iter: 1000 Loss: 2.964648
Iter: 1100 Loss: 3.6365278
Iter: 1200 Loss: 3.887578
Iter: 1300 Loss: 3.860841
Iter: 1400 Loss: 2.571096
Iter: 1500 Loss: 2.9154544
Iter: 1600 Loss: 3.3825932
Iter: 1700 Loss: 2.618896
Iter: 1800 Loss: 2.614862
Iter: 1900 Loss: 2.6909013
