In [2]:
import datetime
import gc

import numpy as np
import pandas as pd

import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, RandomizedLasso)

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import warnings
import json
warnings.filterwarnings('ignore')

train_df = pd.read_csv(r'../../../Data/train_data.csv')
train_df.head(5)

Unnamed: 0,企业编号,企业总评分,软著数量,作品著作数量,项目数量,纳税A级年份_2014,纳税A级年份_2015,纳税A级年份_2016,纳税A级年份_2017,纳税A级年份增长1,...,应收账款周转天数(天)_mean,应收账款周转天数(天)_max,应收账款周转天数(天)_min,应收账款周转天数(天)_std,应收账款周转天数(天)滚动增长_mean,存货周转天数(天)_mean,存货周转天数(天)_max,存货周转天数(天)_min,存货周转天数(天)_std,存货周转天数(天)滚动增长_mean
0,1001,75.374276,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,107.58927,191.707773,63.791689,44.495607,0.151392,414.778035,1089.655763,176.283983,325.371499,1.562757
1,1002,79.830122,2.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,...,46.903333,56.59,39.83,6.234116,0.023916,6.506667,7.04,5.01,0.702335,0.04533
2,1003,78.318264,2.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,84.275556,139.91,56.02,33.143654,-0.040224,54.918889,75.54,38.01,11.089465,0.031792
3,1004,83.253376,0.0,6.0,1.0,0.0,0.0,2.0,1.0,0.0,...,26.72,35.36,17.29,6.024438,0.081857,6.954444,7.9,6.24,0.618448,0.021711
4,1005,83.291493,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,94.05,110.26,77.85,9.652235,0.012921,108.584444,357.19,44.16,101.728838,0.344086


# Train data

In [3]:
y = train_df[['企业编号', '企业总评分']]
x = train_df.drop(['企业总评分'], axis=1)
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=31)
ytrain_id = ytrain['企业编号']
ytrain = ytrain['企业总评分']
ytest_id = ytest['企业编号']
ytest = ytest['企业总评分']
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((2364, 335), (592, 335), (2364,), (592,))

In [4]:
xtrain.set_index(np.arange(0, len(xtrain)), inplace=True)
xtrain.head(5)
id_train = xtrain['企业编号']
id_test = xtest['企业编号']
xtrain.drop(['企业编号'], axis=1, inplace=True)
xtest.drop(['企业编号'], axis=1, inplace=True)

# Train model

In [5]:
# rf params
rf_params = {}
rf_params['n_estimators'] = 1000
rf_params['max_depth'] = 8
rf_params['min_samples_split'] = 100
rf_params['min_samples_leaf'] = 30

# xgb params
xgb_params = {}
xgb_params['n_estimators'] = 1000
xgb_params['gamm'] = 0.6
xgb_params['min_child_weight'] = 2
xgb_params['learning_rate'] = 0.05
xgb_params['max_depth'] = 10
xgb_params['subsample'] = 0.9
xgb_params['reg_lambda'] = 0.05
xgb_params['reg_alpha'] = 0.05
xgb_params['base_score'] = 0
xgb_params['seed'] = 0
xgb_params['silent'] = 0
xgb_params['n_jobs'] = -1


# lgb params
lgb_params = {}
lgb_params['metric'] = 'rmse'
lgb_params['objective'] = 'regression'
lgb_params['boosting'] = 'gbdt'
lgb_params['n_jobs'] = 8
lgb_params['n_estimators'] = 1000
lgb_params['min_child_samples'] = 100
lgb_params['lambda_l1'] = 0.1
lgb_params['max_bin'] = 7
lgb_params['learning_rate'] = 0.05  # shrinkage_rate     # or 'mae'
lgb_params['bagging_fraction'] = 0.7  # sub_row
lgb_params['bagging_freq'] = 1
lgb_params['num_leaves'] = 80       # num_leaf
lgb_params['min_data'] = 10      # min_data_in_leaf
lgb_params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
lgb_params['verbose'] = -1
lgb_params['feature_fraction'] = 0.8
lgb_params['bagging_seed'] = 11

In [8]:
# XGB model
xgb_model = XGBRegressor(**xgb_params)

# lgb model
lgb_model = LGBMRegressor(**lgb_params)

# RF model
rf_model = RandomForestRegressor(**rf_params)

# ET model
et_model = ExtraTreesRegressor()

# DecsionTree model
dt_model = DecisionTreeRegressor()

# AdaBoost model
ada_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8),
                              n_estimators=550, random_state=np.random.RandomState(1))

In [13]:
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(KFold(n_splits=self.n_splits, shuffle=True,
                           random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        print(S_test.shape)
        
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]
                print("Fit Model %d fold %d" % (i, j))
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            S_test[:, i] = S_test_i.mean(axis=1)
            

        results = cross_val_score(self.stacker, S_train, y, cv=5, scoring='r2')
        print("Stacker score: %.4f (%.4f)" % (results.mean(), results.std()))

        self.stacker.fit(S_train, y)
        print(S_test.shape)
        res = self.stacker.predict(S_test)[:]
        return res

In [14]:
stack = Ensemble(n_splits=5,
                 stacker=LinearRegression(),
                 base_models=(rf_model, xgb_model, lgb_model, et_model, ada_model, dt_model))

In [15]:
y_test = stack.fit_predict(xtrain, ytrain, xtest)
print(np.sqrt(mean_squared_error(y_test.astype(int), ytest.astype(int))),
      r2_score(y_test.astype(int), ytest.astype(int)))

(592, 6)
Fit Model 0 fold 0
Fit Model 0 fold 1
Fit Model 0 fold 2


KeyboardInterrupt: 

In [18]:
stack1 = stack

In [19]:
stacks = [stack1]

# Test data

In [57]:
test_df = pd.read_csv(r'../../../Data/test_data.csv')
X_test = test_df.drop(['企业编号'], axis=1)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test = X_test.fillna(0)
test_id = test_df['企业编号']

In [58]:
pre_test = stack.stacker.predict(X_test)

ValueError: shapes (520,334) and (6,) not aligned: 334 (dim 1) != 6 (dim 0)