In [1]:
import numpy as np
import pandas as pd
import os 
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
import datetime
import time
import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')




In [2]:
def get_predict_w(model, data, label='label', feature=[], cate_feature=[], random_state=2018, n_splits=5,
                  model_type='lgb'):
    if 'sample_weight' not in data.keys():
        data['sample_weight'] = 1
    model.random_state = random_state
    predict_label = 'predict_' + label
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    data[predict_label] = 0
    test_index = (data[label].isnull()) | (data[label] == -1)
    #~：逐位取反
    train_data = data[~test_index].reset_index(drop=True)
    test_data = data[test_index]

    for train_idx, val_idx in kfold.split(train_data):
        model.random_state = model.random_state + 1

        train_x = train_data.loc[train_idx][feature]
        train_y = train_data.loc[train_idx][label]

        test_x = train_data.loc[val_idx][feature]
        test_y = train_data.loc[val_idx][label]
        if model_type == 'lgb':
            try:
                model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100,
                          eval_metric='mae',
                          # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
            except:
                model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100,
                          eval_metric='mae',
                          # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                          # categorical_feature=cate_feature,
                          sample_weight=train_data.loc[train_idx]['sample_weight'],
                          verbose=100)
        elif model_type == 'ctb':
            model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100,
                      # eval_metric='mae',
                      # callbacks=[lgb.reset_parameter(learning_rate=lambda iter: max(0.005, 0.5 * (0.99 ** iter)))],
                      cat_features=cate_feature,
                      sample_weight=train_data.loc[train_idx]['sample_weight'],
                      verbose=100)
            
            
        train_data.loc[val_idx, predict_label] = model.predict(test_x)
        if len(test_data) != 0:
            test_data[predict_label] = test_data[predict_label] + model.predict(test_data[feature])
    test_data[predict_label] = test_data[predict_label] / n_splits
    # print(mse(train_data[label], train_data[predict_label]) * 5, train_data[predict_label].mean(),
    #       test_data[predict_label].mean())

    return pd.concat([train_data, test_data], sort=True, ignore_index=True), test_data





In [3]:
data_cgb=pd.read_csv('H:/pythonchengx_u/CCFchengche/Date/train_test_o.csv')



In [4]:
data_cgb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36960 entries, 0 to 36959
Data columns (total 8 columns):
adcode             36960 non-null int64
carCommentVolum    36960 non-null float64
model              36960 non-null object
newsReplyVolum     36960 non-null float64
popularity         36960 non-null float64
regMonth           36960 non-null int64
regYear            36960 non-null int64
label              31680 non-null float64
dtypes: float64(4), int64(3), object(1)
memory usage: 2.3+ MB


In [5]:
data_cgb.head()

Unnamed: 0,adcode,carCommentVolum,model,newsReplyVolum,popularity,regMonth,regYear,label
0,310000,11.0,3c974920a76ac9c1,106.0,1479.0,1,2016,292.0
1,530000,11.0,3c974920a76ac9c1,106.0,1594.0,1,2016,466.0
2,150000,11.0,3c974920a76ac9c1,106.0,1479.0,1,2016,257.0
3,110000,11.0,3c974920a76ac9c1,106.0,2370.0,1,2016,408.0
4,510000,11.0,3c974920a76ac9c1,106.0,3562.0,1,2016,610.0


In [6]:
data_cgb.columns

Index(['adcode', 'carCommentVolum', 'model', 'newsReplyVolum', 'popularity',
       'regMonth', 'regYear', 'label'],
      dtype='object')

In [7]:
data_cgb.isna().sum()

adcode                0
carCommentVolum       0
model                 0
newsReplyVolum        0
popularity            0
regMonth              0
regYear               0
label              5280
dtype: int64

In [8]:
features=list(data_cgb.columns)[:-1]

In [9]:
features


['adcode',
 'carCommentVolum',
 'model',
 'newsReplyVolum',
 'popularity',
 'regMonth',
 'regYear']

In [4]:
data=data_cgb
num_feat = ['regMonth', 'regYear', 'popularity', 'carCommentVolum', 'newsReplyVolum']
# cate_feat = ['bodyType', 'model', 'province']
cate_feat = ['adcode', 'model']
# for i in have_null:
#     data[i] = data[i].astype('str')

for i in cate_feat:
    data[i] = data[i].astype('category')
features = num_feat + cate_feat


In [5]:
random_state=2018
if 'sample_weight' not in data.keys():
    data['sample_weight'] = 1
label='label'
predict_label = 'predict_' + label
data[predict_label] = 0
test_index = (data[label].isnull()) | (data[label] == -1)
#~：逐位取反
train_data = data[~test_index].reset_index(drop=True)
test_data = data[test_index]
train_y = train_data.label
train_x = train_data.drop('label', axis=1)


In [12]:
from sklearn.model_selection import GridSearchCV
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=43, max_depth=6,
                              metric='mse', bagging_fraction = 0.8,feature_fraction = 0.8)

params_test1={ 'max_depth': range(3,9,1),  'num_leaves':range(50, 300, 30),'learning_rate ':np.linspace(0.03, 0.3, 10),'min_data_in_leaf':range(10, 150, 20)}
gsearch1 = GridSearchCV(estimator=model_lgb, param_grid=params_test1, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
gsearch1.fit(train_x, train_y)
print(gsearch1.best_params_)
print(gsearch1.best_score_)




Fitting 5 folds for each of 3780 candidates, totalling 18900 fits
{'learning_rate ': 0.03, 'max_depth': 8, 'min_data_in_leaf': 30, 'num_leaves': 230}
-108571.08001906337


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:    8.6s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   22.7s
[Parallel(n_jobs=4)]: Done 1576 tasks      | elapsed:   49.9s
[Parallel(n_jobs=4)]: Done 2476 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 3576 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 4876 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done 6376 tasks      | elapsed:  3.5min
[Parallel(n_jobs=4)]: Done 8076 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done 9976 tasks      | elapsed:  5.6min
[Parallel(n_jobs=4)]: Done 12076 tasks      | elapsed:  6.8min
[Parallel(n_jobs=4)]: Done 14376 tasks      | elapsed:  8.1min
[Parallel(n_jobs=4)]: Done 16876 tasks      | elapsed:  9.6min
[Parallel(n_jobs=4)]: Done 18900 out of 18900 | elapsed: 10.8min finished


In [17]:
from sklearn.model_selection import GridSearchCV
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=43, max_depth=6,
                              metric='mse', bagging_fraction = 0.8,feature_fraction = 0.8)

params_test1={ 'max_depth': range(6,9,1),  'num_leaves':range(50, 300, 20),'learning_rate ':np.linspace(0.01, 0.5, 10),'min_data_in_leaf':range(10, 90, 10)}
gsearch2 = GridSearchCV(estimator=model_lgb, param_grid=params_test1, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
gsearch2.fit(train_x, train_y)
print(gsearch2.best_params_)
print(gsearch2.best_score_)



Fitting 5 folds for each of 3120 candidates, totalling 15600 fits
{'learning_rate ': 0.01, 'max_depth': 8, 'min_data_in_leaf': 20, 'num_leaves': 170}
-107687.08374737418


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   19.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   35.0s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:   56.9s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  3.8min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed:  5.4min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done 9792 tasks      | elapsed:  7.4min
[Parallel(n_jobs=4)]: Done 11242 tasks      | elapsed:  8.5mi

In [13]:

lgb_model = lgb.LGBMRegressor(min_data_in_leaf= 30,num_leaves=230,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=8, learning_rate=0.03, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=2000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)






Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 192.508
[200]	valid_0's l1: 148.641
[300]	valid_0's l1: 129.325
[400]	valid_0's l1: 116.826
[500]	valid_0's l1: 110.223
[600]	valid_0's l1: 105.455
[700]	valid_0's l1: 101.987
[800]	valid_0's l1: 99.3554
[900]	valid_0's l1: 97.001
[1000]	valid_0's l1: 95.469
[1100]	valid_0's l1: 94.2452
[1200]	valid_0's l1: 92.9931
[1300]	valid_0's l1: 91.9281
[1400]	valid_0's l1: 91.3764
[1500]	valid_0's l1: 90.5247
[1600]	valid_0's l1: 89.9381
[1700]	valid_0's l1: 89.4372
[1800]	valid_0's l1: 88.8513
[1900]	valid_0's l1: 88.4013
[2000]	valid_0's l1: 88.0501
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 88.0501
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 200.598
[200]	valid_0's l1: 155.185
[300]	valid_0's l1: 131.929
[400]	valid_0's l1: 120.797
[500]	valid_0's l1: 115.168
[600]	valid_0's l1: 109.5
[700]	valid_0's l1: 105.91
[800]	valid_0's l1: 103.584
[900]	v

In [23]:
lgb_model = lgb.LGBMRegressor(min_data_in_leaf= 30,num_leaves=230,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=8, learning_rate=0.03, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=3000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)



Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 192.508
[200]	valid_0's l1: 148.641
[300]	valid_0's l1: 129.325
[400]	valid_0's l1: 116.826
[500]	valid_0's l1: 110.223
[600]	valid_0's l1: 105.455
[700]	valid_0's l1: 101.987
[800]	valid_0's l1: 99.3554
[900]	valid_0's l1: 97.001
[1000]	valid_0's l1: 95.469
[1100]	valid_0's l1: 94.2452
[1200]	valid_0's l1: 92.9931
[1300]	valid_0's l1: 91.9281
[1400]	valid_0's l1: 91.3764
[1500]	valid_0's l1: 90.5247
[1600]	valid_0's l1: 89.9381
[1700]	valid_0's l1: 89.4372
[1800]	valid_0's l1: 88.8513
[1900]	valid_0's l1: 88.4013
[2000]	valid_0's l1: 88.0501
[2100]	valid_0's l1: 87.5527
[2200]	valid_0's l1: 87.1427
[2300]	valid_0's l1: 86.8578
[2400]	valid_0's l1: 86.5982
[2500]	valid_0's l1: 86.3142
[2600]	valid_0's l1: 86.0056
[2700]	valid_0's l1: 85.7384
[2800]	valid_0's l1: 85.5153
[2900]	valid_0's l1: 85.1779
[3000]	valid_0's l1: 84.9487
Did not meet early stopping. Best iteration is:
[2999]	valid_0's l1: 84.9455
T

In [24]:
lgb_model = lgb.LGBMRegressor(min_data_in_leaf= 30,num_leaves=230,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=8, learning_rate=0.03, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=4000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 192.508
[200]	valid_0's l1: 148.641
[300]	valid_0's l1: 129.325
[400]	valid_0's l1: 116.826
[500]	valid_0's l1: 110.223
[600]	valid_0's l1: 105.455
[700]	valid_0's l1: 101.987
[800]	valid_0's l1: 99.3554
[900]	valid_0's l1: 97.001
[1000]	valid_0's l1: 95.469
[1100]	valid_0's l1: 94.2452
[1200]	valid_0's l1: 92.9931
[1300]	valid_0's l1: 91.9281
[1400]	valid_0's l1: 91.3764
[1500]	valid_0's l1: 90.5247
[1600]	valid_0's l1: 89.9381
[1700]	valid_0's l1: 89.4372
[1800]	valid_0's l1: 88.8513
[1900]	valid_0's l1: 88.4013
[2000]	valid_0's l1: 88.0501
[2100]	valid_0's l1: 87.5527
[2200]	valid_0's l1: 87.1427
[2300]	valid_0's l1: 86.8578
[2400]	valid_0's l1: 86.5982
[2500]	valid_0's l1: 86.3142
[2600]	valid_0's l1: 86.0056
[2700]	valid_0's l1: 85.7384
[2800]	valid_0's l1: 85.5153
[2900]	valid_0's l1: 85.1779
[3000]	valid_0's l1: 84.9487
[3100]	valid_0's l1: 84.7872
[3200]	valid_0's l1: 84.6499
[3300]	valid_0's l1:

In [25]:
lgb_model = lgb.LGBMRegressor(min_data_in_leaf= 30,num_leaves=230,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=8, learning_rate=0.03, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=5000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 192.508
[200]	valid_0's l1: 148.641
[300]	valid_0's l1: 129.325
[400]	valid_0's l1: 116.826
[500]	valid_0's l1: 110.223
[600]	valid_0's l1: 105.455
[700]	valid_0's l1: 101.987
[800]	valid_0's l1: 99.3554
[900]	valid_0's l1: 97.001
[1000]	valid_0's l1: 95.469
[1100]	valid_0's l1: 94.2452
[1200]	valid_0's l1: 92.9931
[1300]	valid_0's l1: 91.9281
[1400]	valid_0's l1: 91.3764
[1500]	valid_0's l1: 90.5247
[1600]	valid_0's l1: 89.9381
[1700]	valid_0's l1: 89.4372
[1800]	valid_0's l1: 88.8513
[1900]	valid_0's l1: 88.4013
[2000]	valid_0's l1: 88.0501
[2100]	valid_0's l1: 87.5527
[2200]	valid_0's l1: 87.1427
[2300]	valid_0's l1: 86.8578
[2400]	valid_0's l1: 86.5982
[2500]	valid_0's l1: 86.3142
[2600]	valid_0's l1: 86.0056
[2700]	valid_0's l1: 85.7384
[2800]	valid_0's l1: 85.5153
[2900]	valid_0's l1: 85.1779
[3000]	valid_0's l1: 84.9487
[3100]	valid_0's l1: 84.7872
[3200]	valid_0's l1: 84.6499
[3300]	valid_0's l1:

In [27]:
lgb_model = lgb.LGBMRegressor(min_data_in_leaf= 30,num_leaves=230,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=8, learning_rate=0.03, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=6000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 192.508
[200]	valid_0's l1: 148.641
[300]	valid_0's l1: 129.325
[400]	valid_0's l1: 116.826
[500]	valid_0's l1: 110.223
[600]	valid_0's l1: 105.455
[700]	valid_0's l1: 101.987
[800]	valid_0's l1: 99.3554
[900]	valid_0's l1: 97.001
[1000]	valid_0's l1: 95.469
[1100]	valid_0's l1: 94.2452
[1200]	valid_0's l1: 92.9931
[1300]	valid_0's l1: 91.9281
[1400]	valid_0's l1: 91.3764
[1500]	valid_0's l1: 90.5247
[1600]	valid_0's l1: 89.9381
[1700]	valid_0's l1: 89.4372
[1800]	valid_0's l1: 88.8513
[1900]	valid_0's l1: 88.4013
[2000]	valid_0's l1: 88.0501
[2100]	valid_0's l1: 87.5527
[2200]	valid_0's l1: 87.1427
[2300]	valid_0's l1: 86.8578
[2400]	valid_0's l1: 86.5982
[2500]	valid_0's l1: 86.3142
[2600]	valid_0's l1: 86.0056
[2700]	valid_0's l1: 85.7384
[2800]	valid_0's l1: 85.5153
[2900]	valid_0's l1: 85.1779
[3000]	valid_0's l1: 84.9487
[3100]	valid_0's l1: 84.7872
[3200]	valid_0's l1: 84.6499
[3300]	valid_0's l1:

In [28]:
#最优
lgb_model = lgb.LGBMRegressor(min_data_in_leaf= 30,num_leaves=230,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=8, learning_rate=0.03, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=8000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 192.508
[200]	valid_0's l1: 148.641
[300]	valid_0's l1: 129.325
[400]	valid_0's l1: 116.826
[500]	valid_0's l1: 110.223
[600]	valid_0's l1: 105.455
[700]	valid_0's l1: 101.987
[800]	valid_0's l1: 99.3554
[900]	valid_0's l1: 97.001
[1000]	valid_0's l1: 95.469
[1100]	valid_0's l1: 94.2452
[1200]	valid_0's l1: 92.9931
[1300]	valid_0's l1: 91.9281
[1400]	valid_0's l1: 91.3764
[1500]	valid_0's l1: 90.5247
[1600]	valid_0's l1: 89.9381
[1700]	valid_0's l1: 89.4372
[1800]	valid_0's l1: 88.8513
[1900]	valid_0's l1: 88.4013
[2000]	valid_0's l1: 88.0501
[2100]	valid_0's l1: 87.5527
[2200]	valid_0's l1: 87.1427
[2300]	valid_0's l1: 86.8578
[2400]	valid_0's l1: 86.5982
[2500]	valid_0's l1: 86.3142
[2600]	valid_0's l1: 86.0056
[2700]	valid_0's l1: 85.7384
[2800]	valid_0's l1: 85.5153
[2900]	valid_0's l1: 85.1779
[3000]	valid_0's l1: 84.9487
[3100]	valid_0's l1: 84.7872
[3200]	valid_0's l1: 84.6499
[3300]	valid_0's l1:

In [29]:
data_lgb.to_csv("lgb_302300038000.csv")

In [19]:
lgb_model = lgb.LGBMRegressor(learning_rate=0.01,max_depth=8,min_data_in_leaf= 20,num_leaves=170,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
      min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=6000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)




Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 289.58
[200]	valid_0's l1: 218.663
[300]	valid_0's l1: 187.054
[400]	valid_0's l1: 167.299
[500]	valid_0's l1: 155.817
[600]	valid_0's l1: 146.332
[700]	valid_0's l1: 139.155
[800]	valid_0's l1: 132.694
[900]	valid_0's l1: 126.263
[1000]	valid_0's l1: 121.853
[1100]	valid_0's l1: 118.227
[1200]	valid_0's l1: 114.218
[1300]	valid_0's l1: 111.035
[1400]	valid_0's l1: 109.149
[1500]	valid_0's l1: 106.61
[1600]	valid_0's l1: 104.938
[1700]	valid_0's l1: 103.276
[1800]	valid_0's l1: 101.73
[1900]	valid_0's l1: 100.413
[2000]	valid_0's l1: 99.2943
[2100]	valid_0's l1: 98.1729
[2200]	valid_0's l1: 97.1806
[2300]	valid_0's l1: 96.4091
[2400]	valid_0's l1: 95.7046
[2500]	valid_0's l1: 94.9208
[2600]	valid_0's l1: 94.2496
[2700]	valid_0's l1: 93.5496
[2800]	valid_0's l1: 92.9653
[2900]	valid_0's l1: 92.3164
[3000]	valid_0's l1: 91.7224
[3100]	valid_0's l1: 91.2736
[3200]	valid_0's l1: 90.8463
[3300]	valid_0's l1: 

In [26]:
lgb_model = lgb.LGBMRegressor(learning_rate=0.01,max_depth=8,min_data_in_leaf= 20,num_leaves=170,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
      min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=4000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 289.58
[200]	valid_0's l1: 218.663
[300]	valid_0's l1: 187.054
[400]	valid_0's l1: 167.299
[500]	valid_0's l1: 155.817
[600]	valid_0's l1: 146.332
[700]	valid_0's l1: 139.155
[800]	valid_0's l1: 132.694
[900]	valid_0's l1: 126.263
[1000]	valid_0's l1: 121.853
[1100]	valid_0's l1: 118.227
[1200]	valid_0's l1: 114.218
[1300]	valid_0's l1: 111.035
[1400]	valid_0's l1: 109.149
[1500]	valid_0's l1: 106.61
[1600]	valid_0's l1: 104.938
[1700]	valid_0's l1: 103.276
[1800]	valid_0's l1: 101.73
[1900]	valid_0's l1: 100.413
[2000]	valid_0's l1: 99.2943
[2100]	valid_0's l1: 98.1729
[2200]	valid_0's l1: 97.1806
[2300]	valid_0's l1: 96.4091
[2400]	valid_0's l1: 95.7046
[2500]	valid_0's l1: 94.9208
[2600]	valid_0's l1: 94.2496
[2700]	valid_0's l1: 93.5496
[2800]	valid_0's l1: 92.9653
[2900]	valid_0's l1: 92.3164
[3000]	valid_0's l1: 91.7224
[3100]	valid_0's l1: 91.2736
[3200]	valid_0's l1: 90.8463
[3300]	valid_0's l1: 

In [None]:
lgb_model = lgb.LGBMRegressor(learning_rate=0.01,max_depth=8,min_data_in_leaf= 20,num_leaves=170,
    reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
      min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=2000
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)


In [9]:
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=-1, learning_rate=0.12, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 146.754
[200]	valid_0's l1: 119.417
[300]	valid_0's l1: 108.279
[400]	valid_0's l1: 101.885
[500]	valid_0's l1: 99.0229
[600]	valid_0's l1: 96.6566
[700]	valid_0's l1: 94.9533
[800]	valid_0's l1: 93.7366
[900]	valid_0's l1: 92.5498
[1000]	valid_0's l1: 92.159
Did not meet early stopping. Best iteration is:
[997]	valid_0's l1: 92.1135
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 153.698
[200]	valid_0's l1: 127.265
[300]	valid_0's l1: 114.132
[400]	valid_0's l1: 108.801
[500]	valid_0's l1: 106.314
[600]	valid_0's l1: 103.299
[700]	valid_0's l1: 101.333
[800]	valid_0's l1: 100.166
[900]	valid_0's l1: 99.3506
[1000]	valid_0's l1: 98.358
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 98.358
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 155.163
[200]	valid_0's l1: 124.54
[300]	valid_0's l1: 114.933
[400]	valid_0's 

In [8]:
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=-1, learning_rate=0.05, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,num_iterations=1000
)
data_lgb_005, test_data_lgb_005 = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 181.869
[200]	valid_0's l1: 145.78
[300]	valid_0's l1: 128.981
[400]	valid_0's l1: 117.787
[500]	valid_0's l1: 111.537
[600]	valid_0's l1: 107.239
[700]	valid_0's l1: 104.001
[800]	valid_0's l1: 101.043
[900]	valid_0's l1: 98.3299
[1000]	valid_0's l1: 96.5694
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 96.5694
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 190.309
[200]	valid_0's l1: 153.94
[300]	valid_0's l1: 133.423
[400]	valid_0's l1: 122.904
[500]	valid_0's l1: 117.303
[600]	valid_0's l1: 111.885
[700]	valid_0's l1: 107.965
[800]	valid_0's l1: 105.746
[900]	valid_0's l1: 103.45
[1000]	valid_0's l1: 101.392
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 101.392
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 193.648
[200]	valid_0's l1: 153.002
[300]	valid_0's l1: 136.206
[400]	valid_0'

In [14]:
data_lgb.head()

Unnamed: 0,adcode,carCommentVolum,label,model,newsReplyVolum,popularity,predict_label,regMonth,regYear,sample_weight
0,310000,11.0,292.0,3c974920a76ac9c1,106.0,1479.0,344.319093,1,2016,1
1,530000,11.0,466.0,3c974920a76ac9c1,106.0,1594.0,481.212134,1,2016,1
2,150000,11.0,257.0,3c974920a76ac9c1,106.0,1479.0,183.70624,1,2016,1
3,110000,11.0,408.0,3c974920a76ac9c1,106.0,2370.0,434.470023,1,2016,1
4,510000,11.0,610.0,3c974920a76ac9c1,106.0,3562.0,637.968153,1,2016,1


In [27]:

lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=0., reg_lambda=0.01, objective='mse', metric='mae',
    max_depth=-1, learning_rate=0.05, min_child_samples=20,
    n_estimators=1000, subsample=0.7, colsample_bytree=0.7, subsample_freq=1
)
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 146.754
[200]	valid_0's l1: 119.417
[300]	valid_0's l1: 108.279
[400]	valid_0's l1: 101.885
[500]	valid_0's l1: 99.0229
[600]	valid_0's l1: 96.6566
[700]	valid_0's l1: 94.9533
[800]	valid_0's l1: 93.7366
[900]	valid_0's l1: 92.5498
[1000]	valid_0's l1: 92.159
Did not meet early stopping. Best iteration is:
[997]	valid_0's l1: 92.1135
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 153.698
[200]	valid_0's l1: 127.265
[300]	valid_0's l1: 114.132
[400]	valid_0's l1: 108.801
[500]	valid_0's l1: 106.314
[600]	valid_0's l1: 103.299
[700]	valid_0's l1: 101.333
[800]	valid_0's l1: 100.166
[900]	valid_0's l1: 99.3506
[1000]	valid_0's l1: 98.358
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 98.358
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 155.163
[200]	valid_0's l1: 124.54
[300]	valid_0's l1: 114.933
[400]	valid_0's 

In [32]:
lgb_model = lgb.LGBMRegressor()
data_lgb, predict_label = get_predict_w(lgb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019, n_splits=5)



Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 49308.5	valid_0's l1: 120.979
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 49308.5	valid_0's l1: 120.979
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 53485.4	valid_0's l1: 123.001
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 53485.4	valid_0's l1: 123.001
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 56174.5	valid_0's l1: 125.9
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 56174.5	valid_0's l1: 125.9
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 51288.4	valid_0's l1: 124.088
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 51288.4	valid_0's l1: 124.088
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 53077.6	valid_0's l1: 124.628
Did not meet early stopping. Best iteration is:

In [17]:
data_lgb.head()


Unnamed: 0,adcode,carCommentVolum,label,model,newsReplyVolum,popularity,predict_label,regMonth,regYear,sample_weight,lgb
0,310000,11.0,395,3c974920a76ac9c1,106.0,1479.0,395.156677,1,2016,1,395.156677
1,530000,11.0,414,3c974920a76ac9c1,106.0,1594.0,413.963795,1,2016,1,413.963795
2,150000,11.0,215,3c974920a76ac9c1,106.0,1479.0,215.188753,1,2016,1,215.188753
3,110000,11.0,556,3c974920a76ac9c1,106.0,2370.0,555.716926,1,2016,1,555.716926
4,510000,11.0,559,3c974920a76ac9c1,106.0,3562.0,558.700597,1,2016,1,558.700597


In [28]:
from catboost import CatBoostRegressor, CatBoostClassifier
import catboost as ctb
ctb_model = CatBoostRegressor()
ctb_model = CatBoostRegressor(iterations=1000,learning_rate=0.05, depth=7, loss_function='MAE', 
                          eval_metric='MAE', random_seed=1)

data_ctb, predict_label = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')



0:	learn: 612.4635290	test: 593.6865009	best: 593.6865009 (0)	total: 18.8ms	remaining: 18.8s
100:	learn: 612.3638943	test: 593.5868696	best: 593.5868696 (100)	total: 3.08s	remaining: 27.4s
200:	learn: 612.2641938	test: 593.4871703	best: 593.4871703 (200)	total: 5.76s	remaining: 22.9s
300:	learn: 612.1644036	test: 593.3873803	best: 593.3873803 (300)	total: 7.85s	remaining: 18.2s
400:	learn: 612.0645259	test: 593.2875030	best: 593.2875030 (400)	total: 9.78s	remaining: 14.6s
500:	learn: 611.9646555	test: 593.1876332	best: 593.1876332 (500)	total: 11.5s	remaining: 11.4s
600:	learn: 611.8648145	test: 593.0877918	best: 593.0877918 (600)	total: 13.3s	remaining: 8.81s
700:	learn: 611.7649321	test: 592.9879075	best: 592.9879075 (700)	total: 15s	remaining: 6.38s
800:	learn: 611.6651283	test: 592.8880826	best: 592.8880826 (800)	total: 17.1s	remaining: 4.26s
900:	learn: 611.5652777	test: 592.7882273	best: 592.7882273 (900)	total: 19s	remaining: 2.09s
999:	learn: 611.4664297	test: 592.6893576	best:

In [None]:
data_ctb['ctb'] = data_ctb[predict_label]
data_ctb['label'] = data_ctb['ctb'].apply(lambda x: 0 if x < 0 else x)
data_ctb[data_ctb.label.isnull()][['label']].round().astype(int).to_csv('ccf_car_sales_ctb0902.csv', index=False)




In [29]:
data_ctb.head(20)

Unnamed: 0,adcode,carCommentVolum,label,model,newsReplyVolum,popularity,predict_label,regMonth,regYear,sample_weight
0,310000,11.0,292.0,3c974920a76ac9c1,106.0,1479.0,0.998146,1,2016,1
1,530000,11.0,466.0,3c974920a76ac9c1,106.0,1594.0,0.998163,1,2016,1
2,150000,11.0,257.0,3c974920a76ac9c1,106.0,1479.0,0.998291,1,2016,1
3,110000,11.0,408.0,3c974920a76ac9c1,106.0,2370.0,0.997962,1,2016,1
4,510000,11.0,610.0,3c974920a76ac9c1,106.0,3562.0,0.997753,1,2016,1
5,340000,11.0,206.0,3c974920a76ac9c1,106.0,1314.0,0.998365,1,2016,1
6,370000,11.0,503.0,3c974920a76ac9c1,106.0,3476.0,0.9976,1,2016,1
7,140000,11.0,236.0,3c974920a76ac9c1,106.0,1422.0,0.998086,1,2016,1
8,440000,11.0,3635.0,3c974920a76ac9c1,106.0,7182.0,0.997301,1,2016,1
9,450000,11.0,450.0,3c974920a76ac9c1,106.0,1163.0,0.998304,1,2016,1


In [30]:
ctb_model = CatBoostRegressor(iterations=1000,learning_rate=0.1, depth=7, loss_function='MAE', 
                          eval_metric='MAE', random_seed=1)

data_ctb, predict_label = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')

0:	learn: 612.4145754	test: 593.6375469	best: 593.6375469 (0)	total: 17.4ms	remaining: 17.4s
100:	learn: 607.4342303	test: 588.6572326	best: 588.6572326 (100)	total: 2.99s	remaining: 26.6s
200:	learn: 602.4486790	test: 583.6730899	best: 583.6730899 (200)	total: 4.85s	remaining: 19.3s
300:	learn: 597.4822572	test: 578.7117534	best: 578.7117534 (300)	total: 6s	remaining: 13.9s
400:	learn: 592.5571579	test: 573.7962434	best: 573.7962434 (400)	total: 7.53s	remaining: 11.2s
500:	learn: 587.6996247	test: 568.9532596	best: 568.9532596 (500)	total: 9.5s	remaining: 9.46s
600:	learn: 582.9034521	test: 564.1734913	best: 564.1734913 (600)	total: 11.8s	remaining: 7.82s
700:	learn: 578.1874071	test: 559.4736537	best: 559.4736537 (700)	total: 14.2s	remaining: 6.07s
800:	learn: 573.5570519	test: 554.8572579	best: 554.8572579 (800)	total: 16.9s	remaining: 4.21s
900:	learn: 569.0057278	test: 550.3279108	best: 550.3279108 (900)	total: 19.9s	remaining: 2.18s
999:	learn: 564.5850359	test: 545.9384513	best:

In [31]:
data_ctb.head(20)

Unnamed: 0,adcode,carCommentVolum,label,model,newsReplyVolum,popularity,predict_label,regMonth,regYear,sample_weight
0,310000,11.0,292.0,3c974920a76ac9c1,106.0,1479.0,48.400565,1,2016,1
1,530000,11.0,466.0,3c974920a76ac9c1,106.0,1594.0,48.544238,1,2016,1
2,150000,11.0,257.0,3c974920a76ac9c1,106.0,1479.0,48.398797,1,2016,1
3,110000,11.0,408.0,3c974920a76ac9c1,106.0,2370.0,49.031031,1,2016,1
4,510000,11.0,610.0,3c974920a76ac9c1,106.0,3562.0,48.906963,1,2016,1
5,340000,11.0,206.0,3c974920a76ac9c1,106.0,1314.0,48.362881,1,2016,1
6,370000,11.0,503.0,3c974920a76ac9c1,106.0,3476.0,48.973518,1,2016,1
7,140000,11.0,236.0,3c974920a76ac9c1,106.0,1422.0,48.391441,1,2016,1
8,440000,11.0,3635.0,3c974920a76ac9c1,106.0,7182.0,49.170427,1,2016,1
9,450000,11.0,450.0,3c974920a76ac9c1,106.0,1163.0,48.274277,1,2016,1


In [16]:
from catboost import CatBoostRegressor, CatBoostClassifier
ctb_model = CatBoostRegressor()
data_ctb, predict_label = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')

0:	learn: 979.7329647	test: 926.5130836	best: 926.5130836 (0)	total: 89.7ms	remaining: 1m 29s
100:	learn: 441.7731645	test: 400.7756110	best: 400.7756110 (100)	total: 3.52s	remaining: 31.4s
200:	learn: 391.3831055	test: 353.5643746	best: 353.5643746 (200)	total: 6.77s	remaining: 26.9s
300:	learn: 368.7231480	test: 336.0041145	best: 336.0041145 (300)	total: 9.72s	remaining: 22.6s
400:	learn: 347.6329050	test: 320.2515051	best: 320.2515051 (400)	total: 12.8s	remaining: 19.2s
500:	learn: 330.8920428	test: 308.7225969	best: 308.7225969 (500)	total: 16.2s	remaining: 16.1s
600:	learn: 319.0123137	test: 300.1165721	best: 300.1165721 (600)	total: 19.4s	remaining: 12.9s
700:	learn: 311.1466984	test: 293.6337677	best: 293.6337677 (700)	total: 22.6s	remaining: 9.64s
800:	learn: 306.1443342	test: 289.6560534	best: 289.6553129 (798)	total: 25.6s	remaining: 6.35s
900:	learn: 299.3733776	test: 284.1340223	best: 284.1340223 (900)	total: 28.8s	remaining: 3.17s
999:	learn: 294.7170611	test: 280.7798923	

In [36]:
data_ctb.head(20)



Unnamed: 0,adcode,carCommentVolum,label,model,newsReplyVolum,popularity,predict_label,regMonth,regYear,sample_weight
0,310000,11.0,292.0,3c974920a76ac9c1,106.0,1479.0,287.396569,1,2016,1
1,530000,11.0,466.0,3c974920a76ac9c1,106.0,1594.0,341.227381,1,2016,1
2,150000,11.0,257.0,3c974920a76ac9c1,106.0,1479.0,240.915092,1,2016,1
3,110000,11.0,408.0,3c974920a76ac9c1,106.0,2370.0,479.512411,1,2016,1
4,510000,11.0,610.0,3c974920a76ac9c1,106.0,3562.0,549.619826,1,2016,1
5,340000,11.0,206.0,3c974920a76ac9c1,106.0,1314.0,297.182276,1,2016,1
6,370000,11.0,503.0,3c974920a76ac9c1,106.0,3476.0,782.533868,1,2016,1
7,140000,11.0,236.0,3c974920a76ac9c1,106.0,1422.0,231.177955,1,2016,1
8,440000,11.0,3635.0,3c974920a76ac9c1,106.0,7182.0,2110.017264,1,2016,1
9,450000,11.0,450.0,3c974920a76ac9c1,106.0,1163.0,382.200722,1,2016,1


In [20]:
ctb_model = CatBoostRegressor(iterations=5000,learning_rate=0.5)
data_ctb, predict_label = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')




0:	learn: 713.5941974	test: 661.3735910	best: 661.3735910 (0)	total: 27.4ms	remaining: 2m 17s
100:	learn: 300.2462866	test: 295.4223429	best: 295.3888800 (97)	total: 3.2s	remaining: 2m 35s
200:	learn: 270.7007834	test: 276.3965597	best: 276.3965597 (200)	total: 6.14s	remaining: 2m 26s
300:	learn: 259.7369584	test: 271.0654472	best: 271.0492313 (296)	total: 9.15s	remaining: 2m 22s
400:	learn: 252.8090575	test: 268.4517404	best: 268.2642046 (395)	total: 12.1s	remaining: 2m 18s
500:	learn: 246.4807191	test: 265.7575378	best: 265.7575378 (500)	total: 15.1s	remaining: 2m 15s
600:	learn: 241.8586446	test: 262.9532847	best: 262.9089871 (591)	total: 18.1s	remaining: 2m 12s
700:	learn: 237.7434162	test: 260.4133070	best: 260.4133070 (700)	total: 21.2s	remaining: 2m 9s
800:	learn: 235.3329284	test: 258.7027886	best: 258.7027886 (800)	total: 24.3s	remaining: 2m 7s
900:	learn: 232.9879971	test: 257.4943543	best: 257.3796727 (886)	total: 27.1s	remaining: 2m 3s
1000:	learn: 230.7298908	test: 256.955

In [21]:

data_ctb.head(20)

Unnamed: 0,adcode,carCommentVolum,label,model,newsReplyVolum,popularity,predict_label,regMonth,regYear,sample_weight
0,310000,11.0,292.0,3c974920a76ac9c1,106.0,1479.0,278.610586,1,2016,1
1,530000,11.0,466.0,3c974920a76ac9c1,106.0,1594.0,328.278765,1,2016,1
2,150000,11.0,257.0,3c974920a76ac9c1,106.0,1479.0,336.770168,1,2016,1
3,110000,11.0,408.0,3c974920a76ac9c1,106.0,2370.0,416.279245,1,2016,1
4,510000,11.0,610.0,3c974920a76ac9c1,106.0,3562.0,644.360352,1,2016,1
5,340000,11.0,206.0,3c974920a76ac9c1,106.0,1314.0,307.790797,1,2016,1
6,370000,11.0,503.0,3c974920a76ac9c1,106.0,3476.0,851.798713,1,2016,1
7,140000,11.0,236.0,3c974920a76ac9c1,106.0,1422.0,223.184129,1,2016,1
8,440000,11.0,3635.0,3c974920a76ac9c1,106.0,7182.0,3049.835593,1,2016,1
9,450000,11.0,450.0,3c974920a76ac9c1,106.0,1163.0,477.047839,1,2016,1


In [24]:
predict_label

'predict_label'

In [25]:
ctb_model = CatBoostRegressor(iterations=5000,learning_rate=0.7,loss_function="RMSE")
data_ctb, predict_label = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')

0:	learn: 634.9824646	test: 582.5155087	best: 582.5155087 (0)	total: 28.1ms	remaining: 2m 20s
100:	learn: 299.0629869	test: 302.5374598	best: 302.5374598 (100)	total: 3.29s	remaining: 2m 39s
200:	learn: 277.4907939	test: 289.3109838	best: 289.0505088 (198)	total: 6.47s	remaining: 2m 34s
300:	learn: 264.2614407	test: 280.9830991	best: 280.9830991 (300)	total: 9.65s	remaining: 2m 30s
400:	learn: 256.1606035	test: 276.7250955	best: 276.5731914 (393)	total: 12.6s	remaining: 2m 24s
500:	learn: 248.3319309	test: 273.8249722	best: 273.8249722 (500)	total: 15.7s	remaining: 2m 20s
600:	learn: 244.9248474	test: 272.3291910	best: 271.9952956 (585)	total: 19s	remaining: 2m 18s
700:	learn: 239.3857882	test: 269.2608605	best: 268.8302737 (677)	total: 22s	remaining: 2m 14s
800:	learn: 236.4137937	test: 265.9622649	best: 265.9622649 (800)	total: 25.1s	remaining: 2m 11s
900:	learn: 233.2140266	test: 265.4514646	best: 265.3418334 (880)	total: 28.1s	remaining: 2m 7s
1000:	learn: 230.1258587	test: 263.446

In [27]:
ctb_model = CatBoostRegressor(iterations=5000,learning_rate=0.15,loss_function="RMSE")
data_ctb, predict_label = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')


0:	learn: 904.3979866	test: 851.7783514	best: 851.7783514 (0)	total: 27.2ms	remaining: 2m 16s
100:	learn: 331.0171173	test: 312.0127987	best: 312.0127987 (100)	total: 3.23s	remaining: 2m 36s
200:	learn: 302.2514983	test: 290.7868178	best: 290.7868178 (200)	total: 6.29s	remaining: 2m 30s
300:	learn: 289.6024343	test: 283.0084528	best: 283.0084528 (300)	total: 9.06s	remaining: 2m 21s
400:	learn: 278.5744090	test: 275.6575401	best: 275.6549918 (399)	total: 12.1s	remaining: 2m 19s
500:	learn: 270.8127408	test: 270.5724197	best: 270.5724197 (500)	total: 15.2s	remaining: 2m 16s
600:	learn: 265.9440541	test: 266.8965452	best: 266.8965452 (600)	total: 18.5s	remaining: 2m 15s
700:	learn: 261.8081536	test: 264.2594461	best: 264.2541673 (697)	total: 21.3s	remaining: 2m 10s
800:	learn: 256.3995947	test: 259.4613957	best: 259.4613957 (800)	total: 24.3s	remaining: 2m 7s
900:	learn: 252.2560722	test: 257.4346552	best: 257.3799495 (896)	total: 27.2s	remaining: 2m 3s
1000:	learn: 249.2784578	test: 255.

In [28]:
ctb_model = CatBoostRegressor(iterations=10000,learning_rate=0.05,loss_function="RMSE")
data_ctb, predict_label = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')



0:	learn: 966.9229081	test: 913.8197712	best: 913.8197712 (0)	total: 29.9ms	remaining: 4m 59s
100:	learn: 403.4534404	test: 365.7935225	best: 365.7935225 (100)	total: 3.66s	remaining: 5m 58s
200:	learn: 359.6144306	test: 331.1568133	best: 331.1568133 (200)	total: 6.8s	remaining: 5m 31s
300:	learn: 331.1049800	test: 311.5698898	best: 311.5698898 (300)	total: 10.3s	remaining: 5m 31s
400:	learn: 312.6379623	test: 298.2009619	best: 298.2009619 (400)	total: 13.9s	remaining: 5m 31s
500:	learn: 302.8842061	test: 291.1307435	best: 291.1062395 (499)	total: 17.1s	remaining: 5m 24s
600:	learn: 294.2406591	test: 284.4298345	best: 284.4298321 (599)	total: 20.6s	remaining: 5m 22s
700:	learn: 288.3283389	test: 280.0320236	best: 280.0257096 (699)	total: 23.7s	remaining: 5m 14s
800:	learn: 284.5178030	test: 277.7651874	best: 277.7649971 (799)	total: 26.5s	remaining: 5m 4s
900:	learn: 279.1784628	test: 274.9506737	best: 274.9481866 (896)	total: 29.7s	remaining: 4m 59s
1000:	learn: 277.1929276	test: 273.

In [29]:
ctb_model = CatBoostRegressor(iterations=20000,learning_rate=0.01,loss_function="RMSE")
data_ctb_01, predict_label_01 = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')


0:	learn: 992.6355976	test: 939.2924841	best: 939.2924841 (0)	total: 33ms	remaining: 11m
100:	learn: 618.2645786	test: 562.0726875	best: 562.0726875 (100)	total: 3.54s	remaining: 11m 37s
200:	learn: 501.1368020	test: 457.0353068	best: 457.0353068 (200)	total: 7.22s	remaining: 11m 50s
300:	learn: 441.0289974	test: 400.3376937	best: 400.3376937 (300)	total: 10.7s	remaining: 11m 38s
400:	learn: 416.2488872	test: 376.7771274	best: 376.7771274 (400)	total: 14.2s	remaining: 11m 34s
500:	learn: 401.2810894	test: 362.9771509	best: 362.9771509 (500)	total: 17.7s	remaining: 11m 30s
600:	learn: 388.6221696	test: 352.2661751	best: 352.2661751 (600)	total: 21.1s	remaining: 11m 21s
700:	learn: 379.5138418	test: 344.6741474	best: 344.6741474 (700)	total: 24.6s	remaining: 11m 17s
800:	learn: 372.8416613	test: 339.0643752	best: 339.0643752 (800)	total: 28.1s	remaining: 11m 14s
900:	learn: 366.2750687	test: 334.1356729	best: 334.1356729 (900)	total: 31.4s	remaining: 11m 6s
1000:	learn: 358.9171954	test:

In [31]:
ctb_model = CatBoostRegressor(iterations=30000,learning_rate=0.03,loss_function="RMSE")
data_ctb_03, predict_label_03 = get_predict_w(ctb_model, data, label='label',
                                    feature=features, cate_feature=cate_feat,
                                    random_state=2019,model_type='ctb')


0:	learn: 979.7329647	test: 926.5130836	best: 926.5130836 (0)	total: 29.5ms	remaining: 14m 43s
100:	learn: 441.7731645	test: 400.7756110	best: 400.7756110 (100)	total: 3.32s	remaining: 16m 23s
200:	learn: 391.3831055	test: 353.5643746	best: 353.5643746 (200)	total: 6.58s	remaining: 16m 15s
300:	learn: 368.7231480	test: 336.0041145	best: 336.0041145 (300)	total: 9.7s	remaining: 15m 57s
400:	learn: 347.6329050	test: 320.2515051	best: 320.2515051 (400)	total: 13.1s	remaining: 16m 9s
500:	learn: 330.8920428	test: 308.7225969	best: 308.7225969 (500)	total: 16.5s	remaining: 16m 12s
600:	learn: 319.0123137	test: 300.1165721	best: 300.1165721 (600)	total: 19.8s	remaining: 16m 9s
700:	learn: 311.1466984	test: 293.6337677	best: 293.6337677 (700)	total: 23.4s	remaining: 16m 18s
800:	learn: 306.1443342	test: 289.6560534	best: 289.6553129 (798)	total: 26.5s	remaining: 16m 4s
900:	learn: 299.3733776	test: 284.1340223	best: 284.1340223 (900)	total: 29.9s	remaining: 16m 5s
1000:	learn: 294.6727060	tes

In [32]:
data_ctb_03.head()



Unnamed: 0,adcode,carCommentVolum,label,model,newsReplyVolum,popularity,predict_label,regMonth,regYear,sample_weight
0,310000,11.0,292.0,3c974920a76ac9c1,106.0,1479.0,322.079223,1,2016,1
1,530000,11.0,466.0,3c974920a76ac9c1,106.0,1594.0,376.656707,1,2016,1
2,150000,11.0,257.0,3c974920a76ac9c1,106.0,1479.0,253.657893,1,2016,1
3,110000,11.0,408.0,3c974920a76ac9c1,106.0,2370.0,522.136991,1,2016,1
4,510000,11.0,610.0,3c974920a76ac9c1,106.0,3562.0,663.364711,1,2016,1


In [34]:
predict_label_03



'predict_label'

In [35]:
data_ctb_03.to_csv("ctb03.csv")