In [1]:
import sys
import numpy as np
import pandas as pd
import os 
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [17]:
# Directory that holds every competition CSV read below.
path  = '../Data/Train/'
train_sales       = pd.read_csv(path + 'train_sales_data.csv')
train_search      = pd.read_csv(path + 'train_search_data.csv')
train_user        = pd.read_csv(path + 'train_user_reply_data.csv')
evaluation_public = pd.read_csv(path + 'evaluation_public.csv')
submit_example    = pd.read_csv(path + 'submit_example.csv')

# Stack the sales history on top of the evaluation rows, then left-join the
# search table (per province/adcode/model/month) and the user-reply table
# (per model/month) onto the combined frame.
data = (
    pd.concat([train_sales, evaluation_public], ignore_index=True)
      .merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
      .merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
)

# `label` is the modelling target; it starts out as a plain copy of salesVolume.
data['label'] = data['salesVolume']
# Rows without an id get 0 so the column can be stored as int.
data['id'] = data['id'].fillna(0).astype(int)

# One representative sales row per model, used to look up each model's bodyType.
look1 = train_sales.drop_duplicates('model').set_index('model')
data['bodyType'] = data['model'].map(look1['bodyType'])

# Replace bodyType / model values by integer codes in order of first appearance.
for i in ['bodyType', 'model']:
    uniques = data[i].unique()
    code_of = dict(zip(uniques, range(data[i].nunique())))
    data[i] = data[i].map(code_of)

# Running month index: 2016-01 -> 1, 2017-01 -> 13, 2018-01 -> 25, ...
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

In [18]:
def get_stat_feature(df_):
    """Add 1..12-month lag features for the target and the auxiliary signals.

    A numeric key ``model_adcode_mt`` is built from ``adcode``, ``model`` and
    ``mt``. For every source column and every lag ``k`` the key is shifted
    forward by ``k`` and used as a lookup index, so that a row at month ``mt``
    receives the value the same (adcode, model) pair had at ``mt - k``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain ``adcode``, ``model``, ``mt``, ``label``, ``popularity``,
        ``newsReplyVolum`` and ``carCommentVolum``. It is not modified.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``df_`` with the helper key columns and the
        ``shift_model_adcode_mt_<col>_<k>`` lag columns appended, and the list
        of lag-column names in creation order.
    """
    df = df_.copy()
    stat_feat = []

    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] / 100 + df['mt']

    for col in tqdm(['label','popularity', 'newsReplyVolum', 'carCommentVolum']):
        for lag in range(1, 13):
            key_name = 'model_adcode_mt_{}_{}'.format(col, lag)
            feat_name = 'shift_' + key_name
            stat_feat.append(feat_name)

            # Key of the row that sits `lag` months after each row.
            df[key_name] = df['model_adcode_mt'] + lag

            # Only rows where the source value is known may serve as lookup
            # entries; rows whose key finds no entry end up as NaN.
            known_rows = df[df[col].notnull()]
            lookup = known_rows.set_index(key_name)[col]
            df[feat_name] = df['model_adcode_mt'].map(lookup)

    return df, stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    """Return ``1 - mean(NRMSE)`` where NRMSE is computed per group.

    Predictions are first clipped at zero and rounded to integers. For every
    value of ``group`` the RMSE between the rounded predictions and ``label``
    is divided by the group's mean label; the result is one minus the mean of
    these ratios over all groups. The value is printed and returned.

    Unlike the previous implementation, the ``pred`` and ``group`` arguments
    are honoured (they used to be silently replaced by ``'pred_label'`` and
    ``'model'``) and ``data`` is left unmodified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``pred``, ``label`` and ``group`` columns.
    pred : str
        Name of the prediction column.
    label : str
        Name of the ground-truth column.
    group : str
        Name of the column that defines the groups.

    Returns
    -------
    numpy.float64
        ``1 - mean(rmse_g / mean_label_g)``; inf or nan if a group's mean
        label is zero.
    """
    # Same post-processing the submission receives: no negatives, whole units.
    clipped = data[pred].clip(lower=0).round().astype(int)

    # Work on a private frame so the caller's data is never written to.
    work = data[[group, label]].copy()
    work['_sq_err'] = (clipped - work[label]) ** 2

    grouped = work.groupby(group)
    rmse = grouped['_sq_err'].mean() ** 0.5
    nrmse_score = rmse / grouped[label].mean()

    # np.mean over the raw values so that inf / nan ratios propagate instead
    # of being skipped.
    result = 1 - np.mean(nrmse_score.values)
    print(result)
    return result

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):
    """Fit a LightGBM or XGBoost regressor with early stopping.

    Both variants are trained for up to 2000 rounds with early stopping after
    100 rounds, and are evaluated on the training and the validation split.

    Parameters
    ----------
    train_x, train_y
        Training features and target.
    valid_x, valid_y
        Validation features and target.
    m_type : {'lgb', 'xgb'}
        Which library to use.

    Returns
    -------
    The fitted ``lgb.LGBMRegressor`` or ``xgb.XGBRegressor``.

    Raises
    ------
    ValueError
        If ``m_type`` is neither ``'lgb'`` nor ``'xgb'`` (previously this
        surfaced as an UnboundLocalError on ``return model``).

    Notes
    -----
    The ``'lgb'`` branch reads the module-level ``cate_feat`` list for the
    categorical feature names.
    """
    if m_type not in ('lgb', 'xgb'):
        raise ValueError(
            "unknown m_type {!r}; expected 'lgb' or 'xgb'".format(m_type)
        )

    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              categorical_feature=cate_feat,
              early_stopping_rounds=100, verbose=100)
    else:
        # `min_child_samples` used to be passed here as well, but it is a
        # LightGBM parameter that XGBoost does not use, so dropping it leaves
        # the fitted model unchanged.
        # NOTE(review): if a minimum leaf size was intended, the XGBoost
        # counterpart is `min_child_weight` - enabling it would change results.
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000,
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9,
                                colsample_bytree=0.7, eval_metric = 'rmse'
                                )
        # The validation split is listed last; the run log shows XGBoost
        # monitoring `validation_1-rmse` for early stopping.
        model.fit(train_x, train_y,
              eval_set=[(train_x, train_y),(valid_x, valid_y)],
              early_stopping_rounds=100, verbose=100)
    return model

def get_train_model(df_, m, m_type='lgb'):
    """Validate on month ``m-4``, refit on all history, and forecast month ``m``.

    Months are addressed through the ``mt`` column. A first model is trained
    on ``mt`` 13..m-5 with early stopping on ``mt == m-4`` and scored with
    ``score``. The model is then refit on ``mt`` 13..m-1 using the early-stopped
    iteration count plus 100 rounds, and used to predict ``mt == m``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Feature frame containing ``mt``, ``label``, ``id`` and every column
        named in the module-level ``features`` list. It is not modified.
    m : int
        Month index (``mt``) to forecast.
    m_type : {'lgb', 'xgb'}
        Model family, forwarded to ``get_model_type``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``sub`` with columns ``id`` and ``forecastVolum`` (clipped at zero,
        rounded, int) for the rows of month ``m``, and the raw validation
        predictions of the first model for month ``m-4``.

    Notes
    -----
    Reads the module-level ``features`` and ``cate_feat`` lists.
    """
    df = df_.copy()
    # Dataset split by month index
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )
    # Final train / validation matrices
    train_x = df[train_idx][features]
    train_y = df[train_idx]['label']
    valid_x = df[valid_idx][features]
    valid_y = df[valid_idx]['label']
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)
    # offline: score the early-stopped model on the validation month.
    # An explicit copy is passed so that nothing score() does can write
    # through to (or warn about) a slice of `df`.
    df['pred_label'] = model.predict(df[features])
    score(df[valid_idx].copy())
    # online: refit on every month before `m`, with a fixed number of rounds
    # derived from the early-stopped iteration.
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df[all_idx][features], df[all_idx]['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df[all_idx][features], df[all_idx]['label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:',df[valid_idx]['pred_label'].mean())
    print('true  mean:',df[valid_idx]['label'].mean())
    print('test  mean:',df[test_idx]['forecastVolum'].mean())
    # Stage result: build the submission rows on an explicit copy instead of
    # assigning a new column onto a chained-indexing slice of `df`.
    sub = df.loc[test_idx, ['id']].copy()
    sub['forecastVolum'] = df.loc[test_idx, 'forecastVolum'].clip(lower=0).round().astype(int)
    return sub, df.loc[valid_idx, 'pred_label']



In [20]:
# Settings shared by every forecasting round. `cate_feat` and `features` are
# module-level names that get_model_type / get_train_model read directly.
m_type = 'xgb'
cate_feat = ['adcode','bodyType','model','regMonth']

# Forecast mt 25..28 (2018-01 .. 2018-04) one month at a time.
for month in range(25, 29):
    # Lag features are rebuilt from `data` every round, so they include the
    # forecasts written back into `label` by the previous rounds.
    data_df, stat_feat = get_stat_feature(data)
    num_feat = ['regYear'] + stat_feat

    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        # NOTE(review): the categorical columns are label-encoded through
        # their string form, so codes follow string order (e.g. '10' before
        # '2'), not numeric order - confirm this is intended.
        lbl = LabelEncoder()
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))

    features = num_feat + cate_feat
    print(len(features), len(set(features)))

    sub,val_pred = get_train_model(data_df, month, m_type)

    # Write this month's forecast back so the next round can use it as a lag.
    target_rows = (data.regMonth==(month-24))&(data.regYear==2018)
    for target_col in ('salesVolume', 'label'):
        data.loc[target_rows, target_col] = sub['forecastVolum'].values

# Collect all 2018 rows into the submission layout and write it out.
sub = (
    data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
        .rename(columns={'salesVolume': 'forecastVolum'})
)
sub[['id','forecastVolum']].round().astype(int).to_csv('CCF_sales_xgb_try.csv', index=False)

100%|██████████| 4/4 [00:01<00:00,  3.50it/s]
100%|██████████| 4/4 [00:00<00:00, 20.30it/s]


53 53
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:841.37500	validation_1-rmse:1046.80139
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:800.24805	validation_1-rmse:1004.64282
[200]	validation_0-rmse:253.39433	validation_1-rmse:372.38864
[300]	validation_0-rmse:114.49166	validation_1-rmse:220.34698
[400]	validation_0-rmse:99.50015	validation_1-rmse:212.14720
[500]	validation_0-rmse:91.10363	validation_1-rmse:208.58571
[600]	validation_0-rmse:84.65532	validation_1-rmse:205.86395
[700]	validation_0-rmse:78.55547	validation_1-rmse:204.96515
[800]	validation_0-rmse:73.73212	validation_1-rmse:204.83130
Stopping. Best iteration:
[721]	validation_0-rmse:77.47983	validation_1-rmse:204.62773

0.7657544509760641


 25%|██▌       | 1/4 [00:00<00:00,  5.79it/s]

valid mean: 623.69904
true  mean: 649.3121212121212
test  mean: 456.784


100%|██████████| 4/4 [00:01<00:00,  3.47it/s]
100%|██████████| 4/4 [00:00<00:00, 19.93it/s]


53 53
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:866.60773	validation_1-rmse:1007.37116
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:825.41480	validation_1-rmse:966.16974
[200]	validation_0-rmse:262.70447	validation_1-rmse:402.34760
[300]	validation_0-rmse:117.48153	validation_1-rmse:306.34750
[400]	validation_0-rmse:103.20481	validation_1-rmse:299.37936
[500]	validation_0-rmse:94.61218	validation_1-rmse:296.40799
[600]	validation_0-rmse:88.12896	validation_1-rmse:293.99170
[700]	validation_0-rmse:82.11618	validation_1-rmse:293.63409
[800]	validation_0-rmse:77.38625	validation_1-rmse:292.60638
[900]	validation_0-rmse:72.71927	validation_1-rmse:293.36060
Stopping. Best iteration:
[800]	validation_0-rmse:77.38625	validation_1-rmse:292.60638

0.6258320756572351


  0%|          | 0/4 [00:00<?, ?it/s]

valid mean: 462.27182
true  mean: 616.5537878787878
test  mean: 290.6545


100%|██████████| 4/4 [00:01<00:00,  2.74it/s]
100%|██████████| 4/4 [00:00<00:00, 17.24it/s]


53 53
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:881.69659	validation_1-rmse:1071.88635
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:840.50262	validation_1-rmse:1029.28833
[200]	validation_0-rmse:270.21490	validation_1-rmse:393.94186
[300]	validation_0-rmse:123.25295	validation_1-rmse:257.29971
[400]	validation_0-rmse:103.93626	validation_1-rmse:246.46590
[500]	validation_0-rmse:95.43678	validation_1-rmse:243.10078
[600]	validation_0-rmse:88.66101	validation_1-rmse:241.95264
[700]	validation_0-rmse:83.81142	validation_1-rmse:240.47372
[800]	validation_0-rmse:78.89437	validation_1-rmse:239.96500
Stopping. Best iteration:
[739]	validation_0-rmse:81.42223	validation_1-rmse:239.33704

0.7244271565588031


  0%|          | 0/4 [00:00<?, ?it/s]

valid mean: 568.633
true  mean: 673.0143939393939
test  mean: 343.50113


100%|██████████| 4/4 [00:01<00:00,  2.76it/s]
100%|██████████| 4/4 [00:00<00:00, 19.30it/s]


53 53
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:900.64740	validation_1-rmse:1451.30005
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:859.31818	validation_1-rmse:1408.47778
[200]	validation_0-rmse:275.60077	validation_1-rmse:771.66724
[300]	validation_0-rmse:125.00137	validation_1-rmse:687.82886
[400]	validation_0-rmse:106.63680	validation_1-rmse:689.07733
Stopping. Best iteration:
[329]	validation_0-rmse:117.62277	validation_1-rmse:684.66028

0.44141680989529486
valid mean: 517.17346
true  mean: 899.8204545454546
test  mean: 315.39142
