# Import

In [1]:
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# 数据导入

In [2]:
# data process
path = './Train/'
train_sales = pd.read_csv(path+'train_sales_data.csv')
train_search = pd.read_csv(path+'train_search_data.csv')
train_user = pd.read_csv(path+'train_user_reply_data.csv')
evaluation_public = pd.read_csv(path+'evaluation_public.csv')
submit_example = pd.read_csv(path+'submit_example.csv')
data = pd.concat([train_sales, evaluation_public], ignore_index=True)
data = data.merge(train_search, 'left', on=[
                  'province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
data['bodyType'] = data['model'].map(
    train_sales.drop_duplicates('model').set_index('model')['bodyType'])
# LabelEncoder
for i in ['bodyType', 'model']:
    data[i] = data[i].map(
        dict(zip(data[i].unique(), range(data[i].nunique()))))
data['month'] = (data['regYear'] - 2016) * 12 + data['regMonth']

# 数据提取

In [3]:
def get_stat_feature(df_):
    df = df_.copy()
    stat_feat = []
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['month']
    for col in tqdm(['label', 'popularity']):
        # shift
        for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
            stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col, i))
            df['model_adcode_mt_{}_{}'.format(col, i)] = df['model_adcode_mt'] + i
            df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col, i))
            df['shift_model_adcode_mt_{}_{}'.format(col, i)] = df['model_adcode_mt'].map(df_last[col])
    print(df)
    return df, stat_feat

In [4]:
# Evaluation index
def score(data, pred='pred_label', label='label', group='model'):
    data['pred_label'] = data['pred_label'].apply(
        lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)

In [5]:
# model selection
def get_model_type(train_x, train_y, valid_x, valid_y, m_type='lgb'):
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
            max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
            n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
        )
        model.fit(train_x, train_y,
                  eval_set=[(train_x, train_y), (valid_x, valid_y)],
                  categorical_feature=cate_feat,
                  early_stopping_rounds=100, verbose=100)
    elif m_type == 'xgb':
        model = xgb.XGBRegressor(
            max_depth=5, learning_rate=0.05, n_estimators=2000,
            objective='reg:gamma', tree_method='hist', subsample=0.9,
            colsample_bytree=0.7, min_child_samples=5, eval_metric='rmse'
        )
        model.fit(train_x, train_y,
                  eval_set=[(train_x, train_y), (valid_x, valid_y)],
                  early_stopping_rounds=100, verbose=100)
    return model

In [6]:
# model traning
def get_train_model(df_, m, m_type='lgb'):
    df = df_.copy()
    # 数据集划分
    st = 13
    all_idx = (df['month'].between(st, m-1))
    train_idx = (df['month'].between(st, m-5))
    valid_idx = (df['month'].between(m-4, m-4))
    test_idx = (df['month'].between(m, m))
    print('all_idx  :', st, m-1)
    print('train_idx:', st, m-5)
    print('valid_idx:', m-4, m-4)
    print('test_idx :', m, m)
    # 最终确认
    train_x = df[train_idx][features]
    train_y = df[train_idx]['label']
    valid_x = df[valid_idx][features]
    valid_y = df[valid_idx]['label']
    # get model
    model = get_model_type(train_x, train_y, valid_x, valid_y, m_type)
    # offline
    df['pred_label'] = model.predict(df[features])
    best_score = score(df[valid_idx])
    # online
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df[all_idx][features], df[all_idx]
                  ['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df[all_idx][features], df[all_idx]['label'])
    df['forecastVolum'] = model.predict(df[features])
    print('valid mean:', df[valid_idx]['pred_label'].mean())
    print('true  mean:', df[valid_idx]['label'].mean())
    print('test  mean:', df[test_idx]['forecastVolum'].mean())
    # 阶段结果
    sub = df[test_idx][['id']]
    sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(
        lambda x: 0 if x < 0 else x).round().astype(int)
    return sub, df[valid_idx]['pred_label']

In [14]:
# forcast
for month in [25, 26, 27, 28]:
    m_type = 'lgb'

    data_df, stat_feat = get_stat_feature(data)

    num_feat = ['regYear'] + stat_feat
    cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']

    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))

    features = num_feat + cate_feat
    print(len(features), len(set(features)))

    sub, val_pred = get_train_model(data_df, month, m_type)
    data.loc[(data.regMonth == (month-24)) & (data.regYear == 2018),
             'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth == (month-24)) & (data.regYear == 2018),
             'label'] = sub['forecastVolum'].values
sub = data.loc[(data.regMonth >= 1) & (
    data.regYear == 2018), ['id', 'salesVolume']]
sub.columns = ['id', 'forecastVolum']
sub[['id', 'forecastVolum']].round().astype(
    int).to_csv('CCF_sales.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.27it/s]


29 29
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 8380.11	valid_1's l2: 35212.9
[200]	training's l2: 4328.9	valid_1's l2: 32265.1
[300]	training's l2: 2861.84	valid_1's l2: 31264.7
[400]	training's l2: 2057.24	valid_1's l2: 31106.1
[500]	training's l2: 1560	valid_1's l2: 31041.1
[600]	training's l2: 1203.18	valid_1's l2: 30963.3
[700]	training's l2: 955.087	valid_1's l2: 30857.1
[800]	training's l2: 765.785	valid_1's l2: 30823.3
[900]	training's l2: 626.389	valid_1's l2: 30795
[1000]	training's l2: 511.49	valid_1's l2: 30728.6
[1100]	training's l2: 428.885	valid_1's l2: 30684.4
[1200]	training's l2: 358.76	valid_1's l2: 30697.1
Early stopping, best iteration is:
[1103]	training's l2: 426.394	valid_1's l2: 30682.4
0.7529604253264104
valid mean: 596.3846372170245
true  mean: 649.3121212121212
test  mean: 498.74690853249854


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.29it/s]


29 29
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 8823.11	valid_1's l2: 42958.2
[200]	training's l2: 4877.87	valid_1's l2: 42614.3
[300]	training's l2: 3280.94	valid_1's l2: 41902.2
[400]	training's l2: 2383.53	valid_1's l2: 41610
[500]	training's l2: 1837	valid_1's l2: 41473.6
[600]	training's l2: 1444.7	valid_1's l2: 41363
[700]	training's l2: 1157.07	valid_1's l2: 41291.7
Early stopping, best iteration is:
[642]	training's l2: 1319.85	valid_1's l2: 41265.6
0.735888250646995
valid mean: 620.3296661912658
true  mean: 616.5537878787878
test  mean: 332.1100216458767


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.25it/s]


29 29
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 9978.18	valid_1's l2: 31445.4
[200]	training's l2: 5546.82	valid_1's l2: 30963.7
Early stopping, best iteration is:
[143]	training's l2: 7487.27	valid_1's l2: 30638.9
0.7815229792999283
valid mean: 643.6735025167497
true  mean: 673.0143939393939
test  mean: 503.42026906931335


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.15it/s]


29 29
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 10556.9	valid_1's l2: 335121
[200]	training's l2: 6102.63	valid_1's l2: 324613
[300]	training's l2: 4207.54	valid_1's l2: 325303
Early stopping, best iteration is:
[201]	training's l2: 6068.35	valid_1's l2: 324589
0.598443587223902
valid mean: 642.6930256279105
true  mean: 899.8204545454546
test  mean: 497.3168941729662


In [10]:
# forcast
for month in [25, 26, 27, 28]:
    m_type = 'lgb'

    data_df, stat_feat = get_stat_feature(data)

    num_feat = ['regYear'] + stat_feat
    cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']

    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))

    features = num_feat + cate_feat
    print(len(features), len(set(features)))

    sub, val_pred = get_train_model(data_df, month, m_type)
    data.loc[(data.regMonth == (month-24)) & (data.regYear == 2018),
             'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth == (month-24)) & (data.regYear == 2018),
             'label'] = sub['forecastVolum'].values
sub = data.loc[(data.regMonth >= 1) & (
    data.regYear == 2018), ['id', 'salesVolume']]
sub.columns = ['id', 'forecastVolum']
sub[['id', 'forecastVolum']].round().astype(
    int).to_csv('CCF_sales.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.28it/s]


       adcode  bodyType  forecastVolum    id  model province  regMonth  \
0      310000         0            NaN     0      0       上海         1   
1      530000         0            NaN     0      0       云南         1   
2      150000         0            NaN     0      0      内蒙古         1   
3      110000         0            NaN     0      0       北京         1   
4      510000         0            NaN     0      0       四川         1   
...       ...       ...            ...   ...    ...      ...       ...   
36955  350000         0            NaN  5364     59       福建         4   
36956  210000         0            NaN  5365     59       辽宁         4   
36957  500000         0            NaN  5366     59       重庆         4   
36958  610000         0            NaN  5367     59       陕西         4   
36959  230000         0            NaN  5368     59      黑龙江         4   

       regYear  salesVolume  popularity  ...  model_adcode_mt_popularity_8  \
0         2016        292.0      

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.39it/s]


       adcode  bodyType  forecastVolum    id  model province  regMonth  \
0      310000         0            NaN     0      0       上海         1   
1      530000         0            NaN     0      0       云南         1   
2      150000         0            NaN     0      0      内蒙古         1   
3      110000         0            NaN     0      0       北京         1   
4      510000         0            NaN     0      0       四川         1   
...       ...       ...            ...   ...    ...      ...       ...   
36955  350000         0            NaN  5364     59       福建         4   
36956  210000         0            NaN  5365     59       辽宁         4   
36957  500000         0            NaN  5366     59       重庆         4   
36958  610000         0            NaN  5367     59       陕西         4   
36959  230000         0            NaN  5368     59      黑龙江         4   

       regYear  salesVolume  popularity  ...  model_adcode_mt_popularity_8  \
0         2016        292.0      

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.38it/s]


       adcode  bodyType  forecastVolum    id  model province  regMonth  \
0      310000         0            NaN     0      0       上海         1   
1      530000         0            NaN     0      0       云南         1   
2      150000         0            NaN     0      0      内蒙古         1   
3      110000         0            NaN     0      0       北京         1   
4      510000         0            NaN     0      0       四川         1   
...       ...       ...            ...   ...    ...      ...       ...   
36955  350000         0            NaN  5364     59       福建         4   
36956  210000         0            NaN  5365     59       辽宁         4   
36957  500000         0            NaN  5366     59       重庆         4   
36958  610000         0            NaN  5367     59       陕西         4   
36959  230000         0            NaN  5368     59      黑龙江         4   

       regYear  salesVolume  popularity  ...  model_adcode_mt_popularity_8  \
0         2016        292.0      

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.38it/s]


       adcode  bodyType  forecastVolum    id  model province  regMonth  \
0      310000         0            NaN     0      0       上海         1   
1      530000         0            NaN     0      0       云南         1   
2      150000         0            NaN     0      0      内蒙古         1   
3      110000         0            NaN     0      0       北京         1   
4      510000         0            NaN     0      0       四川         1   
...       ...       ...            ...   ...    ...      ...       ...   
36955  350000         0            NaN  5364     59       福建         4   
36956  210000         0            NaN  5365     59       辽宁         4   
36957  500000         0            NaN  5366     59       重庆         4   
36958  610000         0            NaN  5367     59       陕西         4   
36959  230000         0            NaN  5368     59      黑龙江         4   

       regYear  salesVolume  popularity  ...  model_adcode_mt_popularity_8  \
0         2016        292.0      