In [1]:
import sys
import numpy as np
import pandas as pd
import os 
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [41]:
path  = '../Data/Train/'
train_sales  = pd.read_csv(path+'train_sales_data.csv')
train_search = pd.read_csv(path+'train_search_data.csv')
train_user   = pd.read_csv(path+'train_user_reply_data.csv')
evaluation_public = pd.read_csv(path+'evaluation_public.csv')
submit_example    = pd.read_csv(path+'submit_example.csv')
data = pd.concat([train_sales, evaluation_public], ignore_index=True)
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
look1 = train_sales.drop_duplicates('model').set_index('model')
data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
for i in ['bodyType', 'model']:
    data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']
data['model_adcode'] = data['adcode'] + data['model']

In [43]:
def get_stat_feature(df_):   
    df = df_.copy()
    stat_feat = []
    df['model_adcode'] = df['adcode'] + df['model']
    df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']
    for col in tqdm(['label','popularity']):
        # shift
        for i in [1,2,3,4,5,6,7,8,9,10,11,12]:
            stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col,i))
            df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i
            df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
            df['shift_model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'].map(df_last[col])
    return df,stat_feat

def score(data, pred='pred_label', label='label', group='model'):
    data['pred_label'] = data['pred_label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred:  list,
        label: [list, 'mean']
    }).reset_index()
    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)

def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb'):   
    if m_type == 'lgb':
        model = lgb.LGBMRegressor(
                                num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
                                max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
                                n_estimators=2000, subsample=0.9, colsample_bytree=0.7,
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              categorical_feature=cate_feat, 
              early_stopping_rounds=100, verbose=100)      
    elif m_type == 'xgb':
        model = xgb.XGBRegressor(
                                max_depth=5 , learning_rate=0.05, n_estimators=2000, 
                                objective='reg:gamma', tree_method = 'hist',subsample=0.9, 
                                colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse' 
                                )
        model.fit(train_x, train_y, 
              eval_set=[(train_x, train_y),(valid_x, valid_y)], 
              early_stopping_rounds=100, verbose=100)   
    return model

def get_train_model(df_, m, m_type='lgb'):
    df = df_.copy()
    # 数据集划分
    st = 13
    all_idx   = (df['mt'].between(st , m-1))
    train_idx = (df['mt'].between(st , m-5))
    valid_idx = (df['mt'].between(m-4, m-4))
    test_idx  = (df['mt'].between(m  , m  ))
    print('all_idx  :',st ,m-1)
    print('train_idx:',st ,m-5)
    print('valid_idx:',m-4,m-4)
    print('test_idx :',m  ,m  )  
    # 最终确认
    train_x = df[train_idx][features]
    train_y = df[train_idx]['label']
    valid_x = df[valid_idx][features]
    valid_y = df[valid_idx]['label']   
    # get model
    model = get_model_type(train_x,train_y,valid_x,valid_y,m_type)  
    # offline
    df['pred_label'] = model.predict(df[features])
    best_score = score(df[valid_idx]) 
    # online
    if m_type == 'lgb':
        model.n_estimators = model.best_iteration_ + 100
        model.fit(df[all_idx][features], df[all_idx]['label'], categorical_feature=cate_feat)
    elif m_type == 'xgb':
        model.n_estimators = model.best_iteration + 100
        model.fit(df[all_idx][features], df[all_idx]['label'])
    df['forecastVolum'] = model.predict(df[features]) 
    print('valid mean:',df[valid_idx]['pred_label'].mean())
    print('true  mean:',df[valid_idx]['label'].mean())
    print('test  mean:',df[test_idx]['forecastVolum'].mean())
    # 阶段结果
    sub = df[test_idx][['id']]
    sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)  
    return sub,df[valid_idx]['pred_label']



In [4]:
def trend_factor(data):
    '''计算趋势因子'''
    for col in ['adcode', 'model', 'model_adcode']:
        temp_df = pd.DataFrame(columns=[col, 'factor_{}'.format(col)])
        year_1 = (data['mt'].between(1, 12))
        year_2 = (data['mt'].between(13, 24))
        i = 0
        for df in data[col].unique():
            temp1 = data[(data[col] == df) & (year_1)]
            temp2 = data[(data[col] == df) & (year_2)]
            sum1 = temp1['label'].sum()
            sum2 = temp2['label'].sum()
            factor = sum2 / sum1
            temp_df.loc[i] = {col:df, 'factor_{}'.format(col):factor}
            i = i+1
        data = data.merge(temp_df, how='left', on=[col])
    return data

In [37]:
def average_three(data):
    '''构造该月份对应上一年的该月份，该月份的前一个月和该月份的后一个月的平均值，作为新特征'''
    temp_df = pd.DataFrame(columns=['model_adcode', 'mt', 'aver'])
    j = 0
    for i in tqdm([13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]):
        for df in data['model_adcode'].unique():
            if (i == 13):
                temp1 = data[(data['model_adcode'] == df) & (data['mt'] == i-12)].reset_index().loc[0]['label']
                temp2 = data[(data['model_adcode'] == df) & (data['mt'] == i-11)].reset_index().loc[0]['label']
                average = (temp1+temp2)/2
                temp_df.loc[j] = {'model_adcode':df, 'mt':i, 'aver':average}
                j = j+1
            elif (i >= 13):
                temp1 = data[(data['model_adcode'] == df) & (data['mt'] == i-12)].reset_index().loc[0]['label']
                temp2 = data[(data['model_adcode'] == df) & (data['mt'] == i-11)].reset_index().loc[0]['label']
                temp3 = data[(data['model_adcode'] == df) & (data['mt'] == i-13)].reset_index().loc[0]['label']
                average = (temp1+temp2+temp3)/3
                temp_df.loc[j] = {'model_adcode':df, 'mt':i, 'aver':average}
                j = j+1
    data = data.merge(temp_df, how='left', on=['model_adcode', 'mt'])
    return data

In [36]:
data_df, stat_feat = get_stat_feature(data)
print(data_df['label'].max())
#data_df = trend_factor(data_df)
data_df = average_three(data_df)
print(data_df['aver'].max())

100%|██████████| 2/2 [00:00<00:00,  4.33it/s]
  0%|          | 0/16 [00:00<?, ?it/s]

15317.0


  0%|          | 0/16 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [45]:
data = average_three(data)
for month in [25,26,27,28]: 
    m_type = 'xgb' 
    
    data_df, stat_feat = get_stat_feature(data)
    
    num_feat = ['regYear'] + stat_feat
    cate_feat = ['adcode','bodyType','model','regMonth']
    
    if m_type == 'lgb':
        for i in cate_feat:
            data_df[i] = data_df[i].astype('category')
    elif m_type == 'xgb':
        lbl = LabelEncoder()  
        for i in tqdm(cate_feat):
            data_df[i] = lbl.fit_transform(data_df[i].astype(str))
           
    features = num_feat + cate_feat + ['aver']
    print(len(features), len(set(features)))   
    
    sub,val_pred = get_train_model(data_df, month, m_type)
    #将预测出来的结果再重新加入训练文件，以得到下一个月的结果
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
    data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label'      ] = sub['forecastVolum'].values
ratio = trend_factor(data_df)
print('ratio is: ' + str(ratio))
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
#sub['forecastVolum'] = sub['forecastVolum'].apply(lambda x: x * ratio) 
sub[['id','forecastVolum']].round().astype(int).to_csv('../Data/Final/model_2_1_3.csv', index=False)


  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:10<02:40, 10.73s/it][A
 12%|█▎        | 2/16 [00:25<02:48, 12.01s/it][A
 19%|█▉        | 3/16 [00:40<02:48, 12.93s/it][A
 25%|██▌       | 4/16 [00:55<02:42, 13.58s/it][A
 31%|███▏      | 5/16 [01:11<02:34, 14.05s/it][A
 38%|███▊      | 6/16 [01:26<02:24, 14.42s/it][A
 44%|████▍     | 7/16 [01:41<02:12, 14.68s/it][A
 50%|█████     | 8/16 [01:57<01:59, 14.93s/it][A
 56%|█████▋    | 9/16 [02:12<01:45, 15.06s/it][A
 62%|██████▎   | 10/16 [02:28<01:31, 15.23s/it][A
 69%|██████▉   | 11/16 [02:43<01:17, 15.41s/it][A
 75%|███████▌  | 12/16 [02:59<01:02, 15.56s/it][A
 81%|████████▏ | 13/16 [03:15<00:46, 15.62s/it][A
 88%|████████▊ | 14/16 [03:31<00:31, 15.65s/it][A
 94%|█████████▍| 15/16 [03:47<00:15, 15.68s/it][A
100%|██████████| 16/16 [04:03<00:00, 15.22s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  5.10it/s][A
100%|██████████| 2/2 [00:00<00:00,  4.48it/s][A

  0%|    

30 30
all_idx  : 13 24
train_idx: 13 20
valid_idx: 21 21
test_idx : 25 25
[0]	validation_0-rmse:841.37500	validation_1-rmse:1046.80139
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:800.24670	validation_1-rmse:1004.69434
[200]	validation_0-rmse:256.02417	validation_1-rmse:369.53519
[300]	validation_0-rmse:125.80570	validation_1-rmse:213.96837
[400]	validation_0-rmse:112.05924	validation_1-rmse:199.72981
[500]	validation_0-rmse:103.21457	validation_1-rmse:192.66568
[600]	validation_0-rmse:96.24398	validation_1-rmse:190.22621
[700]	validation_0-rmse:89.91720	validation_1-rmse:188.30293
[800]	validation_0-rmse:83.80281	validation_1-rmse:186.06667
[900]	validation_0-rmse:78.84313	validation_1-rmse:183.68703
[1000]	validation_0-rmse:74.04300	validation_1-rmse:182.87848
[1100]	validation_0-rmse:69.03014	validation_1-rmse:182.23479
[1200]	validation_0-rmse:6


  0%|          | 0/2 [00:00<?, ?it/s][A

valid mean: 631.40454
true  mean: 649.3121212121212
test  mean: 477.9709



 50%|█████     | 1/2 [00:00<00:00,  5.49it/s][A
100%|██████████| 2/2 [00:00<00:00,  4.65it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A
100%|██████████| 4/4 [00:00<00:00, 19.37it/s][A


30 30
all_idx  : 13 25
train_idx: 13 21
valid_idx: 22 22
test_idx : 26 26
[0]	validation_0-rmse:866.60773	validation_1-rmse:1007.37116
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:825.41510	validation_1-rmse:966.25519
[200]	validation_0-rmse:264.02972	validation_1-rmse:396.90942
[300]	validation_0-rmse:127.85088	validation_1-rmse:296.92798
[400]	validation_0-rmse:116.74348	validation_1-rmse:292.57953
[500]	validation_0-rmse:108.14201	validation_1-rmse:293.36694
Stopping. Best iteration:
[406]	validation_0-rmse:116.31264	validation_1-rmse:291.86627

0.6441218815783107



  0%|          | 0/2 [00:00<?, ?it/s][A

valid mean: 484.3228
true  mean: 616.5537878787878
test  mean: 314.5219



 50%|█████     | 1/2 [00:00<00:00,  3.79it/s][A
100%|██████████| 2/2 [00:00<00:00,  3.54it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A
100%|██████████| 4/4 [00:00<00:00, 18.81it/s][A


30 30
all_idx  : 13 26
train_idx: 13 22
valid_idx: 23 23
test_idx : 27 27
[0]	validation_0-rmse:881.69659	validation_1-rmse:1071.88635
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:840.50104	validation_1-rmse:1029.17554
[200]	validation_0-rmse:271.59647	validation_1-rmse:381.16562
[300]	validation_0-rmse:136.51582	validation_1-rmse:243.67372
[400]	validation_0-rmse:123.27693	validation_1-rmse:234.79156
[500]	validation_0-rmse:115.92550	validation_1-rmse:231.23265
[600]	validation_0-rmse:106.87669	validation_1-rmse:227.46771
[700]	validation_0-rmse:100.88274	validation_1-rmse:228.02463
Stopping. Best iteration:
[600]	validation_0-rmse:106.87669	validation_1-rmse:227.46771

0.7366486853548273



  0%|          | 0/2 [00:00<?, ?it/s][A

valid mean: 581.92896
true  mean: 673.0143939393939
test  mean: 422.73953



 50%|█████     | 1/2 [00:00<00:00,  4.69it/s][A
100%|██████████| 2/2 [00:00<00:00,  3.99it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A
 50%|█████     | 2/4 [00:00<00:00, 19.68it/s][A
100%|██████████| 4/4 [00:00<00:00, 18.97it/s][A


30 30
all_idx  : 13 27
train_idx: 13 23
valid_idx: 24 24
test_idx : 28 28
[0]	validation_0-rmse:900.64740	validation_1-rmse:1451.30005
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[100]	validation_0-rmse:859.31744	validation_1-rmse:1408.43799
[200]	validation_0-rmse:277.35492	validation_1-rmse:777.16187
[300]	validation_0-rmse:137.35257	validation_1-rmse:707.33368
Stopping. Best iteration:
[236]	validation_0-rmse:165.07248	validation_1-rmse:705.52148

0.4288627206275102
valid mean: 501.36725
true  mean: 899.8204545454546
test  mean: 422.78485
ratio is:        adcode  bodyType  forecastVolum    id  model province  regMonth  \
0           6         0            NaN     0      0       上海         0   
1          20         0            NaN     0      0       云南         0   
2           3         0            NaN     0      0      内蒙古         0   
3           0         0       