In [185]:
import time
import math
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [202]:
def _encode_labels(values):
    """Map every distinct value to its 0-based rank in sorted order."""
    return {value: code for code, value in enumerate(sorted(set(values)))}


def data_preprocessing(data):
    """Return a cleaned copy of a raw sales/search/evaluation frame.

    The input frame is left untouched; all work happens on a copy.

    Steps performed:
      * ``regYear``/``regMonth`` are replaced by ``year``, ``month`` and a
        running month counter ``time_id`` (``(year - 2016) * 12 + month``,
        so January 2016 is 1).
      * ``forecastVolum`` is dropped when present.
      * ``province`` (with ``adcode``), ``bodyType`` and ``model`` are replaced
        by integer codes ``pro_id``, ``body_id`` and ``model_id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``regYear``, ``regMonth`` and ``model``; ``province``,
        ``adcode``, ``bodyType`` and ``forecastVolum`` are optional.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns described above.

    Notes
    -----
    Codes are assigned from the sorted distinct values of *this* frame only.
    NOTE(review): codes are therefore only comparable across frames that
    contain exactly the same set of provinces / body types / models --
    confirm this holds for the files being joined.
    """
    data = data.copy()

    # Year and month are taken straight from the integer columns instead of
    # being round-tripped through a "YYYY.M" string and a date parser.
    data['year'] = data['regYear'].astype('int64')
    data['month'] = data['regMonth'].astype('int64')
    data['time_id'] = (data['year'] - 2016) * 12 + data['month']
    data = data.drop(['regYear', 'regMonth'], axis=1)

    if 'forecastVolum' in data.columns:
        data = data.drop('forecastVolum', axis=1)

    if 'province' in data.columns:
        data['pro_id'] = data['province'].map(_encode_labels(data['province']))
        # adcode is redundant with province; tolerate frames that lack it.
        data = data.drop(['province', 'adcode'], axis=1, errors='ignore')

    if 'bodyType' in data.columns:
        data['body_id'] = data['bodyType'].map(_encode_labels(data['bodyType']))
        data = data.drop('bodyType', axis=1)

    data['model_id'] = data['model'].map(_encode_labels(data['model']))
    data = data.drop('model', axis=1)

    return data


# Keys that identify one (month, province, model, body type) observation.
_LAG_KEYS = ['time_id', 'pro_id', 'model_id', 'body_id']
# Largest time_id a shifted row may land on; later rows are discarded.
_LAST_TIME_ID = 28


def _add_lags(data, column, suffix, offset, n_lags=16):
    """Left-join `column` from earlier months as ``last_{k}_months_{suffix}``.

    For lag ``k`` the value attached to a row with time_id ``t`` comes from the
    row with the same keys and time_id ``t - offset - k``.
    """
    for lag in range(1, n_lags + 1):
        shifted = data[_LAG_KEYS + [column]].copy()
        shifted['time_id'] = shifted['time_id'] + offset + lag
        shifted = shifted[shifted['time_id'] <= _LAST_TIME_ID]
        shifted = shifted.rename(
            columns={column: 'last_{}_months_{}'.format(lag, suffix)})
        data = pd.merge(data, shifted, how='left', on=_LAG_KEYS)
    return data


def _merge_group_stat(data, keys, column, agg, name):
    """Attach ``agg`` of `column` per `keys` group to every row as `name`."""
    stat = data.groupby(keys)[column].agg(agg).reset_index()
    stat = stat.rename(columns={column: name})
    return pd.merge(data, stat, how='left', on=keys)


def _add_group_means(data, features, keys, current, current_name, lag_pattern,
                     lag_source):
    """Add the per-group mean of `current` and of lags 1..6; record the names."""
    data = _merge_group_stat(data, keys, current, 'mean', current_name)
    features.append(current_name)
    for lag in range(1, 7):
        name = lag_pattern.format(lag)
        data = _merge_group_stat(data, keys, lag_source.format(lag), 'mean', name)
        features.append(name)
    return data


def feature_engineering(data, month):
    """Build the model feature matrix for one forecast month.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``time_id``, ``month``, ``pro_id``, ``model_id``,
        ``body_id``, ``label`` and ``popularity``. It is not modified.
    month : int
        time_id of the month being forecast. It only influences the extra lag
        offset: 0 for months 24-26, 1 for month 27, 2 for month 28.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A new frame holding the input columns plus every generated column, and
        the names of the generated columns that should be used as features.

    Notes
    -----
    Lags are generated for 1..16 months but only a subset is exposed as
    features (sales lags 1-6; popularity lags 1-5 and 11-13).
    """
    # Work on a private copy so the caller's frame does not gain columns.
    data = data.copy()
    features = []

    offset = int((month - 24) / 3) * 2
    offset += int((month - 24) / 4)
    if offset >= 1:
        offset -= 1

    # Chinese New Year flags, keyed on hard-coded time_ids.
    cny_time_ids = {
        'is_CNY': (2, 13, 26),
        'is_before_CNY': (1, 12, 25),
        'is_after_CNY': (3, 14, 27),
    }
    for name, time_ids in cny_time_ids.items():
        data[name] = data['time_id'].isin(time_ids).astype(int)
        features.append(name)

    # history sales
    data = _add_lags(data, 'label', 'sales', offset)
    sales = ['last_{}_months_sales'.format(k) for k in range(1, 17)]
    features += sales[:6]

    # history popularity
    data = _add_lags(data, 'popularity', 'popularity', offset)
    pops = ['last_{}_months_popularity'.format(k) for k in range(1, 17)]
    features += [pops[k - 1] for k in (1, 2, 3, 4, 5, 11, 12, 13)]

    # half year sales: row-wise statistics over lags 1..6
    half_year = data[sales[:6]]
    for stat in ('sum', 'mean', 'max', 'min', 'std', 'median'):
        name = 'half_year_sales_{}'.format(stat)
        data[name] = getattr(half_year, stat)(axis=1)
        features.append(name)

    # quarter sales: row-wise sum/mean over consecutive 3-lag windows.
    # axis=1 is essential -- a column-wise aggregate would not align with the
    # row index and would leave these columns entirely NaN.
    for position, prefix in enumerate(('1st', '2nd', '3rd', '4th')):
        quarter = data[sales[3 * position: 3 * position + 3]]
        data[prefix + '_quarter_sum'] = quarter.sum(axis=1)
        data[prefix + '_quarter_mean'] = quarter.mean(axis=1)
        features += [prefix + '_quarter_sum', prefix + '_quarter_mean']

    # trend: differences between pairs of sales lags
    for near, far in ((1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (2, 5),
                      (3, 4), (3, 5), (3, 6)):
        name = '{}_{}_diff'.format(near, far)
        data[name] = data[sales[near - 1]] - data[sales[far - 1]]
        features.append(name)

    # model/province sales trend
    for agg in ('mean', 'sum'):
        data = _merge_group_stat(data, 'model_id', '1_2_diff', agg,
                                 'model_1_2_diff_{}'.format(agg))
        data = _merge_group_stat(data, 'pro_id', '1_2_diff', agg,
                                 'pro_1_2_diff_{}'.format(agg))
        features.append('model_1_2_diff_{}'.format(agg))
        features.append('pro_1_2_diff_{}'.format(agg))

    # model and province sales trend
    for agg in ('mean', 'sum'):
        name = 'pro_model_1_2_diff_{}'.format(agg)
        data = _merge_group_stat(data, ['model_id', 'pro_id'], '1_2_diff', agg, name)
        features.append(name)

    # month on month growth
    for near, far in ((1, 2), (1, 3), (2, 3), (2, 4), (3, 4), (3, 5),
                      (4, 5), (4, 6)):
        name = '{}_{}_mom_growth'.format(near, far)
        data[name] = data[sales[near - 1]] / data[sales[far - 1]] - 1
        features.append(name)

    # year on year growth
    # NOTE(review): this divides by the current-row `label`, i.e. the training
    # target, so it looks like target leakage on training rows -- confirm the
    # intended definition before relying on it.
    data['yoy_growth_sales'] = data['last_12_months_sales'] / data['label'] - 1
    features.append('yoy_growth_sales')

    # Calendar-month group means of the label and of sales lags 1..6.
    # NOTE(review): the `*_mean` columns average `label` over groups that
    # include the row itself -- verify this is acceptable for training rows.
    month_groups = (
        (['month', 'model_id'], 'month_model'),
        (['month', 'body_id'], 'month_body'),
        (['month', 'pro_id'], 'month_pro'),
        (['month', 'model_id', 'body_id'], 'month_model_body'),
        (['month', 'pro_id', 'model_id', 'body_id'], 'month_pro_model_body'),
    )
    for keys, prefix in month_groups:
        data = _add_group_means(
            data, features, keys, 'label', prefix + '_mean',
            prefix + '_last_{}_months_sales', 'last_{}_months_sales')

    # popularity monthly growth
    for near, far in ((1, 2), (1, 3), (2, 3), (2, 4), (3, 4), (3, 5)):
        name = 'mom_{}_{}_growth_popularity'.format(near, far)
        data[name] = data[pops[near - 1]] / data[pops[far - 1]] - 1
        features.append(name)

    # model popularity
    data = _add_group_means(
        data, features, ['month', 'model_id'], 'popularity',
        'model_popularity_mean', 'model_last_{}_months_popularity',
        'last_{}_months_popularity')
    data['yoy_growth_popularity'] = data['last_12_months_popularity'] / data['popularity']
    features.append('yoy_growth_popularity')

    return data, features

In [232]:
def train_model(data, features, month, model, train_start=7):
    """Fit `model` on past months and predict the given forecast month.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``time_id``, ``label``, ``id`` and every name in
        `features`. It is not modified.
    features : list of str
        Columns passed to the model.
    month : int
        time_id to forecast. Rows with ``train_start <= time_id <= month - 1``
        are used for fitting; rows with ``time_id == month`` are predicted.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train_start : int, optional
        First time_id included in the training window (default 7).

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``forecastVolum`` for the forecast-month rows, in
        the same row order (and with the same index) as they appear in `data`.
    """
    time_id = data['time_id']
    train_mask = time_id.between(train_start, month - 1)
    test_mask = time_id == month

    model.fit(data.loc[train_mask, features], data.loc[train_mask, 'label'])

    sub = data.loc[test_mask, ['id']].copy()
    if sub.empty:
        # Nothing to forecast: skip predict(), which may reject empty input.
        sub['forecastVolum'] = pd.Series(dtype=float)
        return sub

    # Only the forecast-month rows are scored; other rows are never used.
    sub['forecastVolum'] = model.predict(data.loc[test_mask, features])
    return sub


def predict_by_month(data):
    """Forecast months 25-28 one at a time, feeding each forecast forward.

    ``label`` and ``salesVolume`` are modelled on a ``log2(x + 1)`` scale.
    For each forecast month the features are rebuilt, a LightGBM regressor is
    refit, and the month's predictions are written back into both columns so
    that later months can use them as lagged inputs. The final predictions are
    mapped back with ``2 ** x - 1`` and rounded to integers.

    Parameters
    ----------
    data : pandas.DataFrame
        Combined history and evaluation rows with at least ``time_id``,
        ``id``, ``label``, ``salesVolume`` and the columns required by
        ``feature_engineering``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (int) and ``forecastVolum`` (int) for rows with
        ``time_id >= 25``.
    """
    # Keep the caller's frame on its original scale.
    data = data.copy()

    model = lgb.LGBMRegressor(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
            max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
            n_estimators=600, subsample=0.9, colsample_bytree=0.7,
            )

    # NaN (the not-yet-known months) propagates through log2 unchanged.
    for column in ('label', 'salesVolume'):
        data[column] = np.log2(data[column] + 1)

    cate_features = ['pro_id', 'model_id', 'body_id', 'year', 'month']
    for month in [25, 26, 27, 28]:
        df, features = feature_engineering(data, month)
        for col in cate_features:
            df[col] = df[col].astype('category')
        sub = train_model(df, features + cate_features, month, model)

        # Positional write-back: relies on `sub` keeping the row order of the
        # forecast-month rows in `data`.
        is_month = data['time_id'] == month
        predicted = sub['forecastVolum'].values
        data.loc[is_month, 'salesVolume'] = predicted
        data.loc[is_month, 'label'] = predicted

    # Inverse of the log2(x + 1) transform applied above.
    data['salesVolume'] = 2 ** data['salesVolume'] - 1

    sub = data.loc[data['time_id'] >= 25, ['id', 'salesVolume']]
    sub = sub.rename(columns={'salesVolume': 'forecastVolum'})
    sub['id'] = sub['id'].map(int)
    sub['forecastVolum'] = sub['forecastVolum'].map(round)
    return sub

In [None]:
if __name__ == '__main__':
    # End-to-end run: load the three CSVs, build the combined frame, forecast
    # months 25-28 and write the submission file.

    train_data = pd.read_csv('train_sales_data.csv')
    test_data = pd.read_csv('evaluation_public.csv')
    search_data = pd.read_csv('train_search_data.csv')

    train_data = data_preprocessing(train_data)
    test_data = data_preprocessing(test_data)
    search_data = data_preprocessing(search_data)

    # fill in bodytype for test data: look up each *test* row's model in the
    # model -> body type table derived from the training data.
    # NOTE(review): assumes model_id codes agree between train and test
    # (i.e. both files contain the same set of models) -- confirm.
    model_to_body = train_data.drop_duplicates('model_id').set_index('model_id')['body_id']
    test_data['body_id'] = test_data['model_id'].map(model_to_body)

    data = pd.merge(train_data, search_data, how='left', on=['year', 'month', 'pro_id', 'model_id', 'time_id'])
    # ignore_index avoids duplicate row labels between history and test rows.
    data = pd.concat([data, test_data], ignore_index=True)
    data['label'] = data['salesVolume']

    start = time.time()
    print('start training...')
    sub = predict_by_month(data)
    sub.to_csv('sub.csv', index=False)
    print('finished in {:.1f}s'.format(time.time() - start))

start training...
