In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
from multiprocessing import Pool
from itertools import product
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

# Import data


In [2]:
# Transaction history (one row per shop/item/day) and its lookup tables.
sales = pd.read_csv('../input/sales_train_v2.csv')
items = pd.read_csv('../input/items.csv')
item_category = pd.read_csv('../input/item_categories.csv')
shop = pd.read_csv('../input/shops.csv')

# Rows to predict and the expected submission layout.
test = pd.read_csv('../input/test.csv')
submission = pd.read_csv('../input/sample_submission.csv')

In [3]:
# Parse the 'dd.mm.YYYY' strings once, then derive both columns from the
# parsed datetimes. Taking the month via the vectorized `.dt` accessor avoids
# calling a Python lambda on every row of date objects.
parsed_dates = pd.to_datetime(sales['date'], format='%d.%m.%Y')
sales['month_num'] = parsed_dates.dt.month
# `date` keeps holding plain datetime.date objects, as before.
sales['date'] = parsed_dates.dt.date

# Aggregate data
Since the test data is generated from combinations of shops and items, we have to restructure the training data to match how the test data was generated.

In [4]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month, pair each shop that sold something that month with each
# item that was sold that month, then stack the monthly blocks into one frame.
monthly_blocks = []
for block_num in sales['date_block_num'].unique():
    in_month = sales['date_block_num'] == block_num
    month_shops = sales.loc[in_month, 'shop_id'].unique()
    month_items = sales.loc[in_month, 'item_id'].unique()
    combos = list(product(month_shops, month_items, [block_num]))
    monthly_blocks.append(np.array(combos, dtype='int32'))

grid = pd.DataFrame(np.vstack(monthly_blocks), columns=index_cols, dtype=np.int32)

Outliers are removed: we keep only rows whose item price is below 100000 and whose daily sales count is at most 900.

In [5]:
# Keep a row only if both its price and its daily count are within bounds.
price_ok = sales['item_price'] < 100000
count_ok = sales['item_cnt_day'] <= 900
sales = sales[price_ok & count_ok]

# Aggregate data

Since the competition task is to make a monthly prediction, we need to aggregate the data to the monthly level before doing any encoding. The following code cells serve that purpose.

In [6]:
# Lookup table date_block_num -> month_num, taking the first month_num seen
# in each date block.
sales_by_block = sales.groupby('date_block_num', as_index=False)
month_num = sales_by_block['month_num'].first()

In [7]:
# Monthly totals per (month, shop, item): summed daily counts and mean price.
# The 'mean' string alias is used instead of np.mean because passing NumPy
# callables to GroupBy.agg is deprecated in recent pandas releases.
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
sales_m = (
    sales.groupby(monthly_keys)
    .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
    .reset_index()
)

# Left-join onto the full shop/item grid; combinations with no sales get 0
# for both the count and the price.
sales_m = pd.merge(grid, sales_m, on=monthly_keys, how='left').fillna(0)

# Attach item metadata, category metadata and the calendar month.
sales_m = pd.merge(sales_m, items, on=['item_id'], how='left')
sales_m = pd.merge(sales_m, item_category, on=['item_category_id'], how='left')
sales_m = pd.merge(sales_m, month_num, on='date_block_num', how='left')

In [8]:
# The free-text name columns are not used as features; only the ids are kept.
text_columns = ['item_name', 'item_category_name']
sales_m = sales_m.drop(columns=text_columns)

Add custom features

In [9]:
# Feature tables read from CSV files one directory above the notebook.
shop_sales_dynamics = pd.read_csv('../shop_sales_dynamics.csv')
item_sales_dynamics = pd.read_csv('../item_sales_dynamics.csv')
item_vol = pd.read_csv('../item_vol.csv')

In [10]:
# Give the generic statistic columns descriptive, item-price-specific names.
vol_column_names = {
    'mean': 'item_id_item_price_mean',
    'std': 'item_id_item_price_std',
    'max': 'item_id_item_price_max',
    'min': 'item_id_item_price_min',
    'mean/std': 'item_id_item_price_mean_div_std',
    'max/min': 'item_id_item_price_max_div_min',
}
item_vol = item_vol.rename(columns=vol_column_names)

In [11]:
# Attach the item price statistics by (month, item).
sales_m = pd.merge(sales_m, item_vol, how='left', on=['date_block_num', 'item_id'])

In [12]:
# Attach the sales-dynamics tables: item-level first, then shop-level.
for dynamics, entity_key in [(item_sales_dynamics, 'item_id'),
                             (shop_sales_dynamics, 'shop_id')]:
    sales_m = sales_m.merge(dynamics, on=['date_block_num', entity_key], how='left')

# Mean encoding

We perform mean encoding on all category data which are item_id, shop_id and item_category_id

In [13]:
# For each categorical id, add per-(id, month) aggregates of price and count.
# New columns are named '<id>_<avg|sum>_<source column>'.
for type_id in ['item_id', 'shop_id', 'item_category_id']:
    for column_id, agg_func, func_name in [('item_price', 'mean', 'avg'),
                                           ('item_cnt_day', 'sum', 'sum'),
                                           ('item_cnt_day', 'mean', 'avg')]:
        encoded_name = type_id + '_' + func_name + '_' + column_id

        # Select the single source column BEFORE aggregating. Aggregating the
        # whole frame and keeping one column afterwards recomputes every
        # column (including encodings added by earlier iterations) each time.
        mean_df = (
            sales_m.groupby([type_id, 'date_block_num'])[column_id]
            .agg(agg_func)
            .reset_index()
            .rename(columns={column_id: encoded_name})
        )

        sales_m = pd.merge(sales_m, mean_df, on=['date_block_num', type_id], how='left')

We append test data into train data so we can create lag features on them.

In [14]:
# Shape the test rows like the training frame: date block 34, item category
# attached, text columns removed, and month_num set to 11.
temp_test = (
    test.drop('ID', axis=1)
    .assign(date_block_num=34)
    .merge(items, how='left', on='item_id')
    .merge(item_category, how='left', on='item_category_id')
    .drop(['item_name', 'item_category_name'], axis=1)
    .assign(month_num=11)
)

In [15]:
# Stack the test rows under the training rows. temp_test has fewer columns, so
# the missing ones become NaN for the test rows. sort=False keeps the existing
# column order and avoids the pandas FutureWarning about the changing default
# for non-aligned columns.
sales_m = pd.concat([sales_m, temp_test], axis=0, ignore_index=True, sort=False)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


# Custom features (Alex and Artem)

Seasonality

In [18]:
# Disabled experiment: the whole cell is a single string literal, so neither
# `holiday_months` nor `bad_month` is actually defined when it runs.
"""holiday_months = [12, 2, 3]
bad_month = [1, 11, 5, 6, 7] # January after New Year, November before New Year, May and summer"""

In [22]:
# Names of the seasonality indicator columns. The code that would create and
# collect them is wrapped in the string literal below and therefore never
# runs, so this list stays empty.
seas_features = []
"""sales_m['is_december'] = sales_m['month_num'] == 12
sales_m['is_february'] = sales_m['month_num'] == 2
sales_m['is_march'] = sales_m['month_num'] == 3
sales_m['is_january'] = sales_m['month_num'] == 1
sales_m['is_november'] = sales_m['month_num'] == 11
sales_m['is_low_season'] = (sales_m['month_num'] >= 5) & (sales_m['month_num'] <= 7)
sales_m['is_august'] = sales_m['month_num'] == 8
sales_m['is_september'] = sales_m['month_num'] == 9
for col in sales_m.columns:
    if col.startswith('is'):
        seas_features.append(col)
        sales_m.loc[:, col] = sales_m[col].astype(int)"""

"sales_m['is_december'] = sales_m['month_num'] == 12\nsales_m['is_february'] = sales_m['month_num'] == 2\nsales_m['is_march'] = sales_m['month_num'] == 3\nsales_m['is_january'] = sales_m['month_num'] == 1\nsales_m['is_november'] = sales_m['month_num'] == 11\nsales_m['is_low_season'] = (sales_m['month_num'] >= 5) & (sales_m['month_num'] <= 7)\nsales_m['is_august'] = sales_m['month_num'] == 8\nsales_m['is_september'] = sales_m['month_num'] == 9\nfor col in sales_m.columns:\n    if col.startswith('is'):\n        seas_features.append(col)\n        sales_m.loc[:, col] = sales_m[col].astype(int)"

We create lags on 10 features. 9 features are from mean encoding and 1 feature is the item_cnt_day.

After several tries, we found that monthly lag intervals of 1, 2, 3, 4, 5, 6, 9 and 12 give the best score on the leaderboard. Note that 16GB of RAM is not enough to compute this many lag intervals; at least 64GB of RAM is needed.

In [23]:
# Map every lag from 0 to 12 months to its own, initially empty, list of
# feature names to be shifted by that lag.
lags_and_features = {lag: [] for lag in range(13)}

In [24]:
# Disabled alternative configuration (9 variables, lags 1/2/4/9). It is a
# plain string literal, so it assigns nothing.
"""lag_variables_all  = ['item_id_avg_item_price',
'item_id_sum_item_cnt_day',
'item_id_avg_item_cnt_day',
'shop_id_avg_item_price',
'shop_id_sum_item_cnt_day',
'shop_id_avg_item_cnt_day',
'item_category_id_sum_item_cnt_day',
'item_category_id_avg_item_cnt_day',
'item_cnt_day']
lags_all = [1, 2, 4, 9]"""


# The nine mean-encoded columns (three ids x three aggregates, named exactly
# as the mean-encoding step names them) followed by the raw monthly count.
lag_variables_all = [
    type_id + '_' + suffix
    for type_id in ['item_id', 'shop_id', 'item_category_id']
    for suffix in ['avg_item_price', 'sum_item_cnt_day', 'avg_item_cnt_day']
] + ['item_cnt_day']

lags_all = [1, 2, 3, 4, 5, 6, 9, 12]

# Register all ten variables under every selected lag.
for lag in lags_all:
    lags_and_features[lag] += lag_variables_all

"lag_variables_all  = ['item_id_avg_item_price',\n'item_id_sum_item_cnt_day',\n'item_id_avg_item_cnt_day',\n'shop_id_avg_item_price',\n'shop_id_sum_item_cnt_day',\n'shop_id_avg_item_cnt_day',\n'item_category_id_sum_item_cnt_day',\n'item_category_id_avg_item_cnt_day',\n'item_cnt_day']\nlags_all = [1, 2, 4, 9]"

In [25]:
# Item price statistic columns (as renamed on item_vol), lagged by 1 and 2 months.
vol_features = [
    'item_id_item_price_' + stat
    for stat in ['mean', 'std', 'max', 'min', 'mean_div_std', 'max_div_min']
]

vol_features_lags = [1, 2]
for lag in vol_features_lags:
    lags_and_features[lag] += vol_features

In [26]:
# Column names for the item-level and shop-level sales dynamics: the
# '_sma_' columns for windows 2/5/12, then the '_diff_' columns for windows
# 1/2/5/12 -- item columns first, shop columns second.
sales_features = []
for entity in ['item_id', 'shop_id']:
    sales_features += ['{}_item_cnt_day_sma_{}'.format(entity, window) for window in [2, 5, 12]]
    sales_features += ['{}_item_cnt_day_diff_{}'.format(entity, window) for window in [1, 2, 5, 12]]

# These are only lagged by a single month.
sales_lags = [1]
for lag in sales_lags:
    lags_and_features[lag] += sales_features

In [27]:
# Seasonality indicators are registered under lag 0, i.e. without a shift.
seas_lag = [0]
for lag in seas_lag:
    lags_and_features[lag] += seas_features

In [28]:
# Disabled earlier configuration. Both blocks below are plain string literals,
# so none of the names inside them (including `lag_variables`) is defined by
# this cell.
"""lag_variables  = ['item_id_avg_item_price',
'item_id_sum_item_cnt_day',
'item_id_avg_item_cnt_day',
'shop_id_avg_item_price',
'shop_id_sum_item_cnt_day',
'shop_id_avg_item_cnt_day',
'item_category_id_avg_item_price',
'item_category_id_sum_item_cnt_day',
'item_category_id_avg_item_cnt_day',
'item_cnt_day']
"""


"""vol_features = ['item_id_item_price_mean', 'item_id_item_price_std', 'item_id_item_price_max',
                'item_id_item_price_min','item_id_item_price_mean_div_std', 'item_id_item_price_max_div_min']

vol_features_lags = [1, 2]

sales_features = []
for window in [2,5,12]:
    sales_features.append('item_id_item_cnt_day_sma_{}'.format(window))
for window in [1, 2, 5, 12]:
    sales_features.append('item_id_item_cnt_day_diff_{}'.format(window))    

for window in [2,5,12]:
    sales_features.append('shop_id_item_cnt_day_sma_{}'.format(window))
for window in [1, 2, 5, 12]:
    sales_features.append('shop_id_item_cnt_day_diff_{}'.format(window))   

#Limited by computation resource
#lags = [1]
#lags = [1, 2, 3, 4, 5, 6, 9, 12] # memory error is expected"""

"lag_variables  = ['item_id_avg_item_price',\n'item_id_sum_item_cnt_day',\n'item_id_avg_item_cnt_day',\n'shop_id_avg_item_price',\n'shop_id_sum_item_cnt_day',\n'shop_id_avg_item_cnt_day',\n'item_category_id_avg_item_price',\n'item_category_id_sum_item_cnt_day',\n'item_category_id_avg_item_cnt_day',\n'item_cnt_day']\n"

"vol_features = ['item_id_item_price_mean', 'item_id_item_price_std', 'item_id_item_price_max',\n                'item_id_item_price_min','item_id_item_price_mean_div_std', 'item_id_item_price_max_div_min']\n\nvol_features_lags = [1, 2]\n\nsales_features = []\nfor window in [2,5,12]:\n    sales_features.append('item_id_item_cnt_day_sma_{}'.format(window))\nfor window in [1, 2, 5, 12]:\n    sales_features.append('item_id_item_cnt_day_diff_{}'.format(window))    \n\nfor window in [2,5,12]:\n    sales_features.append('shop_id_item_cnt_day_sma_{}'.format(window))\nfor window in [1, 2, 5, 12]:\n    sales_features.append('shop_id_item_cnt_day_diff_{}'.format(window))   \n\n#Limited by computation resource\n#lags = [1]\n#lags = [1, 2, 3, 4, 5, 6, 9, 12] # memory error is expected"

In [None]:
# For every lag that has features registered, shift those features forward by
# `lag` months and join them back as '<feature>_lag_<lag>' columns.
lag_key_cols = ['date_block_num', 'shop_id', 'item_id']

# dict.items() works on both Python 2 and 3; iteritems() exists only on Python 2.
for lag, features in lags_and_features.items():
    if not features:
        continue

    # Copy only the key and feature columns instead of the whole (growing) frame.
    sales_new_df = sales_m[lag_key_cols + features].copy()

    # Adding `lag` to the month makes month m's values line up with month m + lag.
    sales_new_df['date_block_num'] += lag
    sales_new_df.columns = lag_key_cols + [feat + '_lag_' + str(lag) for feat in features]

    sales_m = sales_m.merge(sales_new_df, on=lag_key_cols, how='left')



In [None]:
# Display the rows of date block 34 (the appended test rows).
is_test_block = sales_m['date_block_num'] == 34
sales_m.loc[is_test_block]

Fill missing values

In [None]:
# Count-like columns get 0 for missing values; price-like columns get their
# own median. Columns matching neither pattern are left untouched.
for feat in sales_m.columns:
    column = sales_m[feat]
    if 'item_cnt' in feat:
        fill_value = 0
    elif 'item_price' in feat:
        fill_value = column.median()
    else:
        continue
    sales_m[feat] = column.fillna(fill_value)

Drop unnecessary columns and keep only the data after month 12, since the largest lag interval is 12 months.

In [29]:
#cols_to_drop = lag_variables[:-1] + ['item_price']
# Keep only date blocks strictly greater than 12.
after_month_12 = sales_m['date_block_num'] > 12
sales_m = sales_m[after_month_12]

[1, 2, 3, 4, 5, 6, 9, 'item_price']

# Validation

We use a holdout scheme for validation: sales months 13 to 32 are used for training (the filter above keeps only `date_block_num > 12`), month 33 for validation and month 34 for testing.

Alex and Artem way

In [None]:
# Same-month mean encodings and the raw item price are dropped; only their
# lagged copies stay as features. `cols_to_drop` was previously referenced
# without being defined anywhere, so it is built here from the lag variable
# list (every entry except the trailing target 'item_cnt_day').
cols_to_drop = lag_variables_all[:-1] + ['item_price']
X = sales_m.drop(columns=cols_to_drop)

# Clip the monthly count to [0, 40]. Series.clip replaces clip_lower/clip_upper
# (removed from pandas), and assigning the result back is reliable where
# inplace=True on a column selection is not.
X['item_cnt_day'] = X['item_cnt_day'].clip(lower=0, upper=40)

X.to_csv('../input/X.csv', index=False)