In [15]:
# Not many comments here; feel free to contact me at fb.com/noallynoclan
import gc
import itertools
import os
import sys

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Input CSVs are read from ./data and the intermediate pickle is written to ./dump,
# both resolved against the working directory the notebook is started from.
DATA_DIR, DUMP_DIR = (os.path.join(os.getcwd(), folder) for folder in ('data', 'dump'))

BLOCK = 'date_block_num'                     # column holding the block number
TARGET = 'item_cnt_month'                    # column holding the monthly count
INDEX_COLS = [BLOCK, 'shop_id', 'item_id']   # keys used for grouping and merging
VAL_COLS = ['item_price', TARGET]            # value columns that get lagged and encoded
DROP_COLS = ['ID', 'item_category_id']       # excluded from the model's feature list

def proc(df): # downsizes to 32b
    """Downcast columns to 32-bit types and move the key columns to the front.

    For every column that is not already int32 or object and contains no
    nulls, an int32 cast is attempted and kept only when it reproduces the
    original values exactly. Any column that is still float64 afterwards is
    cast to float32.

    The dtype conversions are applied to `df` itself. The returned frame is
    `df` reindexed so that 'ID', INDEX_COLS, 'item_price' and
    'item_cnt_month' (those that exist) come first, followed by the remaining
    columns in their current order.
    """
    pref_ord = ['ID', *INDEX_COLS, 'item_price', 'item_cnt_month']
    cols_ord = [*[c for c in pref_ord if c in df], *[c for c in df if c not in pref_ord]]
    for col in df:
        if df[col].dtype not in ('int32', 'object'):
            if not df[col].isnull().any():
                col_int32 = df[col].astype(np.int32)
                # Exact comparison on purpose: np.allclose's default relative
                # tolerance (1e-5) treats e.g. 100000.4 as equal to 100000 and
                # would silently truncate such columns to integers.
                if np.array_equal(df[col].values, col_int32.values):
                    df[col] = col_int32
        if df[col].dtype == 'float64': df[col] = df[col].astype(np.float32)
    gc.collect()
    return df.reindex(columns=cols_ord)

def describe(df): # prints shape, size, nulls, object & 64b cols
    """Print a one-line summary of `df` and return a small sample of its rows.

    Printed: the shape, sys.getsizeof(df) in whole megabytes, and a dict
    listing (column, null count) pairs under 'NULLS', object columns under
    'O' and int64/float64 columns under 'N64'. Empty categories are omitted.

    Returns every k-th row of `df` with k = len(df) // 3, which gives about
    three evenly spaced rows. k is floored at 1 so that frames with fewer
    than three rows (including empty ones) are returned whole instead of
    raising "slice step cannot be zero".
    """
    cols = {'NULLS': [], 'O': [], 'N64': []}
    for col in df:
        nulls = df[col].isnull().sum()
        if nulls: cols['NULLS'].append((col, nulls))
        dtype = df[col].dtype
        if dtype in ('int64', 'float64'): cols['N64'].append(col)
        elif dtype == 'object': cols['O'].append(col)
    print(df.shape, int(sys.getsizeof(df) / 1_000_000), end='MB, ')
    print({dtype: cols[dtype] for dtype in ['NULLS', 'O', 'N64'] if cols[dtype]})
    step = max(1, len(df) // 3)
    return df[::step]

In [None]:
# READ DATA
def read(file): # reads, filters columns
    """Load `<DATA_DIR>/<file>.csv`, drop unwanted columns and downcast it.

    The file name is echoed first. Any of the columns 'item_category_name',
    'item_name', 'date' and 'shop_name' that exist are removed. The result
    is passed through proc(), summarised with describe() and returned.
    """
    print(file, end=' ')
    unwanted = {'item_category_name', 'item_name', 'date', 'shop_name'}
    csv_path = os.path.join(DATA_DIR, file + '.csv')
    raw = pd.read_csv(csv_path)
    present = unwanted.intersection(raw.columns)
    frame = proc(raw.drop(present, axis=1))
    describe(frame)
    return frame

# Load every table into a dict keyed by file name
data = {}
for table in ['item_categories', 'items', 'sales_train', 'shops', 'test']:
    data[table] = read(table)

# Cap item prices at their 99.9th percentile
sales = data['sales_train']
price_lim = sales['item_price'].quantile(0.999)
sales['item_price'] = np.minimum(sales['item_price'], price_lim)

In [None]:
## PREPROCESSING
# MERGE DATA INTO GRID
# Aggregate per (block, shop, item): mean price and summed daily counts.
# String reducer names are used because recent pandas warns that np.mean / np.sum
# passed to agg() will stop being mapped to the built-in groupby reductions;
# 'mean' / 'sum' keep the current behavior.
data['train'] = (data['sales_train'].groupby(INDEX_COLS, as_index=False)
                 .agg({'item_price': 'mean', 'item_cnt_day': 'sum'})
                 .rename(columns={'item_cnt_day': 'item_cnt_month'}))
data['test'][BLOCK] = 34  # test rows are placed in block 34
df = proc(pd.concat([data['train'], data['test']]))

# For each block, pair every shop seen in that block with every item seen in it
grid = []
for block in df[BLOCK].unique():
    in_block = df[df[BLOCK] == block]  # filter once, reuse for shops and items
    shops = in_block['shop_id'].unique()
    items = in_block['item_id'].unique()
    grid.append(np.array(list(itertools.product(*[[block], shops, items])), dtype=np.int32))
grid = pd.DataFrame(np.vstack(grid), columns=INDEX_COLS, dtype=np.int32)
# Left merge keeps every grid cell; cells without a source row get NaN values
df = grid.merge(df, how='left', on=INDEX_COLS)

# ADDITION INFO
# Missing counts become 0 and counts are clipped to [0, 20]; missing IDs become -1.
# The item table is then joined in and calendar fields are derived from the block
# number (block 0 maps to month 1 of 2013).
df = (
    df.assign(
        item_cnt_month=df['item_cnt_month'].fillna(0).clip(0, 20),
        ID=df['ID'].fillna(-1).astype(np.int32),
    )
    .merge(data['items'], how='left', on='item_id')
    .assign(
        year=lambda d: 2013 + d[BLOCK] // 12,
        month=lambda d: d[BLOCK] % 12 + 1,
    )
)
df = proc(df).sort_values(INDEX_COLS)

# ADD LAGS
# For each lag, shift a copy of the key + value columns forward by `lag` blocks and
# merge it back, so every row receives the values observed `lag` blocks earlier.
for lag in [1, 2, 3, 6, 12]:
    print('lagging:', lag)
    lag_suffix = '_{}_lag'.format(lag)
    shifted = df[INDEX_COLS + VAL_COLS].assign(**{BLOCK: lambda d: d[BLOCK] + lag})
    df = df.merge(shifted, how='left', on=INDEX_COLS, suffixes=('', lag_suffix))

# ADD ENCODINGS
def encode(df, block, encs, vals):
    """Append group encodings that are built from earlier blocks only.

    Values are first summed per (block, *encs) cell, together with the number
    of non-null entries of each value column. Two encodings are supported:

    * 'mean': within each `encs` group, the running sum over blocks minus the
      current block's own sum, divided by the matching non-null count. A row
      therefore never sees its own block. The result is NaN where no earlier
      non-null value exists. New columns are named
      '<col>_mean_<enc1>_<enc2>...'.
    * 'max': within each `encs` group, the running maximum of the per-block
      sums, shifted forward by one block. Rows whose group has no entry in
      the immediately preceding block get NaN. New columns are named
      '<col>_max_<enc1>_<enc2>...'.

    Parameters
    ----------
    df : DataFrame containing `block`, every column of `encs` and every
        column listed in `vals`.
    block : name of the column that orders the data in time.
    encs : sequence of column names that define a group.
    vals : dict mapping 'mean' and/or 'max' to lists of value column names.

    Returns
    -------
    DataFrame with the encoding columns appended. `df` itself is not modified.
    """
    encs = list(encs)  # a tuple would be read by groupby() as one single key
    keys = [block, *encs]
    # De-duplicate the requested value columns while keeping a stable order
    cols = list(dict.fromkeys(col for group in vals.values() for col in group))
    flag = lambda names: ['_' + name for name in names]

    # Build the non-null indicator columns on a narrow scratch frame so the
    # caller's frame never receives the temporary '_<col>' columns.
    scratch = df[list(dict.fromkeys([*keys, *cols]))]
    scratch = scratch.assign(**{'_' + col: scratch[col].notnull() for col in cols})
    pivot = scratch.groupby(keys)[[*cols, *flag(cols)]].sum()

    out = df
    if 'mean' in vals:
        mean_cols = list(vals['mean'])
        mean_flags = flag(mean_cols)
        tracked = [*mean_cols, *mean_flags]
        # Running totals per group with the current block subtracted back out
        prior = pivot.groupby(encs)[tracked].cumsum() - pivot[tracked]
        enc = prior[mean_cols].div(prior[mean_flags].values)
        out = out.join(enc, how='left', on=keys, rsuffix='_'.join(['_mean', *encs]))
    if 'max' in vals:
        enc = pivot.groupby(encs)[list(vals['max'])].cummax().reset_index()
        enc[block] += 1  # block b's running max is matched to rows of block b + 1
        out = out.merge(enc, how='left', on=keys, suffixes=('', '_'.join(['_max', *encs])))
    return out

# Add mean and max encodings for each grouping of shop / item / category keys
for enc_cols in (['shop_id', 'item_category_id'], ['shop_id', 'item_id'], ['shop_id'], ['item_id']):
    print('encoding:', enc_cols)
    df = encode(df, BLOCK, enc_cols, {'mean': VAL_COLS, 'max': ['item_cnt_month']})

df = proc(df)
# to_pickle does not create missing directories, so make sure DUMP_DIR exists
os.makedirs(DUMP_DIR, exist_ok=True)
df.to_pickle(os.path.join(DUMP_DIR, 'data.pkl'))
describe(df)

In [7]:
# FIT THE MODEL
df = proc(pd.read_pickle(os.path.join(DUMP_DIR, 'data.pkl')))
# Every column except the keys, the raw value columns and DROP_COLS is a feature
features = df.columns.difference([*INDEX_COLS, *VAL_COLS, *DROP_COLS])

# Fit on blocks 12..33 (presumably so the 12-block lag is populated; confirm)
train_val_mask = df[BLOCK].between(12, 33)
train_val = df[train_val_mask]  # filter once instead of once per argument
# TARGET is the module constant; the previous lowercase `target` was never defined.
# A fixed random_state makes the stratified 95/5 split reproducible.
train, val = train_test_split(train_val, test_size=0.05, stratify=train_val[TARGET],
                              random_state=0)
Train = xgb.DMatrix(train[features], train[TARGET])
Val = xgb.DMatrix(val[features], val[TARGET])

# NOTE(review): newer XGBoost releases name this objective 'reg:squarederror' and
# replace 'silent' with 'verbosity'; confirm against the installed version.
params = {'booster': 'gbtree',
          'eta': .1,
          'min_child_weight': 100,
          'max_depth': 6,
          'objective': 'reg:linear',
          'eval_metric': 'rmse',
          'silent': True,
          'nthread': 4}
# Up to 1000 rounds, with early stopping after 10 rounds without improvement
model = xgb.train(params, Train, 1000, [(Train, 'Train'), (Val, 'val')],
                  verbose_eval=10, early_stopping_rounds=10)

In [9]:
# SUBMISSION
# Predict for the rows of block 34 and write ID / prediction pairs ordered by ID
test = df.loc[df[BLOCK] == 34].copy()
test_matrix = xgb.DMatrix(test[features])
test['item_cnt_month'] = model.predict(test_matrix)
submission = test[['ID', 'item_cnt_month']].sort_values('ID')
submission.to_csv('sub.csv', index=False)