In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



from itertools import product
from tqdm import tqdm
import calendar
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

# Load the competition tables from the read-only Kaggle input directory.
DATA_DIR = '../input/competitive-data-science-predict-future-sales/'
test = pd.read_csv(DATA_DIR + 'test.csv')
sales = pd.read_csv(DATA_DIR + 'sales_train.csv')
shops = pd.read_csv(DATA_DIR + 'shops.csv')
items = pd.read_csv(DATA_DIR + 'items.csv')
# item_cats = pd.read_csv(DATA_DIR + 'item_categories.csv')

# The first whitespace-separated token of the shop name, lower-cased, is used
# as the shop's city and then label-encoded into an integer id.
shops['city'] = shops['shop_name'].str.split().str[0].str.lower()
shops['city_id'] = LabelEncoder().fit_transform(shops['city'])

# Series.map with a Series argument looks values up by that Series' index.
# Keying the lookups by the real id columns avoids relying on each CSV being
# sorted so that the id happens to equal the row number.
item_to_category = items.set_index('item_id')['item_category_id']
shop_to_city = shops.set_index('shop_id')['city_id']

sales['item_category_id'] = sales['item_id'].map(item_to_category)
# Daily counts are limited to [0, 20]; negative counts become 0.
sales['item_cnt_day'] = sales['item_cnt_day'].clip(0, 20)
# sales = sales[sales.item_price < 30000]
sales['city_id'] = sales['shop_id'].map(shop_to_city)

# Key columns that identify one row of the monthly grid.
index_cols = ['date_block_num', 'shop_id', 'item_id']

# For every month, enumerate each (shop, item) pair among the shops and items
# that have at least one sales record in that month.
monthly_grids = []
for block in tqdm(sales.date_block_num.unique()):
    in_block = sales.date_block_num == block
    cur_shops = sales.loc[in_block, 'shop_id'].unique()
    cur_items = sales.loc[in_block, 'item_id'].unique()
    combos = product([block], cur_shops, cur_items)
    monthly_grids.append(np.array(list(combos)))

train = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)


# In[ ]:


# Every test row is assigned month index 34.
test['date_block_num'] = np.repeat(34, len(test))


# In[ ]:


# Sum the (already clipped) daily counts into one count per month/shop/item.
group = (
    sales.groupby(index_cols)['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
)

# Grid rows with no sales record get a count of 0; the monthly target is
# limited to [0, 20].
train = train.merge(group, how='left', on=index_cols)
train['item_cnt_month'] = train['item_cnt_month'].fillna(0).clip(0, 20)

# Number of grid rows before the test rows are appended.
length_train = len(train)


# In[ ]:


# Stack the test rows (without their ID column) under the training grid so the
# feature code below runs over both at once. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
# sort=False keeps the training frame's column order.
train = pd.concat(
    [train, test.drop(columns=['ID'])],
    ignore_index=True,
    sort=False,
)


print('data created')
print('feature creation starting')


# Targets of the original grid rows; the appended test rows have no target.
y = train.item_cnt_month[:length_train]

def num_days(block):
    """Return the number of days in the calendar month for a month index.

    Month index 0 is January 2013; each increment moves one month forward.
    """
    years_elapsed, month_offset = divmod(block, 12)
    year = 2013 + int(years_elapsed)
    month = 1 + month_offset
    # monthrange returns (weekday of the 1st, number of days in the month).
    return calendar.monthrange(year, month)[1]

def wday(block, d):
    """Count how often weekday ``d`` occurs in the month for a month index.

    ``d`` indexes into the week rows returned by ``calendar.monthcalendar``
    (0 = Monday ... 6 = Sunday with the module's default first weekday).
    Month index 0 is January 2013.
    """
    years_elapsed, month_offset = divmod(block, 12)
    weeks = calendar.monthcalendar(2013 + int(years_elapsed), 1 + month_offset)
    # Days that fall outside the month are reported as 0 by monthcalendar.
    return len([week for week in weeks if week[d] != 0])


# In[ ]:


def lag_feature(df, lags, col='item_cnt_month'):
    """Append lagged copies of ``col`` to ``df``, one new column per lag.

    For each ``i`` in ``lags`` the value ``col`` had ``i`` months earlier for
    the same shop and item is left-joined in as ``<col>_lag_<i>`` and stored
    as float16. Rows with no matching earlier month get NaN.

    Returns the widened DataFrame; the frame passed in is left unchanged.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in tqdm(lags):
        lag_col = '{}_lag_{}'.format(col, lag)
        shifted = base.copy()
        shifted.columns = keys + [lag_col]
        # Moving the source rows forward by `lag` months lines each value up
        # with the later row that should receive it.
        shifted['date_block_num'] = shifted['date_block_num'] + lag
        df = df.merge(shifted, how='left', on=keys)
        df[lag_col] = df[lag_col].astype('float16')
    return df


# In[ ]:


def add(train, index_cols, name, col='item_cnt_month', func='mean'):
    """Attach a group-level aggregate of ``col`` to every row of ``train``.

    ``func`` is evaluated per ``index_cols`` group, using only rows whose
    ``date_block_num`` occurs in the module-level ``sales`` table. The result
    is left-joined back as column ``name``; rows whose group has no aggregate
    get 0, and the column is stored as float16.

    Returns the merged DataFrame; the frame passed in is left unchanged.
    """
    known_blocks = sales.date_block_num.unique()
    observed = train[train.date_block_num.isin(known_blocks)]
    stats = observed.groupby(index_cols)[col].agg(func).rename(name).reset_index()
    merged = train.merge(stats, how='left', on=index_cols)
    merged[name] = merged[name].fillna(0).astype('float16')
    return merged


# In[ ]:


# Calendar year and month (1-12) of each month index; index 0 is January 2013.
train['year'] = (2013 + train['date_block_num'] // 12).astype('int32')
train['month'] = (1 + train['date_block_num'] % 12).astype('int32')

sales['month'] = (1 + sales['date_block_num'] % 12).astype('int32')


# In[ ]:


# Per-month calendar facts, precomputed for month indices 0-39 and then looked
# up by date_block_num.
calendar_blocks = range(40)
days_in_block = pd.Series([num_days(b) for b in calendar_blocks])
saturdays_in_block = pd.Series([wday(b, 5) for b in calendar_blocks])

train['num_days'] = train['date_block_num'].map(days_in_block).astype('int32')

train['num_sat'] = train['date_block_num'].map(saturdays_in_block).astype('int32')
# for d, wk in enumerate(['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun']):
#     train['num_'+ wk] = train.date_block_num.map(pd.Series([wday(i, d) for i in range(40)]))


# In[ ]:


# Mean sale price per item, per calendar month and per shop over all of `sales`.
prices = sales.groupby('item_id')['item_price'].mean()
prices_month = sales.groupby('month')['item_price'].mean()
prices_shop = sales.groupby('shop_id')['item_price'].mean()

# Series.map with a Series argument looks values up by that Series' index.
# Keying the lookups by the real id columns avoids relying on items/shops being
# sorted so that the id happens to equal the row number.
item_to_category = items.set_index('item_id')['item_category_id']
shop_to_city = shops.set_index('shop_id')['city_id']

train['item_category_id'] = train['item_id'].map(item_to_category).astype('int32')
# NOTE(review): float16 cannot represent values above 65504, so any mean price
# larger than that is stored as inf -- confirm this is acceptable.
train['price'] = train['item_id'].map(prices).astype('float16')
train['price_month'] = train['month'].map(prices_month).astype('float16')
train['price_shop'] = train['shop_id'].map(prices_shop).astype('float16')
train['city_id'] = train['shop_id'].map(shop_to_city)


print('target encoding started')

# In[ ]:


# [train.date_block_num.isin(sales.date_block_num.unique())]
block_sales = sales.date_block_num.unique()

# Mean monthly count per calendar month, per shop and per item, computed over
# the first `length_train` rows only (the rows that precede the appended test
# rows).
labelled = train.iloc[:length_train]
count_month = labelled.groupby('month')['item_cnt_month'].mean()
count_shop = labelled.groupby('shop_id')['item_cnt_month'].mean()
count_item = labelled.groupby('item_id')['item_cnt_month'].mean()

train['cnt_month'] = train['month'].map(count_month).astype('float16')
train['cnt_shop'] = train['shop_id'].map(count_shop).astype('float16')
train['cnt_item'] = train['item_id'].map(count_item).astype('float16')


# In[ ]:


# Mean monthly count for each grouping below. The features are added in list
# order, which also fixes the order of the resulting columns.
group_features = [
    (['date_block_num', 'shop_id'], 'avg_cnt_month-shop'),
    (['item_id', 'shop_id'], 'avg_cnt_item-shop'),
    (['item_id', 'city_id'], 'avg_cnt_item-city'),
    (['date_block_num', 'item_id'], 'avg_cnt_month-item'),
]
for group_cols, feature in group_features:
    train = add(train, group_cols, feature)
# train = add(train, ['month', 'item_id'], 'avg_cnt_month')


# In[ ]:


## lag

print('lag feature creation starting')

# Columns to lag. The first entry is the target itself; the remaining entries
# are the encoding and price columns created earlier in the script.
lag_features = [
    'item_cnt_month',
    'cnt_shop',
    'cnt_month',
    'cnt_item',
    'avg_cnt_month-item',
    'avg_cnt_month-shop',
    'avg_cnt_item-shop',
    'avg_cnt_item-city',
    'price',
    'price_month',
    'price_shop',
#     'avg_cnt_month'
]

# Each column is lagged by 1, 2, 3 and 12 months.
lag_steps = [1, 2, 3, 12]
for feature in lag_features:
    train = lag_feature(train, lag_steps, feature)
    print(feature)


# Drop the un-lagged versions of every lagged column except the target, then
# discard months 0-2.
df = train.drop(columns=lag_features[1:])
df = df[df['date_block_num'] > 2]
# df.fillna(0, inplace= True)

# Months 3-32 are used for fitting, month 33 for validation, and month 34 is
# the block that was assigned to the test rows.
target = 'item_cnt_month'
train_rows = df[df['date_block_num'] < 33]
val_rows = df[df['date_block_num'] == 33]

X_train, y = train_rows.drop(columns=target), train_rows[target]
x_val, y_val = val_rows.drop(columns=target), val_rows[target]
test = df[df['date_block_num'] == 34].drop(columns=target)

# The intermediate row selections are no longer needed.
del train_rows, val_rows

# Output directory prefix for files written by this script.
loc = './'

print('model creation starting')


# Columns handed to LightGBM as categorical features.
category_columns = ['shop_id', 'item_id', 'year', 'month', 'item_category_id', 'city_id']

# model = CatBoostRegressor(2000, early_stopping_rounds=300, 
#                           learning_rate= 0.1, 
#                           random_seed=1, 
#                           task_type= 'GPU', 
#                           cat_features=category_columns)

# print ('model fitting starting')

# model.fit(X_train, y, cat_features= category_columns, eval_set=(x_val, y_val), verbose= 40, 
#           save_snapshot= True, snapshot_file= loc+'cat_3_snapshot4.snap', snapshot_interval= 120)
# model.save_model(loc + 'cat_3.cbm')


feature_name = list(X_train.columns)

params = dict(
    objective='mse',
    metric='rmse',
    num_leaves=127,  # 2 ** 7 - 1
    learning_rate=0.005,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=5,
    seed=1,
    verbose=1,
)

lgb_train = lgb.Dataset(X_train[feature_name], label=y)
lgb_eval = lgb.Dataset(x_val[feature_name], label=y_val, reference=lgb_train)

# Up to 3000 boosting rounds with a 100-round early-stopping window; metrics
# for both datasets are logged every 5 rounds and collected in `evals_result`.
evals_result = {}
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=(lgb_train, lgb_eval),
    feature_name=feature_name,
    categorical_feature=category_columns,
    verbose_eval=5,
    evals_result=evals_result,
    early_stopping_rounds=100,
)



# Predict the test month and write the submission file.
# NOTE(review): IDs are generated as 0..n-1, which assumes the rows of `test`
# are still in the same order as test.csv -- confirm.
sub = pd.DataFrame({
    'ID': np.arange(0, test.shape[0]),
    'item_cnt_month': model.predict(test)
})
# Predictions are limited to the same [0, 20] range as the training target.
sub['item_cnt_month'] = sub['item_cnt_month'].clip(0, 20)
sub[['ID', 'item_cnt_month']].to_csv(loc + 'sub_cat_5.csv', index=False)

# Was `praint`, which raised NameError after the file had been written.
print('Done')

/kaggle/input/competitive-data-science-predict-future-sales/items.csv
/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv
/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv
/kaggle/input/competitive-data-science-predict-future-sales/test.csv
/kaggle/input/competitive-data-science-predict-future-sales/shops.csv
/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv


100%|██████████| 34/34 [00:12<00:00,  2.62it/s]


data created
feature creation starting
target encoding started
lag feature creation starting


100%|██████████| 4/4 [00:29<00:00,  7.33s/it]
100%|██████████| 4/4 [00:29<00:00,  7.28s/it]
100%|██████████| 4/4 [00:30<00:00,  7.53s/it]
100%|██████████| 4/4 [00:30<00:00,  7.72s/it]
100%|██████████| 4/4 [00:31<00:00,  7.95s/it]
100%|██████████| 4/4 [00:32<00:00,  8.17s/it]
100%|██████████| 4/4 [00:33<00:00,  8.44s/it]
100%|██████████| 4/4 [00:34<00:00,  8.62s/it]
100%|██████████| 4/4 [00:35<00:00,  8.79s/it]
100%|██████████| 4/4 [00:35<00:00,  9.00s/it]
100%|██████████| 4/4 [00:36<00:00,  9.14s/it]


model creation starting


New categorical_feature is ['city_id', 'item_category_id', 'item_id', 'month', 'shop_id', 'year']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds
[5]	training's rmse: 1.19743	valid_1's rmse: 1.12627
[10]	training's rmse: 1.18161	valid_1's rmse: 1.11495
[15]	training's rmse: 1.166	valid_1's rmse: 1.10473
[20]	training's rmse: 1.15122	valid_1's rmse: 1.09429
[25]	training's rmse: 1.13703	valid_1's rmse: 1.08433
[30]	training's rmse: 1.12359	valid_1's rmse: 1.07589
[35]	training's rmse: 1.11032	valid_1's rmse: 1.0677
[40]	training's rmse: 1.09718	valid_1's rmse: 1.06002
[45]	training's rmse: 1.08413	valid_1's rmse: 1.05249
[50]	training's rmse: 1.07242	valid_1's rmse: 1.04481
[55]	training's rmse: 1.06067	valid_1's rmse: 1.0378
[60]	training's rmse: 1.04875	valid_1's rmse: 1.03131
[65]	training's rmse: 1.0381	valid_1's rmse: 1.02474
[70]	training's rmse: 1.02756	valid_1's rmse: 1.01884
[75]	training's rmse: 1.01704	valid_1's rmse: 1.013
[80]	training's rmse: 1.00724	valid_1's rmse: 1.00715
[85]	training's rmse: 0.997525	valid_1's rmse: 1.00246
[90]	training's rmse: 0.987