In [39]:
import pandas as pd
import numpy as np

from itertools import product
from sklearn.preprocessing import LabelEncoder
import calendar

import time
from tqdm import tqdm
import sys
import gc

In [2]:
# Load data

items = pd.read_csv(r'C:\Users\anshch\Documents\Demand_Forecast\Data\items.csv')
shops = pd.read_csv(r'C:\Users\anshch\Documents\Demand_Forecast\Data\shops.csv')
cats = pd.read_csv(r'C:\Users\anshch\Documents\Demand_Forecast\Data\item_categories.csv')
train = pd.read_csv(r'C:\Users\anshch\Documents\Demand_Forecast\Data\sales_train.csv')

# set index to ID to avoid droping it later
test  = pd.read_csv(r'C:\Users\anshch\Documents\Demand_Forecast\Data\test.csv').set_index('ID')

In [3]:
# Replace minus price by median

median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
train.loc[train.item_price<0, 'item_price'] = median

In [4]:
# Several shops are duplicates of each other (according to its name). Fix train and test set.

train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

# Shops/Cats/Items preprocessing
Observations:

- Each shop_name starts with the city name.
- Each category contains type and subtype in its name.

In [5]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops['shop_category'] = 'NA'

for index, row in tqdm(shops.iterrows()):
    shops['shop_category'].iloc[index]

    if 'ТЦ' in shops['shop_name'].iloc[index]:
        shops['shop_category'].iloc[index] = '1'
    elif  'ТРЦ' in shops['shop_name'].iloc[index] or 'ТРК' in shops['shop_name'].iloc[index]:
        shops['shop_category'].iloc[index] = '2'
    else:
        shops['shop_category'].iloc[index] = '3'

shops.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
60it [00:00, 1071.47it/s]


Unnamed: 0,shop_name,shop_id,city,city_code,shop_category
0,"!Якутск Орджоникидзе, 56 фран",0,Якутск,29,3
1,"!Якутск ТЦ ""Центральный"" фран",1,Якутск,29,1
2,"Адыгея ТЦ ""Мега""",2,Адыгея,0,1
3,"Балашиха ТРК ""Октябрь-Киномир""",3,Балашиха,1,2
4,"Волжский ТЦ ""Волга Молл""",4,Волжский,2,1


In [6]:
shops = shops[['shop_id','city_code','shop_category']]
shops.head()

Unnamed: 0,shop_id,city_code,shop_category
0,0,29,3
1,1,29,1
2,2,0,1
3,3,1,2
4,4,2,1


In [7]:

cats['split'] = cats['item_category_name'].str.split('-')
cats['type'] = cats['split'].map(lambda x: x[0].strip())
cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
# if subtype is nan then type
cats['subtype'] = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id','type_code', 'subtype_code']]
cats.head()

Unnamed: 0,item_category_id,type_code,subtype_code
0,0,0,29
1,1,1,9
2,2,1,10
3,3,1,11
4,4,1,13


In [8]:
items.drop(['item_name'], axis=1, inplace=True)
items.head()

Unnamed: 0,item_id,item_category_id
0,0,40
1,1,76
2,2,40
3,3,40
4,4,40


# Monthly sales
Test set is a product of some shops and some items within 34 month. There are 5100 items * 42 shops = 214200 pairs. 363 items are new compared to the train. Hence, for the most of the items in the test set target value should be zero. In the other hand train set contains only pairs which were sold or returned in the past. Tha main idea is to calculate monthly sales and extend it with zero sales for each unique pair within the month. This way train data will be similar to test data.


In [9]:
# Get info
len(list(set(test.item_id) - set(test.item_id).intersection(set(train.item_id)))), len(list(set(test.item_id))), len(test)

(363, 5100, 214200)

In [10]:
# Create empty dataframe with all unique pairs
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
    
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)
matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id
114910,0,2,19
117150,0,2,27
120623,0,2,28
118316,0,2,29
114602,0,2,32


In [11]:
# Merge empty dataframe with history Sales

train['revenue'] = train['item_price'] *  train['item_cnt_day']
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20) # NB clip target here
                                .astype(np.float16))
matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,2,19,0.0
1,0,2,27,1.0
2,0,2,28,0.0
3,0,2,29,0.0
4,0,2,32,0.0


In [12]:
# Add test dataframe
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

matrix = pd.concat([matrix, test], ignore_index=True, sort=False, keys=cols)
matrix.fillna(0, inplace=True) # 34 month

matrix.tail()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
11128045,34,45,18454,0.0
11128046,34,45,16188,0.0
11128047,34,45,15757,0.0
11128048,34,45,19648,0.0
11128049,34,45,969,0.0


# Join and encode category features
Shops/Items/category

In [13]:
matrix = pd.merge(matrix, shops, on=['shop_id'], how='left')
matrix = pd.merge(matrix, items, on=['item_id'], how='left')
matrix = pd.merge(matrix, cats, on=['item_category_id'], how='left')
matrix['city_code'] = matrix['city_code'].astype(np.int8)
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['type_code'] = matrix['type_code'].astype(np.int8)
matrix['subtype_code'] = matrix['subtype_code'].astype(np.int8)
matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code
0,0,2,19,0.0,0,1,40,11,4
1,0,2,27,1.0,0,1,19,5,10
2,0,2,28,0.0,0,1,30,8,55
3,0,2,29,0.0,0,1,23,5,16
4,0,2,32,0.0,0,1,40,11,4


In [14]:
# Function for category features encoding

def lag_feature(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

In [15]:
# Create fuatures for Item
matrix = lag_feature(matrix, [1,2,3,4,5,6,12], 'item_cnt_month')

In [16]:
group = matrix.groupby(['date_block_num']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num'], how='left')
matrix['date_avg_item_cnt'] = matrix['date_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_avg_item_cnt')
matrix.drop(['date_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_12,date_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,,,,,,,


In [17]:
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_cnt'] = matrix['date_item_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop(['date_item_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_12,date_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_2,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [18]:
group = matrix.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_shop_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_avg_item_cnt'] = matrix['date_shop_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix.drop(['date_shop_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_2,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [19]:
group = matrix.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_cat_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_category_id'], how='left')
matrix['date_cat_avg_item_cnt'] = matrix['date_cat_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_cat_avg_item_cnt')
matrix.drop(['date_cat_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_item_avg_item_cnt_lag_2,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [20]:
group = matrix.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_cat_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')
matrix['date_shop_cat_avg_item_cnt'] = matrix['date_shop_cat_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_shop_cat_avg_item_cnt')
matrix.drop(['date_shop_cat_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [21]:
group = matrix.groupby(['date_block_num', 'shop_id', 'type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_type_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'type_code'], how='left')
matrix['date_shop_type_avg_item_cnt'] = matrix['date_shop_type_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_shop_type_avg_item_cnt')
matrix.drop(['date_shop_type_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [22]:
group = matrix.groupby(['date_block_num', 'shop_id', 'subtype_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_subtype_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'subtype_code'], how='left')
matrix['date_shop_subtype_avg_item_cnt'] = matrix['date_shop_subtype_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_shop_subtype_avg_item_cnt')
matrix.drop(['date_shop_subtype_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [23]:
group = matrix.groupby(['date_block_num', 'city_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_city_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'city_code'], how='left')
matrix['date_city_avg_item_cnt'] = matrix['date_city_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_city_avg_item_cnt')
matrix.drop(['date_city_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [24]:
group = matrix.groupby(['date_block_num', 'item_id', 'city_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_city_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'item_id', 'city_code'], how='left')
matrix['date_item_city_avg_item_cnt'] = matrix['date_item_city_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_item_city_avg_item_cnt')
matrix.drop(['date_item_city_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [25]:
group = matrix.groupby(['date_block_num', 'type_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_type_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'type_code'], how='left')
matrix['date_type_avg_item_cnt'] = matrix['date_type_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_type_avg_item_cnt')
matrix.drop(['date_type_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [26]:
group = matrix.groupby(['date_block_num', 'subtype_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_subtype_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'subtype_code'], how='left')
matrix['date_subtype_avg_item_cnt'] = matrix['date_subtype_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_subtype_avg_item_cnt')
matrix.drop(['date_subtype_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


In [27]:
group = matrix.groupby(['date_block_num', 'item_id', 'shop_category']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_shop_category_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'item_id', 'shop_category'], how='left')
matrix['date_item_shop_category_avg_item_cnt'] = matrix['date_item_shop_category_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_item_shop_category_avg_item_cnt')
matrix.drop(['date_item_shop_category_avg_item_cnt'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,date_item_shop_category_avg_item_cnt_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,


# Shops coordinates

In [28]:
shops = pd.read_csv(r'C:\Users\anshch\Documents\Demand_Forecast\Data\shops.csv')

shops['city'] = shops['shop_name'].apply(lambda x: x.split()[0].lower())
shops.loc[shops.city == '!якутск', 'city'] = 'якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])

coords = dict()
coords['якутск'] = (62.028098, 129.732555, 4)
coords['адыгея'] = (44.609764, 40.100516, 3)
coords['балашиха'] = (55.8094500, 37.9580600, 1)
coords['волжский'] = (53.4305800, 50.1190000, 3)
coords['вологда'] = (59.2239000, 39.8839800, 2)
coords['воронеж'] = (51.6720400, 39.1843000, 3)
coords['выездная'] = (0, 0, 0)
coords['жуковский'] = (55.5952800, 38.1202800, 1)
coords['интернет-магазин'] = (0, 0, 0)
coords['казань'] = (55.7887400, 49.1221400, 4)
coords['калуга'] = (54.5293000, 36.2754200, 4)
coords['коломна'] = (55.0794400, 38.7783300, 4)
coords['красноярск'] = (56.0183900, 92.8671700, 4)
coords['курск'] = (51.7373300, 36.1873500, 3)
coords['москва'] = (55.7522200, 37.6155600, 1)
coords['мытищи'] = (55.9116300, 37.7307600, 1)
coords['н.новгород'] = (56.3286700, 44.0020500, 4)
coords['новосибирск'] = (55.0415000, 82.9346000, 4)
coords['омск'] = (54.9924400, 73.3685900, 4)
coords['ростовнадону'] = (47.2313500, 39.7232800, 3)
coords['спб'] = (59.9386300, 30.3141300, 2)
coords['самара'] = (53.2000700, 50.1500000, 4)
coords['сергиев'] = (56.3000000, 38.1333300, 4)
coords['сургут'] = (61.2500000, 73.4166700, 4)
coords['томск'] = (56.4977100, 84.9743700, 4)
coords['тюмень'] = (57.1522200, 65.5272200, 4)
coords['уфа'] = (54.7430600, 55.9677900, 4)
coords['химки'] = (55.8970400, 37.4296900, 1)
coords['цифровой'] = (0, 0, 0)
coords['чехов'] = (55.1477000, 37.4772800, 4)
coords['ярославль'] = (57.6298700, 39.8736800, 2) 

shops['city_coord_1'] = shops['city'].apply(lambda x: coords[x][0])
shops['city_coord_2'] = shops['city'].apply(lambda x: coords[x][1])
shops['country_part'] = shops['city'].apply(lambda x: coords[x][2])

shops = shops[['shop_id', 'city_coord_1', 'city_coord_2', 'country_part']]
shops.head()

Unnamed: 0,shop_id,city_coord_1,city_coord_2,country_part
0,0,62.028098,129.732555,4
1,1,62.028098,129.732555,4
2,2,44.609764,40.100516,3
3,3,55.80945,37.95806,1
4,4,53.43058,50.119,3


In [29]:
matrix = pd.merge(matrix, shops, on=['shop_id'], how='left')
matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,date_item_shop_category_avg_item_cnt_lag_1,city_coord_1,city_coord_2,country_part
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,44.609764,40.100516,3
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,44.609764,40.100516,3
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,44.609764,40.100516,3
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,44.609764,40.100516,3
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,44.609764,40.100516,3


# Trend features
Price trend for the last N months.

In [30]:
group = train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['item_id'], how='left')
matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)

group = train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

lags = [1,2,3,4,5,6]
matrix = lag_feature(matrix, lags, 'date_item_avg_item_price')

for i in lags:
    matrix['delta_price_lag_'+str(i)] = \
        (matrix['date_item_avg_item_price_lag_'+str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if row['delta_price_lag_'+str(i)]:
            return row['delta_price_lag_'+str(i)]
    return 0
    
matrix['delta_price_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)
matrix['delta_price_lag'].fillna(0, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_item_avg_item_price_lag_4,date_item_avg_item_price_lag_5,date_item_avg_item_price_lag_6,delta_price_lag_1,delta_price_lag_2,delta_price_lag_3,delta_price_lag_4,delta_price_lag_5,delta_price_lag_6,delta_price_lag
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,,,,0.0
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,,,,0.0
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,,,,0.0
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,,,,0.0
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,,,,0.0


In [31]:
# Clean dataframe from unuse columns

fetures_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    fetures_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    fetures_to_drop += ['delta_price_lag_'+str(i)]

matrix.drop(fetures_to_drop, axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,date_item_shop_category_avg_item_cnt_lag_1,city_coord_1,city_coord_2,country_part,delta_price_lag
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,,44.609764,40.100516,3,0.0
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,,44.609764,40.100516,3,0.0
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,,44.609764,40.100516,3,0.0
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,,44.609764,40.100516,3,0.0
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,,44.609764,40.100516,3,0.0


In [32]:
# Last month shop revenue trend

group = train.groupby(['date_block_num','shop_id']).agg({'revenue': ['sum']})
group.columns = ['date_shop_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_revenue'] = matrix['date_shop_revenue'].astype(np.float32)

group = group.groupby(['shop_id']).agg({'date_shop_revenue': ['mean']})
group.columns = ['shop_avg_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['shop_id'], how='left')
matrix['shop_avg_revenue'] = matrix['shop_avg_revenue'].astype(np.float32)

matrix['delta_revenue'] = (matrix['date_shop_revenue'] - matrix['shop_avg_revenue']) / matrix['shop_avg_revenue']
matrix['delta_revenue'] = matrix['delta_revenue'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'delta_revenue')

matrix.drop(['date_shop_revenue','shop_avg_revenue','delta_revenue'], axis=1, inplace=True)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,date_item_shop_category_avg_item_cnt_lag_1,city_coord_1,city_coord_2,country_part,delta_price_lag,delta_revenue_lag_1
0,0,2,19,0.0,0,1,40,11,4,,...,,,,,,44.609764,40.100516,3,0.0,
1,0,2,27,1.0,0,1,19,5,10,,...,,,,,,44.609764,40.100516,3,0.0,
2,0,2,28,0.0,0,1,30,8,55,,...,,,,,,44.609764,40.100516,3,0.0,
3,0,2,29,0.0,0,1,23,5,16,,...,,,,,,44.609764,40.100516,3,0.0,
4,0,2,32,0.0,0,1,40,11,4,,...,,,,,,44.609764,40.100516,3,0.0,


# Date features

In [33]:
matrix['month'] = matrix['date_block_num'] % 12

# Number of days in a month. There are no leap years.

days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days).astype(np.int8)

Months since the last sale for each shop/item pair and for item only. I use programing approach.

Create HashTable with key equals to {shop_id,item_id} and value equals to date_block_num. Iterate data from the top. Foreach row if {row.shop_id,row.item_id} is not present in the table, then add it to the table and set its value to row.date_block_num. if HashTable contains key, then calculate the difference beteween cached value and row.date_block_num.

In [34]:
cache = {}
matrix['item_shop_last_sale'] = -1
matrix['item_shop_last_sale'] = matrix['item_shop_last_sale'].astype(np.int8)
for idx, row in matrix.iterrows():    
    key = str(row.item_id)+' '+str(row.shop_id)
    if key not in cache:
        if row.item_cnt_month!=0:
            cache[key] = row.date_block_num
    else:
        last_date_block_num = cache[key]
        matrix.at[idx, 'item_shop_last_sale'] = row.date_block_num - last_date_block_num
        cache[key] = row.date_block_num 

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_subtype_avg_item_cnt_lag_1,date_item_shop_category_avg_item_cnt_lag_1,city_coord_1,city_coord_2,country_part,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale
0,0,2,19,0.0,0,1,40,11,4,,...,,,44.609764,40.100516,3,0.0,,0,31,-1
1,0,2,27,1.0,0,1,19,5,10,,...,,,44.609764,40.100516,3,0.0,,0,31,-1
2,0,2,28,0.0,0,1,30,8,55,,...,,,44.609764,40.100516,3,0.0,,0,31,-1
3,0,2,29,0.0,0,1,23,5,16,,...,,,44.609764,40.100516,3,0.0,,0,31,-1
4,0,2,32,0.0,0,1,40,11,4,,...,,,44.609764,40.100516,3,0.0,,0,31,-1


In [35]:
cache = {}
matrix['item_last_sale'] = -1
matrix['item_last_sale'] = matrix['item_last_sale'].astype(np.int8)
for idx, row in matrix.iterrows():    
    key = row.item_id
    if key not in cache:
        if row.item_cnt_month!=0:
            cache[key] = row.date_block_num
    else:
        last_date_block_num = cache[key]
        if row.date_block_num>last_date_block_num:
            matrix.at[idx, 'item_last_sale'] = row.date_block_num - last_date_block_num
            cache[key] = row.date_block_num

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,date_item_shop_category_avg_item_cnt_lag_1,city_coord_1,city_coord_2,country_part,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale,item_last_sale
0,0,2,19,0.0,0,1,40,11,4,,...,,44.609764,40.100516,3,0.0,,0,31,-1,-1
1,0,2,27,1.0,0,1,19,5,10,,...,,44.609764,40.100516,3,0.0,,0,31,-1,-1
2,0,2,28,0.0,0,1,30,8,55,,...,,44.609764,40.100516,3,0.0,,0,31,-1,-1
3,0,2,29,0.0,0,1,23,5,16,,...,,44.609764,40.100516,3,0.0,,0,31,-1,-1
4,0,2,32,0.0,0,1,40,11,4,,...,,44.609764,40.100516,3,0.0,,0,31,-1,-1


In [36]:
# Months since the first sale for each shop/item pair and for item only.
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,city_coord_2,country_part,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale
0,0,2,19,0.0,0,1,40,11,4,,...,40.100516,3,0.0,,0,31,-1,-1,0,0
1,0,2,27,1.0,0,1,19,5,10,,...,40.100516,3,0.0,,0,31,-1,-1,0,0
2,0,2,28,0.0,0,1,30,8,55,,...,40.100516,3,0.0,,0,31,-1,-1,0,0
3,0,2,29,0.0,0,1,23,5,16,,...,40.100516,3,0.0,,0,31,-1,-1,0,0
4,0,2,32,0.0,0,1,40,11,4,,...,40.100516,3,0.0,,0,31,-1,-1,0,0


Because of the using 12 as lag value drop first 12 months. Also drop all the columns with this month calculated values (other words which can not be calcucated for the test set).

In [37]:
matrix = matrix[matrix.date_block_num > 11]

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,city_coord_2,country_part,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale
4488756,12,2,27,0.0,0,1,19,5,10,0.0,...,40.100516,3,-0.282715,1.211914,0,31,1,1,12,12
4488757,12,2,30,0.0,0,1,40,11,4,0.0,...,40.100516,3,-0.483398,1.211914,0,31,1,1,11,11
4488758,12,2,31,0.0,0,1,37,11,1,0.0,...,40.100516,3,-0.137451,1.211914,0,31,1,1,11,11
4488759,12,2,32,1.0,0,1,40,11,4,0.0,...,40.100516,3,-0.407227,1.211914,0,31,-1,1,12,12
4488760,12,2,33,1.0,0,1,37,11,1,1.0,...,40.100516,3,-0.225464,1.211914,0,31,1,1,12,12


In [40]:
def count_days(date_block_num):
    year = 2013 + date_block_num // 12
    month = 1 + date_block_num % 12
    weeknd_count = len([1 for i in calendar.monthcalendar(year, month) if i[6] != 0])
    days_in_month = calendar.monthrange(year, month)[1]
    return weeknd_count, days_in_month, month

map_dict = {i: count_days(i) for i in range(35)}

matrix['weeknd_count'] = matrix['date_block_num'].apply(lambda x: map_dict[x][0])
matrix['days_in_month'] = matrix['date_block_num'].apply(lambda x: map_dict[x][1])
matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale,weeknd_count,days_in_month
4488756,12,2,27,0.0,0,1,19,5,10,0.0,...,-0.282715,1.211914,0,31,1,1,12,12,4,31
4488757,12,2,30,0.0,0,1,40,11,4,0.0,...,-0.483398,1.211914,0,31,1,1,11,11,4,31
4488758,12,2,31,0.0,0,1,37,11,1,0.0,...,-0.137451,1.211914,0,31,1,1,11,11,4,31
4488759,12,2,32,1.0,0,1,40,11,4,0.0,...,-0.407227,1.211914,0,31,-1,1,12,12,4,31
4488760,12,2,33,1.0,0,1,37,11,1,1.0,...,-0.225464,1.211914,0,31,1,1,12,12,4,31


In [41]:
# Fill NA value

def fill_na(df):
    for col in df.columns:
        if ('_lag_' in col) & (df[col].isnull().any()):
            if ('item_cnt' in col):
                df[col].fillna(0, inplace=True)         
    return df


In [42]:
matrix = fill_na(matrix)

matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,shop_category,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,...,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale,weeknd_count,days_in_month
4488756,12,2,27,0.0,0,1,19,5,10,0.0,...,-0.282715,1.211914,0,31,1,1,12,12,4,31
4488757,12,2,30,0.0,0,1,40,11,4,0.0,...,-0.483398,1.211914,0,31,1,1,11,11,4,31
4488758,12,2,31,0.0,0,1,37,11,1,0.0,...,-0.137451,1.211914,0,31,1,1,11,11,4,31
4488759,12,2,32,1.0,0,1,40,11,4,0.0,...,-0.407227,1.211914,0,31,-1,1,12,12,4,31
4488760,12,2,33,1.0,0,1,37,11,1,1.0,...,-0.225464,1.211914,0,31,1,1,12,12,4,31


In [43]:
matrix.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city_code',
       'shop_category', 'item_category_id', 'type_code', 'subtype_code',
       'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
       'item_cnt_month_lag_4', 'item_cnt_month_lag_5', 'item_cnt_month_lag_6',
       'item_cnt_month_lag_12', 'date_avg_item_cnt_lag_1',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2',
       'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6',
       'date_item_avg_item_cnt_lag_12', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cnt_lag_3',
       'date_shop_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_12',
       'date_cat_avg_item_cnt_lag_1', 'date_shop_cat_avg_item_cnt_lag_1',
       'date_shop_type_avg_item_cnt_lag_1',
       'date_shop_subtype_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1',
       'date_item_city_avg_item_cnt_lag_1', 'date_type_avg_item_cnt_lag_1',
  

In [44]:
matrix.to_pickle(r'C:\Users\anshch\Documents\Demand_Forecast\Data\data_2.pkl')

In [45]:
test.to_pickle(r'C:\Users\anshch\Documents\Demand_Forecast\Data\test_2.pkl')