In [None]:
import os

def scale_input_data(scale_factor):
  """Write a resized ``.scaled.csv`` copy of every input table.

  For each of the five tables under ``./input`` (items, shops,
  item_categories, sales_train, test) the file ``<name>.csv`` is read and
  ``<name>.scaled.csv`` is written next to it:

  * ``scale_factor == 1.0`` -- the file is copied byte-for-byte.
  * ``scale_factor < 1.0``  -- only the first ``int(scale_factor * rows)``
    rows are kept.
  * ``scale_factor > 1.0``  -- the rows are repeated from the top until the
    table has ``int(scale_factor * rows)`` rows.

  Args:
    scale_factor: non-negative multiplier applied to every table's row count.

  Raises:
    ValueError: if ``scale_factor`` is negative (a negative factor would
      otherwise silently drop rows from the end of each table).
  """
  # Imported once per call (not once per file, as before). They stay local
  # because this cell runs before the notebook's main import cell.
  import pandas as pd
  import shutil

  if scale_factor < 0:
    raise ValueError('scale_factor must be non-negative, got %r' % (scale_factor,))

  file_bases = ['./input/items', './input/shops', './input/item_categories',
                './input/sales_train', './input/test']
  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'

    if scale_factor == 1.0:
      # Nothing to resize: a plain copy avoids a CSV parse/serialize round trip.
      shutil.copyfile(source_path, scaled_path)
      continue

    df_to_scale = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      # Grow by appending a prefix of the current frame; the frame at most
      # doubles per pass, so only a few concatenations are needed.
      while len(df_to_scale) < new_num_rows:
        missing = new_num_rows - len(df_to_scale)
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(missing, len(df_to_scale))]])
    df_to_scale.to_csv(scaled_path, index=False)

# Resize the input tables only when the caller asked for it through the
# INPUT_SCALE_FACTOR environment variable (parsed as a float).
if os.environ.get('INPUT_SCALE_FACTOR') is not None:
  scale_input_data(scale_factor=float(os.environ['INPUT_SCALE_FACTOR']))

# Future sales prediction challenge

1. [Importing libs, add helper functions and adding data to ram.](#preparing)
2. [Data cleaning and feature engineering](#fe)
3. [Models learning](#models)

4. [References](#references)

There is no leak hunting in this notebook: I don't like it and I think it is against the spirit of the competition.

Notes:
This notebook is my solution to the capstone project of the "How to Win a Data Science Competition" course.


## 1. Importing libs, add helper functions and adding data to ram.
<a id='preparing'></a>

### Libs importing

In [1]:
import gc
import os
import sys
import time
import pickle
import random
import numpy as np
# import pandas as pd
# NOTE(review): this executes arbitrary Python taken from the IREWR_IMPORTS
# environment variable; later cells use the name `pd`, so the string is
# presumably expected to bind it -- confirm. It raises KeyError when the
# variable is unset. Only run this notebook in an environment you control.
exec(os.environ['IREWR_IMPORTS'])
# ALEX: remove plotting
# from numba import jit

# import seaborn as sns
from itertools import product
# ALEX: remove plotting
# import matplotlib.pyplot as plt
# from collections import OrderedDict
from tqdm import tqdm_notebook as tqdm


# ML libs
# ALEX: remove ML code
# from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer
# from sklearn.model_selection import KFold, ShuffleSplit
# from sklearn import metrics

# # Models which were used
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import HuberRegressor, LinearRegression, Ridge
# from catboost import CatBoostRegressor
# import lightgbm as lgbm

# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# from torch.utils.data import Dataset, DataLoader
# from torchvision import transforms


# # friendship of matplotlib and jupyter
# %matplotlib inline

# Let wide / long frames be displayed without truncation.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)
# ALEX: remove path printing
# print(os.listdir("./input"))
# print(sys.version_info)

# One seed shared by every random number generator used in the notebook.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

### Helpers

In [2]:
# ALEX: remove ML code
# @jit
# def rmse(predictions, targets):
#     return np.sqrt(((predictions - targets) ** 2).mean())

# def trainer(X,
#                 X_test,
#                 y,
#                 params,
#                 folds,
#                 columns=None,
#                 eval_metric='mae'
#                 ):
# #     columns = X.columns if columns is None else columns

#     result_dict = {}

#     # out-of-fold predictions on train data
#     oof = np.zeros(len(X))

#     # averaged predictions on test data
#     prediction = np.zeros(len(X_test))

#     # list of scores on folds
#     scores = []
#     feature_importance = pd.DataFrame()

#     # to set up scoring parameters
#     metrics_dict = {'mae': {'lgb_metric_name': 'mae',
#                             'catboost_metric_name': 'MAE',
#                             'sklearn_scoring_function': metrics.mean_absolute_error},
#                     'group_mae': {'lgb_metric_name': 'mae',
#                                   'catboost_metric_name': 'MAE'
#                                  },
#                     'mse': {'lgb_metric_name': 'mse',
#                             'catboost_metric_name': 'MSE',
#                             'sklearn_scoring_function': metrics.mean_squared_error}
#                     }

#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):

#         print(f'Fold {fold_n + 1} started at {time.ctime()}')

#         model = lgbm.LGBMRegressor(**params, n_jobs = -1)
            
#         if type(X) == np.ndarray:
#             X_train, X_valid = X[train_index], X[valid_index]
#             y_train, y_valid = y[train_index], y[valid_index]
#         else:
#             X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
#             y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
#         model.fit(X_train, y_train, 
#                     eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='rmse',
#                     verbose=0, early_stopping_rounds=300)
            
#         y_pred_valid = model.predict(X_valid)
#         y_pred = model.predict(X_test, num_iteration=model.best_iteration_)   


#         oof[valid_index] = y_pred_valid.reshape(-1, )
#         if eval_metric != 'group_mae':
#             scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
#         else:
#             scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

#         prediction += y_pred

#     prediction /= folds.n_splits

#     print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

#     result_dict['oof'] = oof
#     result_dict['prediction'] = prediction
#     result_dict['scores'] = scores

#     return result_dict

def print_info(dataset, name):
    """Print a quick textual overview of ``dataset``.

    The report consists of five sections, each preceded by a 70-dash rule:
    the name and shape, ``head()``, per-column ``nunique()``, ``describe()``
    cast to ``int32``, and the per-column NaN counts.

    Args:
        dataset: frame to summarise (anything offering ``shape``, ``head``,
            ``nunique``, ``describe`` and ``isna``).
        name: label shown in the first section.
    """
    rule = "\n" + "-" * 70 + '\n'

    sections = [
        (f"Info about {name} with shape {dataset.shape}",),
        (dataset.head(),),
        (dataset.nunique(),),
        (dataset.describe().astype('int32'),),
        ('Count NaN values \n', dataset.isna().sum()),
    ]

    # Interleave the rule with the sections; print() joins everything with
    # single spaces, exactly like one long argument list would.
    pieces = []
    for section in sections:
        pieces.append(rule)
        pieces.extend(section)
    print(*pieces)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are considered; every other column is left as it is. Integer columns get
    the first of int8, int16, int32, int64 whose range strictly contains the
    column's min and max (no change if none does). Float columns get float16
    or float32 under the same strict-range test, otherwise float64.

    Args:
        df: frame to shrink; its columns are reassigned in place.
        verbose: when true, print the resulting size and percentage saved.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024 ** 2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith('int'):
            # Smallest integer type whose open range holds every value.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback when neither narrower float type fits
            # (this includes the case where the comparisons are False for NaN).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

### Data loading

In [3]:
# Row limit for the training table; None makes read_csv load every row.
nrows = None
# All tables are read from the ".scaled.csv" files, not the raw ".csv" ones.
items = pd.read_csv('./input/items.scaled.csv')
shops = pd.read_csv('./input/shops.scaled.csv')
cats = pd.read_csv('./input/item_categories.scaled.csv')
# ALEX: sales_train_v2.csv not present
# try:
#     train = pd.read_csv('./input/sales_train_v2.csv', nrows=nrows)
# except:
train = pd.read_csv('./input/sales_train.scaled.csv', nrows=nrows)
# Use ID as the index so it does not have to be dropped later.
test  = pd.read_csv('./input/test.scaled.csv').set_index('ID')

## 2 Data cleaning and feature engineering
<a id="fe"></a>

### Work with train/test data

In [4]:
# Text overview (head, unique counts, describe, NaN counts) of both tables.
print_info(train, 'train')
print_info(test, 'test')


----------------------------------------------------------------------
 Info about train with shape (2935849, 6) 
----------------------------------------------------------------------
          date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0 
----------------------------------------------------------------------
 date               1034
date_block_num       34
shop_id              60
item_id           21807
item_price        19993
item_cnt_day        198
dtype: int64 
----------------------------------------------------------------------
        date_block_num  shop_id  item_id  item_price  item_cnt_day


Observations:
* There are no NaN values.
* There are 34 time periods in train and 1 in test.
* The minimum in train is -22 items per day (outliers or returned goods?).
* The maximum item price is 59k and the maximum item_cnt_day is 1k (outliers? noise?).
* There are 60 shops in train and 42 in test (do we need the 18 shops that are never tested?).

In [5]:
# Shops that occur in train but never in test.
print(f"{len(set(train.shop_id) - set(test.shop_id))} shops only in train.")
# Rows of train with a negative daily count.
print(f"{len(train.item_cnt_day[train.item_cnt_day < 0])} negative items_cnt_day")
# Reminder of the prediction target (constant message).
print(f"We need to predict clipped into [0, 20] range item_cnt_month column")

18 shops only in train.
7356 negative items_cnt_day
We need to predict clipped into [0, 20] range item_cnt_month column


In [6]:
# Fixing outliers: keep only rows with item_price below 48000 and
# item_cnt_day below 1000. `train` is rebound to the filtered frame.
train = train[train.item_price<48000]
train = train[train.item_cnt_day<1000]

In [7]:
# Item count per day (target) visualization

# ALEX: remove plotting
# plt.figure(figsize=(20,5))
# fig, ax = plt.subplots(figsize=(20,5))
# g = sns.boxplot(train.item_cnt_day, palette="Set3", ax=ax)
# plt.show()
# With the plot disabled, only the column access itself remains.
_ = train.item_cnt_day

In [8]:
# Costs of each product
# ALEX: remove plotting
# plt.figure(figsize=(20,5))
# fig, ax = plt.subplots(figsize=(20,5))
# g = sns.boxplot(train.item_price, palette="Set3", ax=ax)
# plt.show()
# With the plot disabled, only the column access itself remains.
_ = train.item_price

### Work with categories

In [9]:
# Text overview of the item-category table.
print_info(cats, "Categories")


----------------------------------------------------------------------
 Info about Categories with shape (84, 2) 
----------------------------------------------------------------------
         item_category_name  item_category_id
0  PC - Гарнитуры/Наушники                 0
1         Аксессуары - PS2                 1
2         Аксессуары - PS3                 2
3         Аксессуары - PS4                 3
4         Аксессуары - PSP                 4 
----------------------------------------------------------------------
 item_category_name    84
item_category_id      84
dtype: int64 
----------------------------------------------------------------------
        item_category_id
count                84
mean                 41
std                  24
min                   0
25%                  20
50%                  41
75%                  62
max                  83 
----------------------------------------------------------------------
 Count NaN values 
 item_category_name    0
it

In [10]:
# Display the category table.
cats

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4
5,Аксессуары - PSVita,5
6,Аксессуары - XBOX 360,6
7,Аксессуары - XBOX ONE,7
8,Билеты (Цифра),8
9,Доставка товара,9


* Category names follow a "category - subcategory" format (usable as two features), but row 32 contains a small formatting mistake.

In [11]:
# Create category, subcategory features
# Rewrite the name stored at index label 32 so it splits on '-' like the others.
# NOTE(review): assumes label 32 exists in `cats`; with a down-scaled input
# that has fewer rows, .loc would presumably add a new row instead -- confirm.
cats.loc[32, "item_category_name"] = 'Карты оплаты - (Кино, Музыка, Игры)'
cats['split'] = cats['item_category_name'].str.split('-')
# Category = text before the first '-', stripped.
cats['type'] = cats['split'].map(lambda x: x[0].strip())
# ALEX: remove ML code
# cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
# With label encoding disabled, the "code" column holds the raw string.
cats['type_code'] = cats['type']
# Subcategory = second piece of the split; if there is no '-', reuse the category.
cats['subtype'] = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
# ALEX: remove ML code
# cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats['subtype_code'] = cats['subtype']
# Keep only the id and the two derived columns.
cats = cats[['item_category_id','type_code', 'subtype_code']]
cats

Unnamed: 0,item_category_id,type_code,subtype_code
0,0,PC,Гарнитуры/Наушники
1,1,Аксессуары,PS2
2,2,Аксессуары,PS3
3,3,Аксессуары,PS4
4,4,Аксессуары,PSP
5,5,Аксессуары,PSVita
6,6,Аксессуары,XBOX 360
7,7,Аксессуары,XBOX ONE
8,8,Билеты (Цифра),Билеты (Цифра)
9,9,Доставка товара,Доставка товара


### Work with shops

In [12]:
# Text overview of the shops table.
print_info(shops, "Shops")


----------------------------------------------------------------------
 Info about Shops with shape (60, 2) 
----------------------------------------------------------------------
                         shop_name  shop_id
0   !Якутск Орджоникидзе, 56 фран        0
1   !Якутск ТЦ "Центральный" фран        1
2                Адыгея ТЦ "Мега"        2
3  Балашиха ТРК "Октябрь-Киномир"        3
4        Волжский ТЦ "Волга Молл"        4 
----------------------------------------------------------------------
 shop_name    60
shop_id      60
dtype: int64 
----------------------------------------------------------------------
        shop_id
count       60
mean        29
std         17
min          0
25%         14
50%         29
75%         44
max         59 
----------------------------------------------------------------------
 Count NaN values 
 shop_name    0
shop_id      0
dtype: int64


In [13]:
# Display the shops table.
shops

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6
7,"Воронеж ТРЦ ""Максимир""",7
8,"Воронеж ТРЦ Сити-Парк ""Град""",8
9,Выездная Торговля,9


Observations:
* The first word is the city name.
* The second word (when present) is the type of shop.
* A few fields are malformed.
* One shop type is delivery.
* There are two Якутск duplicates; they are fixed later.

In [14]:
# Fixing some names
# Join the two-word city name so that the split on ' ' below yields it as one token.
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
# City = first space-separated token of the shop name.
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
# Drop the leading '!' that some Якутск entries carry.
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
# ALEX: remove ML code
# shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
# With label encoding disabled, the "code" column holds the raw city string.
shops['city_code'] = shops['city']
shops = shops[['shop_id','city_code']]

In [15]:
# Histogram of shops
# One-element list holding the set of shop ids seen in train or test.
# NOTE(review): `xticks` is not used by any visible cell now that the plot is disabled.
xticks = [set(train.shop_id.unique()).union(test.shop_id.unique())]
# ALEX: remove ML code
# plt.figure(figsize=(20,10))
# fig, ax = plt.subplots(figsize=(20,10))
# g = sns.distplot(train.shop_id, kde=False, label="Train", ax=ax)
# sns.distplot(test.shop_id, kde=False, label="Test")
# plt.legend()
# plt.show()
# With the plot disabled, only the column accesses remain.
_ = train.shop_id
_ = test.shop_id

### Work with items

In [16]:
# Text overview of the items table.
print_info(items, 'Items')


----------------------------------------------------------------------
 Info about Items with shape (22170, 3) 
----------------------------------------------------------------------
                                            item_name  item_id  \
0          ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D        0   
1  !ABBYY FineReader 12 Professional Edition Full...        1   
2      ***В ЛУЧАХ СЛАВЫ   (UNV)                    D        2   
3    ***ГОЛУБАЯ ВОЛНА  (Univ)                      D        3   
4        ***КОРОБКА (СТЕКЛО)                       D        4   

   item_category_id  
0                40  
1                76  
2                40  
3                40  
4                40   
----------------------------------------------------------------------
 item_name           22170
item_id             22170
item_category_id       84
dtype: int64 
----------------------------------------------------------------------
        item_id  item_category_id
count    22170        

In [17]:
# First rows of the items table.
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


* Okay, there are a lot of items.

### Feature engineering

#### Steps:
1. Bring train into the same shape as test (we have item counts per day, but need item counts per month).
1. Merge all supplementary data.

In [18]:
# Remap shop ids in both train and test: 0 -> 57, 1 -> 58, 10 -> 11.
# NOTE(review): presumably each pair is the same physical shop listed twice
# (the notes above mention Якутск duplicates) -- confirm against the shop names.
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Yakutsk, "Tsentralny" shopping centre
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Zhukovsky, Chkalova St. 39m²
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [19]:
# Make matrix of all possible combinations of shops and items.
# Build, month by month, every (month, shop, item) combination of the shops
# and items that have at least one sale in that month.
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    # reshape(-1, len(cols)) is a no-op for a normal month. For a month with
    # no sales rows (e.g. when the input was down-scaled) the product is empty
    # and np.array yields shape (0,), which np.vstack cannot stack with the
    # (n, 3) blocks -- the reshape turns it into an empty (0, 3) block instead.
    matrix.append(np.array(list(product([i],
                                sales.shop_id.unique(),
                                sales.item_id.unique())),
                                dtype='int16').reshape(-1, len(cols)))

matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
# Narrow the key columns to the dtypes used throughout the rest of the notebook.
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)

In [20]:
# Monthly target: sum of item_cnt_day per (month, shop, item).
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

# Left-join onto the full grid; combinations without sales get NaN, filled with 0 below.
matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20) # NB clip target here
                                .astype(np.float16))

In [21]:
# Tag the test rows as month 34 and align their key dtypes with `matrix`.
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)
# Append the test rows below the training grid. The former `keys=cols`
# argument was dropped: with ignore_index=True the keys never reach the
# result, and passing 3 keys for 2 frames is a length mismatch that recent
# pandas versions deprecate.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False)
matrix.fillna(0, inplace=True) # month 34 has no target yet, so it becomes 0

In [22]:
# First rows of the (daily) training table.
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [23]:
# First rows of the monthly grid.
matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,2,19,0.0
1,0,2,27,1.0
2,0,2,28,0.0
3,0,2,29,0.0
4,0,2,32,0.0


In [24]:
# Add Shops, Items, Categories features
# Left joins: rows of `matrix` are kept even when the lookup table has no match.
matrix = pd.merge(matrix, shops, on=['shop_id'], how='left')
matrix = pd.merge(matrix, items, on=['item_id'], how='left')
matrix = pd.merge(matrix, cats, on=['item_category_id'], how='left')
# ALEX: remove ML code
# matrix['city_code'] = matrix['city_code'].astype(np.int8)
# Self-assignment left in place of the disabled int8 cast; values are unchanged.
matrix['city_code'] = matrix['city_code']
# NOTE(review): this cast presumably fails if the items join left NaN in
# item_category_id (e.g. items missing from a down-scaled items table) -- confirm.
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
# ALEX: remove ML code
# matrix['type_code'] = matrix['type_code'].astype(np.int8)
# matrix['subtype_code'] = matrix['subtype_code'].astype(np.int8)
# Self-assignments left in place of the disabled int8 casts; values are unchanged.
matrix['type_code'] = matrix['type_code']
matrix['subtype_code'] = matrix['subtype_code']
# The free-text item name is not used as a feature.
matrix = matrix.drop(['item_name'], axis=1)

In [25]:
# A short look-back window (7 months) is used instead of the full 33 months
# because of RAM limits; the author notes that this affects the score.

# lookback_range = list(range(1, 33 + 1)) # Takes lots of RAM
lookback_range = list(range(1, 7 + 1))
new_features = []

# Add lag features

# Previous sales of the same (shop, item) pair, one column per lag:
# how much of the item that shop sold `diff` months earlier.
for diff in tqdm(lookback_range):
    feature_name = 'prev_shopitem_sales_' + str(diff)
    # Copy only the join keys and the target. Copying the whole matrix (which
    # gains a column on every pass) is unnecessary, as the merge below only
    # ever used these four columns.
    mx2 = matrix[['shop_id', 'item_id', 'date_block_num', 'item_cnt_month']].copy()
    # Shifting the month forward by `diff` aligns month m with month m + diff.
    mx2.loc[:, 'date_block_num'] += diff
    mx2.rename(columns={'item_cnt_month': feature_name}, inplace=True)
    matrix = matrix.merge(mx2, on = ['shop_id', 'item_id', 'date_block_num'], how = 'left')
    # No row `diff` months back means no recorded sales: use 0.
    matrix[feature_name] = matrix[feature_name].fillna(0)
    new_features.append(feature_name)

# Previous sales of the item across all shops, one column per lag:
# mean item_cnt_month of the item `diff` months earlier.
# `groups` keeps referring to the frame as it is here, even though `matrix`
# is rebound inside the loop.
groups = matrix.groupby(by = ['item_id', 'date_block_num'])
for diff in tqdm(lookback_range):
    feature_name = 'prev_item_sales_' + str(diff)
    result = groups.agg({'item_cnt_month':'mean'})
    result = result.reset_index()
    result.loc[:, 'date_block_num'] += diff
    result.rename(columns={'item_cnt_month': feature_name}, inplace=True)
    matrix = matrix.merge(result, on = ['item_id', 'date_block_num'], how = 'left')
    matrix[feature_name] = matrix[feature_name].fillna(0)
    new_features.append(feature_name)

# NOTE(review): this loop repeats the previous one with the same grouping and
# aggregation, only under '_'-prefixed column names. It looks like a
# copy-paste leftover, but it is kept because the resulting columns are part
# of the feature set shown in the outputs -- confirm before removing.
groups = matrix.groupby(by = ['item_id', 'date_block_num'])
for diff in tqdm(lookback_range):
    feature_name = '_prev_item_sales_' + str(diff)
    result = groups.agg({'item_cnt_month':'mean'})
    result = result.reset_index()
    result.loc[:, 'date_block_num'] += diff
    result.rename(columns={'item_cnt_month': feature_name}, inplace=True)
    matrix = matrix.merge(result, on = ['item_id', 'date_block_num'], how = 'left')
    matrix[feature_name] = matrix[feature_name].fillna(0)
    new_features.append(feature_name)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for diff in tqdm(lookback_range):


  0%|          | 0/7 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for diff in tqdm(lookback_range):


  0%|          | 0/7 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for diff in tqdm(lookback_range):


  0%|          | 0/7 [00:00<?, ?it/s]

#### Mean encoding of some columns

In [26]:
# Despite the section title, this is a frequency encoding, not a target-mean
# encoding. For every chosen categorical column:
#   1. count how often each distinct value occurs,
#   2. divide the counts by the number of rows (relative frequency),
#   3. map every row's value to that frequency in a new "me_<column>" column.

me_cols = ['shop_id', 'item_id', 'city_code', 'item_category_id', 'type_code', 'subtype_code']
for cl in me_cols:
    me_col_name = "me_" + cl
    # One vectorized division instead of a Python lambda that re-evaluated
    # len(matrix[cl]) for every distinct value; the resulting numbers are the same.
    matrix[me_col_name] = matrix[cl].map(matrix[cl].value_counts() / len(matrix))

In [27]:
# First rows of the grid with all engineered features.
matrix.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,item_category_id,type_code,subtype_code,prev_shopitem_sales_1,prev_shopitem_sales_2,prev_shopitem_sales_3,prev_shopitem_sales_4,prev_shopitem_sales_5,prev_shopitem_sales_6,prev_shopitem_sales_7,prev_item_sales_1,prev_item_sales_2,prev_item_sales_3,prev_item_sales_4,prev_item_sales_5,prev_item_sales_6,prev_item_sales_7,_prev_item_sales_1,_prev_item_sales_2,_prev_item_sales_3,_prev_item_sales_4,_prev_item_sales_5,_prev_item_sales_6,_prev_item_sales_7,me_shop_id,me_item_id,me_city_code,me_item_category_id,me_type_code,me_subtype_code
0,0,2,19,0.0,Адыгея,40,Кино,DVD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021478,4e-06,0.021478,0.227228,0.373245,0.227228
1,0,2,27,1.0,Адыгея,19,Игры,PS3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021478,6.6e-05,0.021478,0.030764,0.08385,0.036587
2,0,2,28,0.0,Адыгея,30,Игры PC,Стандартные издания,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021478,5.3e-05,0.021478,0.034609,0.085514,0.034609
3,0,2,29,0.0,Адыгея,23,Игры,XBOX 360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021478,2.9e-05,0.021478,0.023493,0.08385,0.028004
4,0,2,32,0.0,Адыгея,40,Кино,DVD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021478,0.000146,0.021478,0.227228,0.373245,0.227228


In [28]:
# Reduce memory usage
# ALEX: remove GC code
# import gc
# The lookup tables and the daily train table have been merged into `matrix`
# by the cells above, so their names are released here.
del items, shops, cats, train

# Downcast numeric columns of both frames (see reduce_mem_usage).
matrix = reduce_mem_usage(matrix)
test = reduce_mem_usage(test)

Mem. usage decreased to 986.95 Mb (40.8% reduction)
Mem. usage decreased to  2.45 Mb (0.0% reduction)


## 3 Models learning <a id="models"></a>

1. SKlearn linear models
2. Sklearn RandomForestRegressor
3. Lightgbm regressor


In [29]:
# Months 0..33 form the training set; month 34 (the appended test rows) is
# the prediction set. The target column is removed from both feature frames.
X_train = matrix[matrix.date_block_num <= 33].drop(['item_cnt_month'], axis=1)
y = matrix[matrix.date_block_num <= 33]['item_cnt_month']
X_test = matrix[matrix.date_block_num == 34].drop(['item_cnt_month'], axis=1)
del matrix
# ALEX: remove GC code
# gc.collect()

In [30]:
# Before fitting models the dataset needs to be preprocessed (not necessary for forests).
# ALEX: remove ML code
# x_stsc = StandardScaler().fit(pd.concat([X_train, X_test]))
# X_train = x_stsc.transform(X_train)
# X_test = x_stsc.transform(X_test)
# With the scaler disabled, only the concatenation itself remains; its result is discarded.
_ = pd.concat([X_train, X_test])

In [31]:
# ALEX: remove ML code
# cv = KFold(3, shuffle=True)

### Few Linear models 

In [32]:
# ALEX: remove ML code
# def linear_model(
#                 X,
#                 X_test,
#                 y,
#                 model,
#                 params,
#                 folds,
#                 columns=None):

#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):

#         print(f'Fold {fold_n + 1} started at {time.ctime()}')

#         result_dict = {}

#         # out-of-fold predictions on train data
#         oof = np.zeros(len(X))

#         # averaged predictions on test data
#         prediction = np.zeros(len(X_test))

#         # list of scores on folds
#         scores = []
#         feature_importance = pd.DataFrame()

#         X_train, X_valid = X[train_index], X[valid_index]
#         y_train, y_valid = y[train_index], y[valid_index]

#         model.fit(X_train, y_train)

#         y_pred_valid = model.predict(X_valid)
#         y_pred = model.predict(X_test)

#         oof[valid_index] = y_pred_valid.reshape(-1, )
        
#         scores.append(rmse(y_valid, y_pred_valid))

#         prediction += y_pred
        
#         gc.collect()
    
#     prediction /= folds.n_splits

#     print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

#     result_dict['oof'] = oof
#     result_dict['prediction'] = prediction
#     result_dict['scores'] = scores

#     return result_dict

In [33]:
# ALEX: remove ML code
# linreg_best_params= {
#     'n_jobs': -1
# }
# ridge_best_params = {
#     'alpha': 0.7,
#     'solver': 'lsqr',
#     'random_state': RANDOM_SEED
# }
# lr_model = LinearRegression(**linreg_best_params)
# ridge_model = Ridge(**ridge_best_params)

# linreg_results = linear_model(X_train, X_test, y,
#                              lr_model,
#                              linreg_best_params,
#                              cv)

# ridge_results = linear_model(X_train, X_test, y,
#                              ridge_model,
#                              linreg_best_params,
#                              cv)


In [34]:
# ALEX: remove GC code
# gc.collect()

### RandomForestRegressor

In [35]:
# ALEX: remove ML code
# def random_forest_regressor(
#                 X,
#                 X_test,
#                 y,
#                 params,
#                 folds,
#                 columns=None):

#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):

#         print(f'Fold {fold_n + 1} started at {time.ctime()}')

#         result_dict = {}

#         # out-of-fold predictions on train data
#         oof = np.zeros(len(X))

#         # averaged predictions on test data
#         prediction = np.zeros(len(X_test))

#         # list of scores on folds
#         scores = []
#         feature_importance = pd.DataFrame()


#         if type(X) == np.ndarray:
#             X_train, X_valid = X[train_index], X[valid_index]
#             y_train, y_valid = y[train_index], y[valid_index]
#         else:
#             X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
#             y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]



#         model = RandomForestRegressor(**params)

#         model.fit(X_train, y_train)

#         y_pred_valid = model.predict(X_valid)
#         y_pred = model.predict(X_test)

#         oof[valid_index] = y_pred_valid.reshape(-1, )
        
#         scores.append(rmse(y_valid, y_pred_valid))

#         prediction += y_pred
#         gc.collect()
#     prediction /= folds.n_splits

#     print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

#     result_dict['oof'] = oof
#     result_dict['prediction'] = prediction
#     result_dict['scores'] = scores
#     gc.collect()
#     return result_dict



In [36]:
# Here I use smaller number of estimators. 4 instead 20. I decided to do this, wrt to number of cores in notebook.
# rf_best_params= {
#  'bootstrap': True,
#  'max_depth': 70,
#  'max_features': 'auto',
#  'min_samples_leaf': 4,
#  'min_samples_split': 10,
#  'n_estimators': 4,
#  'n_jobs': -1,
#  'verbose': 2
# }

# ALEX: remove ML code
# rf_best_params= {
#  'bootstrap': True,
#  'max_depth': 40,
#  'max_features': 'auto',
#  'min_samples_leaf': 4,
#  'min_samples_split': 10,
#  'n_estimators': 4,
#  'n_jobs': -1,
#  'verbose': 2
# }

# rf_results = random_forest_regressor(X_train, X_test, y, 
#                                      rf_best_params,
#                                      cv)

## Lightgbm

In [37]:
# ALEX: remove ML code
# params= {
#     'num_leaves': 64,
#     "max_depth": 8,
#     'max_bin': 32,
#     "iterations": 200,
#     "data_random_seed":0,
#     'n_estimators': 1000
# }
# lgb_results = trainer(X_train, X_test, y, params, cv)

## Stacking
Didn't improve my submission

In [38]:
# ALEX: remove ML code
# oof_preds = np.array([ridge_results['oof'],
#                       linreg_results['oof'],
#                       rf_results['oof'],
#                       lgb_results['oof']
#                      ]).T

# preds = np.array([ridge_results['prediction'],
#                   linreg_results['prediction'],
#                   rf_results['prediction'],
#                   lgb_results['prediction']
#                  ]).T

In [39]:
# Read all generated predictions

# ALEX: remove ML code
# lvl2_model = LinearRegression()
# lvl2_model.fit(oof_preds, y)
# lvl2_model.score(oof_preds, y)
# lvl2_result = lvl2_model.predict(preds)

In [41]:
# ALEX: remove ML code
# submission = pd.DataFrame({
#     "ID": test.index, 
#     "item_cnt_month": lvl2_result
# })
# With the models disabled, the test index is used as a placeholder for the
# predictions, so the written file does not contain real forecasts.
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": test.index
})
# Clip to the [0, 20] range stated earlier for item_cnt_month.
submission['item_cnt_month'] = submission['item_cnt_month'].clip(0, 20)
# submission.to_csv('lvl2.1.csv',index=False)
submission.to_csv('submission.csv',index=False)

## 4 References <a id="references"></a>
I really appreciate these sources of code and the people who wrote them. I definitely recommend checking out these links.

1. [Feature engineering, xgboost](https://www.kaggle.com/dlarionov/feature-engineering-xgboost)
2. [Predict Future Sales Top 11 Solution](https://www.kaggle.com/szhou42/predict-future-sales-top-11-solution)
3. [Artgor utils](https://www.kaggle.com/artgor/artgor-utils)

4. [Hyperparameter Tuning the Random Forest in Python](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74)



## 5 Tried, but didn't work

I didn't finish this pipeline, because when I tested my NN it showed too high an RMSE score.
But if you want to run it, just comment out the rows with test predictions.

In [None]:
# !pip install pytorch-ignite

In [None]:
# ---------------------------------------------------------------------------
# Disabled experiment: network and dataset wrappers for the MLP pipeline.
#
# Summary of the commented-out code below:
#   * FSPNet          - four Linear layers (input_dim -> 512 -> 1024 -> 256 -> 1)
#                       with ReLU applied after the first three.
#   * FSPDataset      - wraps X and y; length is len(y); each item is the pair
#                       (X[idx], y[idx]) converted to float tensors.
#   * FSPDataset_test - wraps X only; length is len(X); each item is X[idx]
#                       converted to a float tensor.
#
# NOTE(review): this cell imports only ignite and os; `torch`, `nn`, `F` and
#   `Dataset` are used but not imported here - presumably they come from an
#   earlier cell; confirm before re-enabling.
# NOTE(review): CUDA_VISIBLE_DEVICES is hard-coded to "2", which is specific to
#   the original author's machine.
# ---------------------------------------------------------------------------
# from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
# from ignite.metrics import RootMeanSquaredError, Loss
# from ignite.handlers import ModelCheckpoint, EarlyStopping
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
# os.environ["CUDA_VISIBLE_DEVICES"]="2"

# class FSPNet(nn.Module):
#     def __init__(self, input_dim):
#         super(FSPNet, self).__init__()
        
#         self.l1 = nn.Linear(input_dim, 512)
#         self.l2 = nn.Linear(512, 1024)
#         self.l3 = nn.Linear(1024, 256)
#         self.l4 = nn.Linear(256, 1)
        
#     def forward(self, x):
#         x = F.relu(self.l1(x))
#         x = F.relu(self.l2(x))
#         x = F.relu(self.l3(x))
#         x = self.l4(x)
#         return x
    
# class FSPDataset(Dataset):

#     def __init__(self, X, y):
#         self.X = X
#         self.y = y
        
#     def __len__(self):
#         return len(self.y)
    
#     def __getitem__(self, idx):
#         return torch.tensor(self.X[idx]).float(), torch.tensor(self.y[idx]).float()

# class FSPDataset_test(Dataset):
#     def __init__(self, X):
#         self.X = X
    
#     def __len__(self):
#         return len(self.X)
    
#     def __getitem__(self, idx):
#         return torch.tensor(self.X[idx]).float()

In [None]:
# ---------------------------------------------------------------------------
# Disabled experiment: cross-validated training loop for FSPNet.
#
# Summary of the commented-out code below: for each split produced by
# folds.split(X) it builds FSPDataset/DataLoader pairs (batch size 512), trains
# an FSPNet with Adam (lr=0.001) and MSELoss through an ignite trainer, attaches
# EarlyStopping (patience=10, scored on negative MSELoss) to the evaluator,
# logs RMSE/MSELoss for train and validation data after each epoch, and then
# predicts on X_valid and X_test.
#
# NOTE(review) - issues to fix before re-enabling:
#   1. result_dict, oof, prediction and scores are created inside the fold
#      loop, so every fold resets them; the later division by folds.n_splits
#      and the mean/std over scores then only see the last fold.
#   2. `y = y.values` is executed on every fold; assuming y arrives as a pandas
#      object, it is already an array on the second fold. The non-ndarray
#      branch also calls y.iloc after this conversion.
#   3. The model is moved to `device`, but the inference inputs are forced
#      with .to('cpu'); this looks like a device mismatch when CUDA is used.
#   4. `prediction += y_pred` adds the raw model output (no detach/numpy
#      conversion, unlike y_pred_valid) to a numpy array - verify shape/type.
#   5. result_dict is filled at the end but never returned.
#   6. `columns`, `rmse`, `RunningAverage` and `DataLoader` are not defined in
#      this cell - presumably provided elsewhere; confirm.
# ---------------------------------------------------------------------------
# def mlp_trainer(
#                 X,
#                 X_test,
#                 y,
#                 folds,
#                 epochs=15):

#     for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):

#         print(f'Fold {fold_n + 1} started at {time.ctime()}')

#         result_dict = {}

#         # out-of-fold predictions on train data
#         oof = np.zeros(len(X))

#         # averaged predictions on test data
#         prediction = np.zeros(len(X_test))

#         # list of scores on folds
#         scores = []
        
#         y = y.values
#         if type(X) == np.ndarray:
#             X_train, X_valid = X[train_index], X[valid_index]
#             y_train, y_valid = y[train_index], y[valid_index]
#         else:
            
#             X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
#             y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]


#         train_dataset = FSPDataset(X_train, y_train)
#         valid_dataset = FSPDataset(X_valid, y_valid)
        
#         train_loader = DataLoader(train_dataset, batch_size=512, 
#                                   shuffle=True, num_workers=5)
#         valid_loader = DataLoader(valid_dataset, batch_size=512,
#                                   shuffle=True, num_workers=5)
        
#         # init model and move to CPU or GPU
#         model = FSPNet(X_train.shape[1])
#         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#         print(device)
#         model.to(device)
        
#         # Init optimizers
#         optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#         criterion = nn.MSELoss()
        
#         trainer = create_supervised_trainer(model, 
#                                             optimizer, 
#                                             criterion, 
#                                             device=device)
        
#         metrics = {
#             'RMSE':RootMeanSquaredError(),
#             'MSELoss':Loss(criterion)

#         }
#         evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
#         training_history = {'RMSE':[],'loss':[]}
#         validation_history = {'RMSE':[],'loss':[]}
#         last_epoch = []
        
#         RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
        
#         def score_function(engine):
#             val_loss = engine.state.metrics['MSELoss']
#             return -val_loss

#         handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
#         evaluator.add_event_handler(Events.COMPLETED, handler)
        
#         @trainer.on(Events.EPOCH_COMPLETED)
#         def log_training_results(trainer):
#             evaluator.run(train_loader)
#             metrics = evaluator.state.metrics
#             accuracy = metrics['RMSE']
#             loss = metrics['MSELoss']
#             last_epoch.append(0)
#             training_history['RMSE'].append(accuracy)
#             training_history['loss'].append(loss)
#             print("Training Results - Epoch: {}  Avg RMSE: {:.2f} Avg MSELoss: {:.2f}"
#                   .format(trainer.state.epoch, accuracy, loss))

#         def log_validation_results(trainer):
#             evaluator.run(valid_loader)
#             metrics = evaluator.state.metrics
#             accuracy = metrics['RMSE']
#             loss = metrics['MSELoss']
#             validation_history['RMSE'].append(accuracy)
#             validation_history['loss'].append(loss)
#             print("Validation Results - Epoch: {}  Avg RMSE: {:.2f} Avg MSELoss: {:.2f}"
#                   .format(trainer.state.epoch, accuracy, loss))

#         trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)
        
#         print('Training started...')
#         trainer.run(train_loader, max_epochs=epochs)
                
        
        
#         y_pred_valid = model.forward(torch.tensor(X_valid).float().to('cpu'))
        
#         y_pred = model.forward(torch.tensor(X_test).float().to('cpu'))

#         oof[valid_index] = y_pred_valid.detach().numpy().reshape(-1, )
        
#         scores.append(rmse(y_valid, y_pred_valid.detach().numpy()))

#         prediction += y_pred

#     prediction /= folds.n_splits

#     print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

#     result_dict['oof'] = oof
#     result_dict['prediction'] = prediction
#     result_dict['scores'] = scores

In [None]:
# mlp_trainer(X_train,
#             X_test, y,
#             cv, epochs=1)
