In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 70)
pd.set_option('display.max_rows', None)
import seaborn as sns
sns.set_style(style='whitegrid')
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime, date
from itertools import product
import re
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from tqdm import tqdm_notebook
import pickle
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')
import time
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
def downcast_dtypes(df):
    '''
        Narrow 64-bit numeric columns of `df` to 32 bits.

                `float64` columns become `float32`
                `int64`   columns become `int32`

        The frame is modified in place and also returned, so both
        `downcast_dtypes(df)` and `df = downcast_dtypes(df)` work.
    '''

    # (wide dtype name, narrow numpy type), processed in this order
    conversions = (("float64", np.float32), ("int64", np.int32))

    for wide, narrow in conversions:
        # Columns currently stored with the wide dtype
        wide_cols = [name for name in df if df[name].dtype == wide]
        df[wide_cols] = df[wide_cols].astype(narrow)

    return df

Loading data

In [None]:
# First column of sales_train.csv becomes the index (it is not used as a feature later).
sales = pd.read_csv('data/sales_train.csv', index_col=[0])

# Keep every column of test.csv as a regular column. The submission cell reads
# test['ID'] (presumably the first column of the file -- confirm), and a value
# stored in the index would be discarded by the pd.merge calls applied to `test`.
test = pd.read_csv('data/test.csv')

# The first column of these two files is used as the index here and copied into
# a named column in a later cell.
item_categories = pd.read_csv('data/item_categories.csv', index_col=[0])

items = pd.read_csv('data/items.csv', index_col=[0])

shops = pd.read_csv('data/shops.csv')

In [None]:
# Boxplot of daily item counts, x-axis limited to [-100, 3000]
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_xlim(-100, 3000)
sns.boxplot(x=sales.item_cnt_day, ax=ax)

# Boxplot of item prices, x-axis from the minimum to 110% of the maximum price
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_xlim(sales.item_price.min(), sales.item_price.max() * 1.1)
sns.boxplot(x=sales.item_price, ax=ax)

In [None]:
# Keep only rows whose price is below 100000 and whose daily count is below 1001
within_limits = (sales.item_price < 100000) & (sales.item_cnt_day < 1001)
sales = sales[within_limits]

Get city

In [None]:
# City = first space-separated token of the shop name
shops['city'] = shops['shop_name'].str.split(' ').map(lambda tokens: tokens[0])
# One spelling carries a leading '!'; fold it into the plain city name
shops['city'] = shops['city'].replace('!Якутск', 'Якутск')
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])

# Attach the city of each shop to both the train and the test rows
shop_city = shops[['shop_id', 'city']]
sales = sales.merge(shop_city, on='shop_id')
test = test.merge(shop_city, on='shop_id')

Add item_category_id to train and test

In [None]:
# Both frames were loaded with index_col=[0]; copy that index into a named
# column so it survives the merges below.
item_categories['category_name'] = item_categories.index
items['name'] = items.index
items = items.merge(item_categories, on='item_category_id')

# Attach the category id of each item to both the train and the test rows
item_to_category = items[['item_category_id', 'item_id']]
sales = sales.merge(item_to_category, on='item_id')
test = test.merge(item_to_category, on='item_id')

Note that the test dataset contains 42 unique shop_ids and 5100 unique item_ids, and their product equals the length of the test dataset (214200). That means the test dataset contains every possible pair of those shops and items. There are also 363 new unique items which are not in the train dataset.

There are also several shops which have different names in the dataset, but I think they are the same:
<br>
'!Якутск Орджоникидзе, 56 фран' == 'Якутск Орджоникидзе, 56'
<br>
'!Якутск ТЦ "Центральный" фран' == 'Якутск ТЦ "Центральный"'
<br>
'Жуковский ул. Чкалова 39м²' == 'Жуковский ул. Чкалова 39м?'
<br>
So, merge their ids in the train dataset (and in the test dataset, in case it contains any of them).

In [None]:
# Shops considered duplicates of each other: (id to replace, id to keep).
duplicate_shops = [(0, 57), (1, 58), (11, 10)]

for duplicate_id, kept_id in duplicate_shops:
    sales.loc[sales.shop_id == duplicate_id, 'shop_id'] = kept_id
    # Use .loc with a boolean mask here as well: `test[mask].shop_id = value`
    # assigns to a temporary copy and leaves `test` unchanged.
    test.loc[test.shop_id == duplicate_id, 'shop_id'] = kept_id

Prepare features. Make many aggregations by shop-item-month, shop-month, item-month, city-month, category-month, city-item-month, city-category-month and shop-category-month.

In [None]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = []
for block_num in sales['date_block_num'].unique():
    # Row mask for this month, computed once and reused for shops and items
    in_block = sales['date_block_num'] == block_num
    cur_shops = sales.loc[in_block, 'shop_id'].unique()
    cur_items = sales.loc[in_block, 'item_id'].unique()
    # product() yields (shop, item, month) tuples, matching the order of index_cols
    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Shop-item-month sum of item_cnt_day, stored as `target`.
# sum() + rename() replaces the nested-dict form of agg(), which newer pandas
# versions reject ("nested renamer is not supported").
gb = (sales.groupby(index_cols, as_index=False)['item_cnt_day'].sum()
           .rename(columns={'item_cnt_day': 'target'}))
# Join it to the grid; pairs with no sales in a month get target 0
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Attach the city of each shop and the category of each item
all_data = pd.merge(all_data, shops[['city', 'shop_id']],
                on='shop_id', how='left')

all_data = pd.merge(all_data, items[['item_category_id', 'item_id']],
                on='item_id', how='left')

In [None]:
# Monthly sums of item_cnt_day at coarser levels: (group keys, output column).
# The columns are appended to all_data in this order.
monthly_aggregates = [
    (['shop_id', 'date_block_num'], 'target_shop'),
    (['item_id', 'date_block_num'], 'target_item'),
    (['city', 'date_block_num'], 'target_city'),
    (['item_category_id', 'date_block_num'], 'target_category'),
]

for group_keys, target_name in monthly_aggregates:
    # sum() + rename() replaces the nested-dict form of agg(), which newer
    # pandas versions reject ("nested renamer is not supported").
    gb = (sales.groupby(group_keys, as_index=False)['item_cnt_day'].sum()
               .rename(columns={'item_cnt_day': target_name}))
    # Left join keeps every grid row; rows without a matching group get 0
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

In [None]:
# Monthly sums of item_cnt_day for pairwise groupings: (group keys, output column).
# The columns are appended to all_data in this order.
pairwise_aggregates = [
    (['city', 'item_id', 'date_block_num'], 'target_city_item'),
    (['city', 'item_category_id', 'date_block_num'], 'target_city_category'),
    (['shop_id', 'item_category_id', 'date_block_num'], 'target_shop_category'),
]

for group_keys, target_name in pairwise_aggregates:
    # sum() + rename() replaces the nested-dict form of agg(), which newer
    # pandas versions reject ("nested renamer is not supported").
    gb = (sales.groupby(group_keys, as_index=False)['item_cnt_day'].sum()
               .rename(columns={'item_cnt_day': target_name}))
    # Left join keeps every grid row; rows without a matching group get 0
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
# The grid and the last aggregate are no longer needed
del grid, gb
gc.collect();

Concatenate the test dataset with 'all_data'

In [None]:
# Tag the test rows with month block 34 (the value the train/test split selects
# as the test period) and stack them under the training grid.
test = test.assign(date_block_num=34)
all_data = pd.concat([all_data, test])

add some lag features for all aggregations

In [None]:
index_cols = ['shop_id', 'item_id', 'date_block_num']
group_cols = ['city', 'item_category_id']

# Every column that is neither a key nor a grouping column gets a lagged name
cols_to_rename = list(all_data.columns.difference(index_cols).difference(group_cols))

shift_range = [1, 2, 3, 4, 5, 6, 12]

# (join keys, base column) for each aggregate whose lag is merged back,
# listed in the order the lag columns are appended to all_data.
lag_specs = [
    (['date_block_num', 'shop_id'], 'target_shop'),
    (['date_block_num', 'item_id'], 'target_item'),
    (['date_block_num', 'item_category_id'], 'target_category'),
    (['date_block_num', 'city'], 'target_city'),
    (['date_block_num', 'city', 'item_category_id'], 'target_city_category'),
    (['date_block_num', 'city', 'item_id'], 'target_city_item'),
    (['date_block_num', 'shop_id', 'item_category_id'], 'target_shop_category'),
]

for month_shift in tqdm_notebook(shift_range):
    # Copy of the data moved `month_shift` months forward, so that joining on
    # date_block_num brings in the values from `month_shift` months earlier.
    train_shift = all_data[index_cols + cols_to_rename + group_cols].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    lagged_names = {name: '{}_lag_{}'.format(name, month_shift) for name in cols_to_rename}
    train_shift = train_shift.rename(columns=lagged_names)

    # Shop-item level lag: joined on the full key, no de-duplication needed
    all_data = pd.merge(all_data, train_shift[index_cols + [f"target_lag_{month_shift}"]],
                        on=index_cols, how='left').fillna(0)

    # Coarser aggregates repeat across grid rows: reduce to one row per key first
    for join_keys, base_name in lag_specs:
        lagged = train_shift[join_keys + [f"{base_name}_lag_{month_shift}"]].drop_duplicates().dropna()
        all_data = pd.merge(all_data, lagged, on=join_keys, how='left').fillna(0)
del train_shift


In [None]:
# Keep only month blocks >= 12 (original note: don't use old data from year 2013)
all_data = all_data[all_data['date_block_num'] >= 12]

# List of all lagged features. Match the full '_lag_<shift>' suffix: comparing
# only the last character recognises lag 12 merely because '2' is also a shift,
# and would miss any shift whose last digit is not itself in shift_range.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]

# We will drop these at fitting stage: everything present at this point that is
# neither a lag feature nor a key column, plus date_block_num itself.
keep_cols = set(fit_cols) | set(index_cols)
to_drop_cols = [col for col in all_data.columns if col not in keep_cols] + ['date_block_num']

all_data = downcast_dtypes(all_data)
gc.collect();

add month, city and name of shop to dataset and make dummy city features

In [None]:
# Position of the month block within a 12-block cycle
all_data['month'] = all_data['date_block_num'] % 12

# One indicator column per city, joined back row-by-row on the index
city_dummies = pd.get_dummies(all_data['city'])
all_data = all_data.merge(city_dummies, left_index=True, right_index=True)

Make text features from shop names (stored in the `item_name_*` columns) and item category names

In [None]:
def text_proc(x):
    '''
        Normalise a text string for vectorisation.

        Lower-cases `x`, replaces every run of non-word characters and
        underscores with a single space, replaces each digit with a space,
        and strips leading/trailing whitespace. Runs of spaces left behind by
        removed digits are not collapsed.
    '''
    # Raw-string patterns: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'[\W_]+', u' ', x.lower(), flags=re.UNICODE)
    text = re.sub(r'\d', u' ', text, flags=re.UNICODE).strip()
    return text

In [None]:
# Attach the shop name to every row (default inner join on shop_id)
all_data = all_data.merge(shops[['shop_id', 'shop_name']], on='shop_id')

In [None]:
# Shop names (the generated columns keep the existing `item_name_tfidf_` prefix)
all_data.shop_name = all_data.shop_name.apply(text_proc)

feature_cnt = 30
tfidf = TfidfVectorizer(max_features=feature_cnt)
txtFeatures = pd.DataFrame(tfidf.fit_transform(all_data['shop_name']).toarray())
cols = txtFeatures.columns
# max_features is an upper bound on the vocabulary size, so iterate over the
# columns actually produced instead of assuming there are exactly feature_cnt.
for i, col in enumerate(cols):
    # txtFeatures rows are in all_data row order; assign by position so the
    # result does not depend on all_data having a 0..n-1 index.
    all_data['item_name_tfidf_' + str(i)] = txtFeatures[col].values
all_data.head()

In [None]:
# item category names
all_data = pd.merge(all_data, item_categories[['item_category_id', 'category_name']], on='item_category_id', how='left')

all_data.category_name = all_data.category_name.apply(text_proc)
feature_cnt = 15
tfidf = TfidfVectorizer(max_features=feature_cnt)
txtFeatures = pd.DataFrame(tfidf.fit_transform(all_data['category_name']).toarray())
cols = txtFeatures.columns
# max_features is an upper bound on the vocabulary size, so iterate over the
# columns actually produced instead of assuming there are exactly feature_cnt.
for i, col in enumerate(cols):
    # txtFeatures rows are in all_data row order; assign by position so the
    # result does not depend on all_data having a 0..n-1 index.
    all_data['category_name_tfidf_' + str(i)] = txtFeatures[col].values
all_data.head()

Add length and word-count features for the shop names (stored in the `item_name_*` columns) and the category names

In [None]:
# (source text column, prefix of the generated columns).
# Note that the `item_name_*` features are computed from `shop_name`.
for source_col, prefix in (('shop_name', 'item_name'), ('category_name', 'category_name')):
    # Character count of the text
    all_data[prefix + '_len'] = all_data[source_col].map(len)
    # Number of pieces obtained by splitting on single spaces
    all_data[prefix + '_wc'] = all_data[source_col].map(lambda text: len(str(text).split(' ')))

In [None]:
# Kernel density estimate of the `target` column over all rows of all_data
target_values = all_data['target']
sns.kdeplot(target_values)
plt.show()

Plot several randomly chosen time series

In [None]:
plt.figure(figsize=(12, 6))
# Every distinct (item, shop) combination present in all_data
pairs = all_data[['item_id', 'shop_id']].drop_duplicates()
for _draw in range(10):
    # Pick one pair at random and overlay its target values on the same axes
    rand = np.random.randint(0, len(pairs))
    picked = pairs.iloc[rand]
    same_pair = (all_data.shop_id == picked.shop_id) & (all_data.item_id == picked.item_id)
    all_data[same_pair].target.plot()

In [None]:
# Persist the assembled feature table (without the row index)
all_data.to_csv(path_or_buf='dataset.csv', index=False)

Train/test split

In [None]:
# The raw text columns are not model inputs; drop them together with the rest
to_drop_cols.extend(['shop_name', 'category_name'])

In [None]:
dates = all_data['date_block_num']

# Month blocks below 33 are used for training, 33 for validation, 34 for test
dates_train = dates[dates < 33]
dates_val = dates[dates == 33]
dates_test = dates[dates == 34]

X_train = all_data.loc[dates < 33].drop(to_drop_cols, axis=1)
X_val = all_data.loc[dates == 33].drop(to_drop_cols, axis=1)
X_test = all_data.loc[dates == 34].drop(to_drop_cols, axis=1)

# The scaler returns new arrays and does not modify its input, so the results
# must be assigned back; otherwise the models are fitted on unscaled features.
# The ranges are learned from the training rows only and reused for val/test.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

y_train = all_data.loc[dates < 33, 'target'].values
y_val = all_data.loc[dates == 33, 'target'].values

XGboost

In [None]:
ts = time.time()

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42)

# Training stops early when the metric on the last eval_set entry (validation)
# has not improved for 10 rounds.
# NOTE(review): newer xgboost releases may expect eval_metric and
# early_stopping_rounds on the constructor instead of fit() -- confirm against
# the installed version.
model.fit(
    X_train,
    y_train,
    eval_metric="rmse",
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=True,
    early_stopping_rounds = 10)

# A bare expression in the middle of a cell is not displayed, so print the
# elapsed time explicitly.
print('XGBoost training time: {:.1f} s'.format(time.time() - ts))

xgb_val_pred = model.predict(X_val)
xgb_test_pred = model.predict(X_test)

LGBM

In [None]:
lgbmodel=LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

# NOTE(review): newer lightgbm releases may expect early stopping to be passed
# as a callback instead of a fit() argument -- confirm against the installed version.
lgbmodel.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], 
             early_stopping_rounds=10, eval_metric="l2_root")

# Predict with the LightGBM model itself; `model` is the XGBoost regressor and
# would only duplicate the xgb predictions in the ensemble.
lgb_val_pred = lgbmodel.predict(X_val)
lgb_test_pred = lgbmodel.predict(X_test)

Random Forest

In [None]:
# Fit the forest on the training rows (fit() returns the estimator itself)
rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1).fit(X_train, y_train)

# Predictions for the validation rows, then for the test rows
rf_val_pred, rf_test_pred = (rf_model.predict(features) for features in (X_val, X_test))

Linear Model

In [None]:
# Fit ordinary least squares on the training rows (fit() returns the estimator itself)
lr_model = LinearRegression(n_jobs=-1).fit(X_train, y_train)

# Predictions for the validation rows, then for the test rows
lr_val_pred, lr_test_pred = (lr_model.predict(features) for features in (X_val, X_test))

KNN Model

In [None]:
# Fit the 9-neighbour regressor on the training rows (fit() returns the estimator itself)
knn_model = KNeighborsRegressor(n_neighbors=9, leaf_size=13, n_jobs=-1).fit(X_train, y_train)

# Predictions for the validation rows, then for the test rows
knn_val_pred, knn_test_pred = (knn_model.predict(features) for features in (X_val, X_test))

Ensembling of models

In [None]:
# Level-1 features: one column per base model, built from the predictions on
# the validation month ...
first_level = pd.DataFrame(lgb_val_pred, columns=['lgbm'])
# ... and the same columns, in the same order, for the test month.
first_level_test = pd.DataFrame(lgb_test_pred, columns=['lgbm'])

base_predictions = [
    ('xgbm', xgb_val_pred, xgb_test_pred),
    ('random_forest', rf_val_pred, rf_test_pred),
    ('linear_regression', lr_val_pred, lr_test_pred),
    ('knn', knn_val_pred, knn_test_pred),
]
for model_name, val_pred, test_pred in base_predictions:
    first_level[model_name] = val_pred
    first_level_test[model_name] = test_pred

# Meta model: a linear blend of the base predictions, fitted against the
# validation targets.
meta_model = LinearRegression(n_jobs=-1)
meta_model.fit(first_level, y_val)

ensemble_pred = meta_model.predict(first_level)
final_predictions = meta_model.predict(first_level_test)

# Submission: IDs taken from the `ID` column of `test`, predictions clipped to [0, 20]
prediction_df = test[['ID']].copy()
prediction_df['item_cnt_month'] = final_predictions.clip(0., 20.)
prediction_df.to_csv('submission.csv', index=False)
prediction_df.head(10)