# Import packages and load dataframes

In [1]:
import pandas as pd
import os
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
import regex
from itertools import product
import gc
from tqdm.notebook import tqdm as tqdm_notebook
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import mlflow

In [2]:
DATA_FOLDER = 'data/'


def _load_csv(filename):
    """Read the CSV file called ``filename`` inside DATA_FOLDER into a dataframe."""
    return pd.read_csv(os.path.join(DATA_FOLDER, filename))


# One dataframe per input file
sales = _load_csv('sales_train.csv')
shops = _load_csv('shops.csv')
items = _load_csv('items.csv')
item_cats = _load_csv('item_categories.csv')
test = _load_csv('test.csv')

# Define some helper functions

In [3]:
def downcast_dtypes(df):
    """Shrink the 64-bit numeric columns of ``df`` to 32 bits.

    Columns whose dtype is ``float64`` are cast to ``float32`` and columns
    whose dtype is ``int64`` are cast to ``int32``; every other column is
    left as it is.

    Args:
        df: The dataframe to convert. It is modified in place.

    Returns:
        The same dataframe object that was passed in.
    """
    # Wide dtype name -> narrow type it is replaced with
    narrower = {"float64": np.float32, "int64": np.int32}

    for wide_name, narrow_type in narrower.items():
        wide_cols = [name for name in df if df[name].dtype == wide_name]
        df[wide_cols] = df[wide_cols].astype(narrow_type)

    return df

# Feature Extraction

## Create the base training dataframe

In [4]:
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
all_data = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

## Aggregate sales and revenue by month, item id and shop id.

Add item revenue as a feature to the transactions dataframe.

In [5]:
# Revenue of a sales row = units sold in the row x unit price
sales['item_revenue'] = sales['item_cnt_day'] * sales['item_price']

Aggregate sales.

In [6]:
# Summed item counts at three granularities, each left-joined onto the grid.
# Every entry is (grouping keys, name given to the summed column).
count_aggregates = [
    (index_cols, 'target'),                          # shop-item-month
    (['shop_id', 'date_block_num'], 'target_shop'),  # shop-month
    (['item_id', 'date_block_num'], 'target_item'),  # item-month
]

for group_keys, feature_name in count_aggregates:
    gb = (
        sales.groupby(group_keys, as_index=False)
        .agg({'item_cnt_day': 'sum'})
        .rename(columns={'item_cnt_day': feature_name})
    )
    # Missing values left by the join are replaced with 0
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
gc.collect();

Aggregate revenue.

In [7]:
# Summed revenue at three granularities, each left-joined onto the grid.
# Every entry is (grouping keys, name given to the summed column).
revenue_aggregates = [
    (index_cols, 'revenue'),                          # shop-item-month
    (['shop_id', 'date_block_num'], 'revenue_shop'),  # shop-month
    (['item_id', 'date_block_num'], 'revenue_item'),  # item-month
]

for group_keys, feature_name in revenue_aggregates:
    gb = (
        sales.groupby(group_keys, as_index=False)
        .agg({'item_revenue': 'sum'})
        .rename(columns={'item_revenue': feature_name})
    )
    # Missing values left by the join are replaced with 0
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
gc.collect();

## Normalize numerical features

We choose to normalize using a min-max scaler.

In [8]:
# Columns to scale: every column whose name starts with 'target' or 'revenue'
num_cols = [name for name in all_data.columns if name.startswith(('target', 'revenue'))]

# Min-max scaling: (value - column minimum) / (column maximum - column minimum),
# stored next to the raw column under '<name>_normalized'
for col in num_cols:
    lowest, highest = all_data[col].min(), all_data[col].max()
    all_data[col + '_normalized'] = all_data[col].sub(lowest).div(highest - lowest)

Prepare to drop unnormalized columns except for 'target'.

In [9]:
# Every raw numerical column other than the label 'target'
cols_to_drop = [name for name in num_cols if name != 'target']

## Add values from previous months as features

Create new features using lags from [1, 2, 3, 4, 5, 12] months ago.

In [10]:
# Normalized columns whose values from earlier months become lag features
cols_to_rename = [name for name in all_data.columns if name.endswith('normalized')]

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    # New names for the shifted copies, e.g. '<col>_lag_3'
    lag_names = {name: '{}_lag_{}'.format(name, month_shift) for name in cols_to_rename}

    # Moving date_block_num forward by month_shift makes a row's values join
    # onto the row of the same shop/item that lies month_shift months later
    train_shift = (
        all_data[index_cols + cols_to_rename]
        .assign(date_block_num=all_data['date_block_num'] + month_shift)
        .rename(columns=lag_names)
    )

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
all_data = all_data.loc[all_data['date_block_num'].ge(12)]

# Attach the category of each item
item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()
all_data = downcast_dtypes(
    all_data.merge(item_category_mapping, how='left', on='item_id')
)
gc.collect();

  0%|          | 0/6 [00:00<?, ?it/s]

Normalized targets and revenues without a lag are not available as features in the test dataset, so we will drop them during training.

In [11]:
# Also mark the un-lagged '<name>_normalized' columns for dropping
cols_to_drop = cols_to_drop + ['{}_normalized'.format(name) for name in num_cols]

## Extract text-based features

### Stem the text

Define a stemmer that can handle both Russian and English text using nltk's Snowball Stemmer.

In [12]:
# Snowball stemmers for English and Russian words
en_stemmer = SnowballStemmer('english', ignore_stopwords=True)
ru_stemmer = SnowballStemmer('russian', ignore_stopwords=True)

# Patterns matching a run of Cyrillic / Latin script characters.
# Raw strings are required: '\p' is not a valid Python string escape, so a
# plain literal triggers an invalid-escape-sequence warning. The compiled
# pattern text is unchanged.
cyr_regex = regex.compile(r'\p{Cyrillic}+', regex.UNICODE)
lat_regex = regex.compile(r'\p{Latin}+', regex.UNICODE)

# nltk stop word lists for both languages
en_stopwords = stopwords.words("english")
ru_stopwords = stopwords.words("russian")

In [13]:
def clean_text(text):
    r"""Strip URLs, apostrophes, punctuation, symbols and digits; return lowercase text.

    Args:
        text (str): The text to clean.

    Returns:
        str: ``text`` with URLs and apostrophes deleted, every run of
        characters from the Unicode categories C, M, P, S, Z and N replaced
        by a single space, leading/trailing whitespace stripped, and the
        result lowercased.

    For more information on the unicode categories used in the regex expression see here:
    https://www.regular-expressions.info/unicode.html#category

    >>> clean_text("!$%&'()*+,-./:;<=>?@[\\]^_`{|}~ Can't, - Trademark™ ...「（Punctuation）」42.32 ?")
    'cant trademark punctuation'

    """
    # remove URLs
    text = re.sub(r"http\S+", "", text)
    # remove apostrophes (deleted rather than replaced by a space, so "can't" -> "cant")
    text = text.replace("'", "")

    # Unicode categories to strip from the string.
    # Raw string: '\p' is not a valid Python string escape and warns in a
    # plain literal. The '|' characters inside the character class are literal
    # characters, not alternation; they are kept so the pattern is unchanged.
    remove = regex.compile(r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}|\p{N}]+', regex.UNICODE)
    text = remove.sub(" ", text).strip()

    # make lowercase
    text = text.lower()

    return text

Apply clean_text to stopwords to make removal easier in the preprocess_text function below.

In [14]:
# Stop words passed through the same cleaning as the text they are compared with
cleaned_en_stopwords = list(map(clean_text, en_stopwords))
cleaned_ru_stopwords = list(map(clean_text, ru_stopwords))

In [15]:
def preprocess_text(text):
    """Clean a string, drop its stop words and stem the remaining words.

    The text is first passed through ``clean_text``. Each resulting word that
    contains Cyrillic characters is stemmed with the Russian stemmer;
    otherwise, a word that contains Latin characters is stemmed with the
    English stemmer. Words found in the matching cleaned stop word list are
    dropped. Words with neither script are kept unchanged.

    Args:
        text (str): The string whose Cyrillic and Latin text will be stemmed.
            ``None`` and other non-string values (e.g. NaN from a pandas
            column) are treated as empty text.

    Returns:
        str: The stemmed words joined by single spaces; an empty string when
        there is nothing to stem.
    """
    # Always return a str: missing values have no text, and the cleaning step
    # only works on strings.
    if not isinstance(text, str):
        return ''

    stemmed_words = []
    # clean_text leaves single spaces between words, so a plain split suffices
    for word in clean_text(text).split():
        if cyr_regex.search(word):
            # drop stopwords from the sentence
            if word in cleaned_ru_stopwords:
                continue
            stemmed_words.append(ru_stemmer.stem(word))
        elif lat_regex.search(word):
            # drop stopwords from the sentence
            if word in cleaned_en_stopwords:
                continue
            stemmed_words.append(en_stemmer.stem(word))
        else:
            stemmed_words.append(word)

    return ' '.join(stemmed_words)

Demonstrate function on sample text from the dataset.

In [16]:
# Sample with a Cyrillic word, Latin words and punctuation
text = '(Кино) - Blu-Ray'

# Last expression of the cell, so the stemmed string is displayed
preprocess_text(text)

'кин blu ray'

Apply stemmer to columns containing text.

In [17]:
# Store a preprocessed copy of each name column under 'preprocessed_<column>'
for frame, name_col in (
    (shops, 'shop_name'),
    (items, 'item_name'),
    (item_cats, 'item_category_name'),
):
    frame['preprocessed_' + name_col] = frame[name_col].map(preprocess_text)

### Vectorize using TF-IDF

In [18]:
# One vectorizer object, re-fitted separately on each text column below.
# min_df=3: per the scikit-learn docs, terms found in fewer than 3 documents are ignored.
vectorizer = TfidfVectorizer(min_df=3)

In [19]:
def _tfidf_columns(texts, prefix):
    """Fit the shared vectorizer on ``texts`` and return the TF-IDF matrix as a
    sparse dataframe whose column names are ``prefix`` + the matrix column number."""
    matrix = vectorizer.fit_transform(texts.values)
    frame = pd.DataFrame.sparse.from_spmatrix(matrix)
    frame.columns = [prefix + str(position) for position in frame.columns]
    return frame


# Shop names
tfidf_shop_names = _tfidf_columns(shops['preprocessed_shop_name'], 'vect_shop_name_')
shops = shops.join(tfidf_shop_names)

# Item names
tfidf_item_names = _tfidf_columns(items['preprocessed_item_name'], 'vect_item_name_')
items = items.join(tfidf_item_names)

# Item category names
tfidf_item_cat_names = _tfidf_columns(
    item_cats['preprocessed_item_category_name'], 'vect_item_category_name_'
)
item_cats = item_cats.join(tfidf_item_cat_names)

### Join TFIDF-encoded item/item category/shop names onto the training dataframe

In [20]:
# NOTE(review): the whole cell is commented out, so none of the 'vect_*'
# TF-IDF columns reach all_data. Reason not recorded here — confirm whether
# this is intentional, then either re-enable or delete the cell.
# NOTE(review): if re-enabled, DataFrame.join(..., on=<id column>) matches the
# id values against the other frame's index — verify that the index of
# shops/items/item_cats equals the corresponding id.

# vect_shop_cols = [col for col in shops.columns if col.startswith('vect')]
# all_data = all_data.join(shops[vect_shop_cols], on='shop_id', how='left')

# vect_item_cols = [col for col in items.columns if col.startswith('vect')]
# all_data = all_data.join(items[vect_item_cols], on='item_id', how='left')

# vect_item_cat_cols = [col for col in item_cats.columns if col.startswith('vect')]
# all_data = all_data.join(item_cats[vect_item_cat_cols], on='item_category_id', how='left')

## Mean encode categorical features

In [21]:
# Out-of-fold mean encoding: the encoding of a row is the mean target of its
# category computed on the other folds only, so a row's own target never
# contributes to its own feature.
kf = KFold(n_splits=5, shuffle=False)

cat_cols = ['shop_id', 'item_id', 'item_category_id']

# Value used when a category of the validation fold does not occur in the
# training folds.
# NOTE(review): 0.3343 looks like a precomputed global target mean — confirm.
UNSEEN_CATEGORY_ENC = 0.3343

for col in cat_cols:
    encoded = pd.Series(np.nan, index=all_data.index, dtype=np.float64)

    for train_index, val_index in kf.split(all_data):
        target_mean = all_data.iloc[train_index].groupby(col)['target'].mean()
        # Write only the encoded values of this fold instead of copying and
        # re-assigning every column of the fold's rows
        encoded.iloc[val_index] = all_data[col].iloc[val_index].map(target_mean).to_numpy()

    # Assign the filled column back explicitly: calling fillna(inplace=True)
    # on all_data[...] acts on a temporary object and is not guaranteed to
    # update all_data (it does not under pandas Copy-on-Write).
    all_data[col + '_enc'] = encoded.fillna(UNSEEN_CATEGORY_ENC)

all_data = downcast_dtypes(all_data)

Prepare to drop the non-encoded categorical variables during training.

In [22]:
# The raw id columns are dropped as well; their '_enc' versions stay
cols_to_drop = [*cols_to_drop, *cat_cols]

# Display the resulting dataframe

All columns:

In [23]:
# Preview the first rows with every column, including those dropped before training
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,revenue,revenue_shop,revenue_item,target_normalized,...,target_normalized_lag_12,target_shop_normalized_lag_12,target_item_normalized_lag_12,revenue_normalized_lag_12,revenue_shop_normalized_lag_12,revenue_item_normalized_lag_12,item_category_id,shop_id_enc,item_id_enc,item_category_id_enc
0,54,10297,12,4.0,8198.0,23.0,2996.0,5272898.5,17227.0,0.011429,...,0.0,0.0,0.0,0.0,0.0,0.0,37,0.828419,0.074755,0.162759
1,54,10296,12,3.0,8198.0,17.0,4797.0,5272898.5,26684.0,0.010989,...,0.0,0.0,0.0,0.0,0.0,0.0,38,0.828419,0.09863,0.169217
2,54,10298,12,14.0,8198.0,182.0,5586.0,5272898.5,71948.0,0.015824,...,0.0,0.0,0.0,0.0,0.0,0.0,40,0.828419,1.431898,0.241341
3,54,10300,12,3.0,8198.0,26.0,2097.0,5272898.5,18174.0,0.010989,...,0.0,0.0,0.0,0.0,0.0,0.0,37,0.828419,0.213953,0.162759
4,54,10284,12,1.0,8198.0,3.0,299.0,5272898.5,897.0,0.01011,...,0.0,0.0,0.0,0.0,0.0,0.0,57,0.828419,0.062338,0.092227


Columns to be passed into the model:

In [24]:
# drop() returns a new frame here; all_data itself keeps all of its columns
all_data.drop(cols_to_drop, axis=1).head()

Unnamed: 0,date_block_num,target,target_normalized_lag_1,target_shop_normalized_lag_1,target_item_normalized_lag_1,revenue_normalized_lag_1,revenue_shop_normalized_lag_1,revenue_item_normalized_lag_1,target_normalized_lag_2,target_shop_normalized_lag_2,...,revenue_item_normalized_lag_5,target_normalized_lag_12,target_shop_normalized_lag_12,target_item_normalized_lag_12,revenue_normalized_lag_12,revenue_shop_normalized_lag_12,revenue_item_normalized_lag_12,shop_id_enc,item_id_enc,item_category_id_enc
0,12,4.0,0.010989,0.616214,0.005088,0.005721,0.622419,0.001766,0.00967,0.488939,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.828419,0.074755,0.162759
1,12,3.0,0.00967,0.616214,0.003657,0.005365,0.622419,0.001921,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.828419,0.09863,0.169217
2,12,14.0,0.018901,0.616214,0.031084,0.006695,0.622419,0.004218,0.061978,0.488939,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.828419,1.431898,0.241341
3,12,3.0,0.01011,0.616214,0.006042,0.005476,0.622419,0.001887,0.023297,0.488939,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.828419,0.213953,0.162759
4,12,1.0,0.00967,0.616214,0.002067,0.005365,0.622419,0.001119,0.00967,0.488939,...,0.00116,0.0,0.0,0.0,0.0,0.0,0.0,0.828419,0.062338,0.092227


# Fit models on the training set

## Gradient boosted decision tree

In [25]:
# Turn on MLflow autologging before any model is trained
mlflow.autolog()

2021/07/01 14:51:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2021/07/01 14:51:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2021/07/01 14:51:10 INFO mlflow.pyspark.ml: No SparkSession detected. Autologging will log pyspark.ml models contained in the default allowlist. To specify a custom allowlist, initialize a SparkSession prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist file via the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf.


In [26]:
# Model inputs: every column except the label and the columns marked for dropping
excluded = set(cols_to_drop) | {'target'}
x_cols = [name for name in all_data.columns if name not in excluded]

# Clip target values into the [0, 20] range, as in the test set
label = all_data['target'].clip(0, 20)

xgtrain = xgb.DMatrix(data=all_data[x_cols], label=label)

In [27]:
# Empty dict: no booster parameter is set explicitly
param = {}
num_round = 10
# Only the training matrix is evaluated, under the name 'train'
evallist = [(xgtrain, 'train')]

with mlflow.start_run() as run:
    bst = xgb.train(
        params=param,
        dtrain=xgtrain,
        num_boost_round=num_round,
        evals=evallist,
    )

[0]	train-rmse:1.06063
[1]	train-rmse:0.98035
[2]	train-rmse:0.93424
[3]	train-rmse:0.90641
[4]	train-rmse:0.88960
[5]	train-rmse:0.87762
[6]	train-rmse:0.86981
[7]	train-rmse:0.86427
[8]	train-rmse:0.85936
[9]	train-rmse:0.85554




# Make predictions on the test set

## Add features to the test dataframe

In [28]:
# Work on a copy so the loaded `test` dataframe stays unchanged
test_feats = test.copy()

The test data belongs to month index 34, which we add as the value for 'date_block_num'.

In [29]:
# Scalar assignment: every test row gets the same month index
test_feats['date_block_num'] = 34

Add features using lags from [1, 2, 3, 4, 5, 12] months ago.

In [30]:
shift_range = [1, 2, 3, 4, 5, 12]

id_cols = ['shop_id', 'item_id']
# All normalized target and revenue columns
cols_to_rename = ['{}_normalized'.format(name) for name in num_cols]

for month_shift in tqdm_notebook(shift_range):
    # Rows of the month that lies month_shift months before month 34
    source_month = all_data['date_block_num'] == 34 - month_shift
    lagged_feats = all_data.loc[source_month, id_cols + cols_to_rename]

    # Join on shop/item, fill missing values with 0, then tag the freshly
    # added columns with this lag
    lag_names = {name: '{}_lag_{}'.format(name, month_shift) for name in cols_to_rename}
    test_feats = (
        pd.merge(test_feats, lagged_feats, on=id_cols, how='left')
        .fillna(0)
        .rename(columns=lag_names)
    )

  0%|          | 0/6 [00:00<?, ?it/s]

Add mean encodings of categorical features.

In [31]:
# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

test_feats = pd.merge(test_feats, item_category_mapping, how='left', on='item_id')

# Mean encodings
global_mean = all_data['target'].mean()
for col in cat_cols:
    gb = all_data.groupby(by=col)['target'].mean().rename(col + '_enc')
    test_feats = test_feats.join(gb, on=col, how='left').fillna(global_mean)

Downcast types to save memory.

In [32]:
# float64 -> float32 and int64 -> int32 (see downcast_dtypes)
test_feats = downcast_dtypes(test_feats)

Display the resulting dataframe, then set the columns to drop when making predictions.

In [33]:
# Preview the first rows of the test features, all columns included
test_feats.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,target_normalized_lag_1,target_shop_normalized_lag_1,target_item_normalized_lag_1,revenue_normalized_lag_1,revenue_shop_normalized_lag_1,revenue_item_normalized_lag_1,...,target_normalized_lag_12,target_shop_normalized_lag_12,target_item_normalized_lag_12,revenue_normalized_lag_12,revenue_shop_normalized_lag_12,revenue_item_normalized_lag_12,item_category_id,shop_id_enc,item_id_enc,item_category_id_enc
0,0,5,5037,34,0.00967,0.064526,0.003736,0.005365,0.070515,0.001902,...,0.01011,0.088608,0.006916,0.005777,0.111879,0.004687,19,0.207204,1.950845,0.600482
1,1,5,5320,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,55,0.207204,0.324582,0.224931
2,2,5,5233,34,0.01011,0.064526,0.005088,0.005555,0.070515,0.002179,...,0.0,0.0,0.0,0.0,0.0,0.0,19,0.207204,1.656863,0.600482
3,3,5,5232,34,0.00967,0.064526,0.003975,0.005365,0.070515,0.001812,...,0.0,0.0,0.0,0.0,0.0,0.0,23,0.207204,1.093023,0.61083
4,4,5,5268,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,20,0.207204,0.324582,1.837239


In [34]:
# NOTE(review): this rebinds cols_to_drop, replacing the list that was built up
# for the training columns. x_cols was derived from the old list, so re-running
# the cell that defines x_cols after this one would give a different result.
cols_to_drop = ['ID'] + cat_cols
# drop() returns a new frame; test_feats itself keeps the dropped columns
test_feats.drop(cols_to_drop, axis=1).head()

Unnamed: 0,date_block_num,target_normalized_lag_1,target_shop_normalized_lag_1,target_item_normalized_lag_1,revenue_normalized_lag_1,revenue_shop_normalized_lag_1,revenue_item_normalized_lag_1,target_normalized_lag_2,target_shop_normalized_lag_2,target_item_normalized_lag_2,...,revenue_item_normalized_lag_5,target_normalized_lag_12,target_shop_normalized_lag_12,target_item_normalized_lag_12,revenue_normalized_lag_12,revenue_shop_normalized_lag_12,revenue_item_normalized_lag_12,shop_id_enc,item_id_enc,item_category_id_enc
0,34,0.00967,0.064526,0.003736,0.005365,0.070515,0.001902,0.01011,0.066977,0.010494,...,0.004119,0.01011,0.088608,0.006916,0.005777,0.111879,0.004687,0.207204,1.950845,0.600482
1,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207204,0.324582,0.224931
2,34,0.01011,0.064526,0.005088,0.005555,0.070515,0.002179,0.010989,0.066977,0.008109,...,0.002749,0.0,0.0,0.0,0.0,0.0,0.0,0.207204,1.656863,0.600482
3,34,0.00967,0.064526,0.003975,0.005365,0.070515,0.001812,0.00967,0.066977,0.005565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207204,1.093023,0.61083
4,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207204,0.324582,1.837239


## Gradient boosted decision tree

Make predictions.

In [35]:
xgtest = xgb.DMatrix(test_feats.drop(cols_to_drop, axis=1))
# Clip predictions to [0, 20], the range the training label was clipped to.
# np.clip returns a new array and leaves its input untouched, so the result
# must be assigned back; otherwise xgpred stays unclipped.
xgpred = np.clip(bst.predict(xgtest), 0, 20)
# Last expression of the cell: display the clipped predictions
xgpred

array([0.5442384 , 0.25536716, 0.78171617, ..., 0.06386901, 0.03684777,
       0.07575982], dtype=float32)

## Write predictions to csv

In [36]:
# Clip to [0, 20] at write time, so the submitted values are in range whether
# or not xgpred was clipped earlier. Index the predictions by the ID column of
# the test rows rather than by row position; pandas raises if the lengths
# do not match.
submission = pd.Series(
    data=np.clip(xgpred, 0, 20),
    index=test_feats['ID'].to_numpy(),
    name='item_cnt_month',
)
submission.to_csv(os.path.join(DATA_FOLDER, 'submission.csv'), index_label='ID')