# Import packages and load dataframes

In [1]:
import pandas as pd
import os
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
import regex
from itertools import product
import gc
from tqdm.notebook import tqdm as tqdm_notebook
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
DATA_FOLDER = 'data/'


def _read_table(filename):
    """Read one CSV file located in DATA_FOLDER into a dataframe."""
    return pd.read_csv(os.path.join(DATA_FOLDER, filename))


# Transactions plus the three lookup tables that describe shops, items and categories
sales = _read_table('sales_train.csv')
shops = _read_table('shops.csv')
items = _read_table('items.csv')
item_cats = _read_table('item_categories.csv')

# Define some helper functions

In [3]:
def downcast_dtypes(df):
    """Shrink 64-bit numeric columns of a dataframe to their 32-bit counterparts.

    `float64` columns become `float32` and `int64` columns become `int32`.
    Columns of any other dtype are left untouched.

    Args:
        df (pd.DataFrame): The dataframe to downcast. It is modified in place.

    Returns:
        The same dataframe object, with the narrowed columns.
    """
    narrow_dtype = {"float64": np.float32, "int64": np.int32}

    # Pick the columns for every wide dtype up front, before anything is cast
    selected = {
        wide: [name for name in df if df[name].dtype == wide]
        for wide in narrow_dtype
    }

    for wide, narrow in narrow_dtype.items():
        cols = selected[wide]
        df[cols] = df[cols].astype(narrow)

    return df

# Feature Extraction

## Create the base training dataframe

In [4]:
# Key columns of the grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# One sub-grid per month: every shop seen that month crossed with every item seen that month
grid = []
for block_num in sales['date_block_num'].unique():
    in_month = sales['date_block_num'] == block_num
    cur_shops = sales.loc[in_month, 'shop_id'].unique()
    cur_items = sales.loc[in_month, 'item_id'].unique()
    combos = list(product(cur_shops, cur_items, [block_num]))
    grid.append(np.array(combos, dtype='int32'))

# Stack the monthly sub-grids into a single dataframe
all_data = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

## Aggregate sales and revenue by month, item id and shop id.

Add item revenue as a feature to the transactions dataframe.

In [5]:
# Per-row revenue: item_cnt_day × item_price
sales['item_revenue'] = sales['item_cnt_day'].mul(sales['item_price'])

Aggregate sales.

In [6]:
# (grouping keys, name given to the summed `item_cnt_day` column)
sales_aggregates = [
    (index_cols, 'target'),                           # shop-item-month
    (['shop_id', 'date_block_num'], 'target_shop'),   # shop-month
    (['item_id', 'date_block_num'], 'target_item'),   # item-month
]

for group_keys, agg_name in sales_aggregates:
    # Sum the daily counts within each group and give the result its feature name
    gb = sales.groupby(group_keys, as_index=False).agg({'item_cnt_day': 'sum'})
    gb = gb.rename(columns={'item_cnt_day': agg_name})
    # Attach to the grid; combinations without any sales get 0
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
gc.collect();

Aggregate revenue.

In [7]:
# (grouping keys, name given to the summed `item_revenue` column)
revenue_aggregates = [
    (index_cols, 'revenue'),                           # shop-item-month
    (['shop_id', 'date_block_num'], 'revenue_shop'),   # shop-month
    (['item_id', 'date_block_num'], 'revenue_item'),   # item-month
]

for group_keys, agg_name in revenue_aggregates:
    # Sum the revenue within each group and give the result its feature name
    gb = sales.groupby(group_keys, as_index=False).agg({'item_revenue': 'sum'})
    gb = gb.rename(columns={'item_revenue': agg_name})
    # Attach to the grid; combinations without any sales get 0
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
gc.collect();

## Normalize numerical features

We choose to normalize using a min-max scaler.

In [8]:
# Raw aggregate columns to scale. Columns that already carry the `_normalized` suffix are
# skipped, so re-running this cell does not normalize (and then drop) its own output.
num_cols = [
    col for col in all_data.columns
    if col.startswith(('target', 'revenue')) and not col.endswith('_normalized')
]

# Min-max scale every column into a new `<col>_normalized` column
for col in num_cols:
    col_min = all_data[col].min()
    col_max = all_data[col].max()
    col_range = col_max - col_min
    # A constant column has zero range; divide by 1 so it maps to 0 instead of NaN
    all_data[col + '_normalized'] = (all_data[col] - col_min) / (col_range if col_range != 0 else 1)

# Drop unnormalized columns except for 'target'
cols_to_drop = [col for col in num_cols if col != 'target']
all_data = all_data.drop(columns=cols_to_drop)

## Add values from previous months as features

Create new features using lags from [1, 2, 3, 4, 5, 12] months ago.

In [9]:
# Columns to lag: everything except the grid keys and the raw target
cols_to_rename = list(all_data.columns.difference(index_cols + ['target']))

# Lags, in months
shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()

    # Move every row `month_shift` months forward: after the merge below, a row for
    # month m carries the values observed in month m - month_shift.
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = train_shift.rename(columns=lag_names)

    # Rows without a counterpart `month_shift` months earlier get 0
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12]

# List of all lagged features. Match the full `_lag_<n>` suffix: looking only at the last
# character finds `_lag_12` solely because "2" happens to be one of the other lags.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]
# We will drop these at fitting stage (kept in column order so the list is deterministic)
to_drop_cols = [
    col for col in all_data.columns
    if col not in fit_cols and col not in index_cols
] + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

  0%|          | 0/6 [00:00<?, ?it/s]

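Un-lagged normalized targets and revenues are not available as features in the test dataset. We drop the shop-item level columns (`target_normalized`, `revenue_normalized`) here; the remaining un-lagged columns are listed in `to_drop_cols` and are meant to be dropped at the fitting stage.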

In [10]:
# Remove the two un-lagged shop-item level normalized columns
all_data = all_data.drop(columns=['target_normalized', 'revenue_normalized'])

## Mean encode categorical features

In [11]:
# Value used for categories that never appear in the training folds of a split.
# NOTE(review): 0.3343 is presumably the global mean of `target` — confirm.
ENC_FILL_VALUE = 0.3343

kf = KFold(n_splits=5, shuffle=False)

cat_cols = ['shop_id', 'item_id', 'item_category_id']

# Out-of-fold mean encoding: each row's encoding is the mean target of its category
# computed only on the folds that do not contain that row.
for col in cat_cols:
    encoded = np.full(len(all_data), np.nan)

    for train_index, val_index in kf.split(all_data):
        # Only the category and target columns are needed for the fold statistics,
        # so the full frame is never copied.
        target_mean = all_data[[col, 'target']].iloc[train_index].groupby(col)['target'].mean()
        encoded[val_index] = all_data[col].iloc[val_index].map(target_mean).to_numpy()

    # Assign the finished column in one step. The chained
    # `all_data[...].fillna(..., inplace=True)` form is not guaranteed to write back
    # to the frame (it does not under pandas Copy-on-Write).
    all_data[col + '_enc'] = np.where(np.isnan(encoded), ENC_FILL_VALUE, encoded)

## Extract text-based features

### Stem the text

Define a stemmer that can handle both Russian and English text using nltk's Snowball Stemmer.

In [12]:
# One Snowball stemmer per language; ignore_stopwords=True leaves stop words unstemmed
en_stemmer = SnowballStemmer('english', ignore_stopwords=True)
ru_stemmer = SnowballStemmer('russian', ignore_stopwords=True)

# Script detectors used to pick the stemmer for a word. Raw strings keep `\p` from being
# read as an (invalid) Python string escape.
cyr_regex = regex.compile(r'\p{Cyrillic}+', regex.UNICODE)
lat_regex = regex.compile(r'\p{Latin}+', regex.UNICODE)

# Stop word lists shipped with nltk
en_stopwords = stopwords.words("english")
ru_stopwords = stopwords.words("russian")

In [13]:
def clean_text(text):
    r"""Remove URLs, apostrophes, punctuation, symbols, numbers and unwanted unicode
    characters from a string and return it in lowercase.

    Every run of removed characters is replaced by a single space, and leading and
    trailing whitespace is stripped, so the remaining words are separated by single spaces.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned, lowercased text.

    For more information on the unicode categories used in the regex expression see here:
    https://www.regular-expressions.info/unicode.html#category

    >>> clean_text("!$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ Can't, - Trademark™ ...「（Punctuation）」42.32 ?")
    'cant trademark punctuation'

    """
    # remove URLs (anything from "http" up to the next whitespace)
    text = re.sub(r"http\S+", "", text)
    # remove apostrophes so that contractions stay one word ("can't" -> "cant")
    text = text.replace("'", "")

    # Unicode categories to strip: other/control (C), marks (M), punctuation (P),
    # symbols (S), separators (Z) and numbers (N).
    # Raw string: `\p` is not a valid Python string escape. The `|` separators of the
    # previous pattern were literal characters inside the class (already covered by
    # \p{S}), so they are omitted.
    remove = regex.compile(r'[\p{C}\p{M}\p{P}\p{S}\p{Z}\p{N}]+', regex.UNICODE)
    text = remove.sub(" ", text).strip()

    # make lowercase
    return text.lower()

Apply `clean_text` to the stopword lists so that they are in the same form as the cleaned words they are compared against in the `preprocess_text` function below.

In [14]:
# Run each stop word through the same cleaning as the text it will be matched against
cleaned_en_stopwords = list(map(clean_text, en_stopwords))
cleaned_ru_stopwords = list(map(clean_text, ru_stopwords))

In [15]:
def preprocess_text(text):
    """Identify the words written in Cyrillic and Latin characters in a string,
    remove stop words and apply a Russian or English stemmer, respectively.

    Words containing Cyrillic characters are handled as Russian; otherwise words
    containing Latin characters are handled as English. Any other word is kept as is.

    Args:
        text (str): The string whose Cyrillic and Latin text will be stemmed.
            None and float NaN (missing values) are accepted.

    Returns:
        str: The stemmed words joined by single spaces. An empty string is returned
        for missing input, so the return type is always `str`.
    """
    # Missing values have nothing to stem; NaN is the only float that differs from itself
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stemmed_words = []
    # clean_text separates the remaining words with single spaces, so a plain split suffices
    for word in clean_text(text).split():
        if cyr_regex.search(word):
            # drop stopwords from the sentence
            if word in cleaned_ru_stopwords:
                continue
            word = ru_stemmer.stem(word)
        elif lat_regex.search(word):
            # drop stopwords from the sentence
            if word in cleaned_en_stopwords:
                continue
            word = en_stemmer.stem(word)
        stemmed_words.append(word)

    return ' '.join(stemmed_words)

Demonstrate function on sample text from the dataset.

In [16]:
# Sample string mixing a Cyrillic word, Latin words and punctuation
text = '(Кино) - Blu-Ray'

preprocess_text(text)

'кин blu ray'

Apply stemmer to columns containing text.

In [17]:
# (dataframe, raw text column, column receiving the preprocessed text)
text_columns = [
    (shops, 'shop_name', 'preprocessed_shop_name'),
    (items, 'item_name', 'preprocessed_item_name'),
    (item_cats, 'item_category_name', 'preprocessed_item_category_name'),
]

for frame, raw_col, stemmed_col in text_columns:
    frame[stemmed_col] = frame[raw_col].map(preprocess_text)

### Vectorize using TF-IDF

In [18]:
# min_df=3: ignore terms that appear in fewer than 3 documents
vectorizer = TfidfVectorizer(min_df=3)

In [19]:
def _tfidf_frame(texts, prefix):
    """Fit the shared vectorizer on `texts` and return the TF-IDF matrix as a sparse
    dataframe whose columns are named `prefix` + the term's column number."""
    matrix = vectorizer.fit_transform(texts.values)
    frame = pd.DataFrame.sparse.from_spmatrix(matrix)
    frame.columns = [prefix + str(col) for col in frame.columns]
    return frame


# Shop names
tfidf_shop_names = _tfidf_frame(shops['preprocessed_shop_name'], 'vect_shop_name_')
shops = shops.join(tfidf_shop_names)

# Item names
tfidf_item_names = _tfidf_frame(items['preprocessed_item_name'], 'vect_item_name_')
items = items.join(tfidf_item_names)

# Item category names
tfidf_item_cat_names = _tfidf_frame(item_cats['preprocessed_item_category_name'], 'vect_item_category_name_')
item_cats = item_cats.join(tfidf_item_cat_names)

### Join TFIDF-encoded item/item category/shop names onto the training dataframe

In [20]:
# `DataFrame.join(other, on=key)` matches `key` against the *index* of `other`. Each
# lookup table is therefore indexed by its id column first, instead of relying on the
# id being equal to the row position in the CSV file.
# NOTE(review): assumes `shops` has a `shop_id` column and `item_cats` an
# `item_category_id` column — confirm against the CSV headers.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt; joining the
# item-name TF-IDF columns onto every row may be too expensive as is — confirm and
# consider reducing the number of TF-IDF columns first.
vect_shop_cols = [col for col in shops.columns if col.startswith('vect')]
all_data = all_data.join(shops.set_index('shop_id')[vect_shop_cols], on='shop_id', how='left')

vect_item_cols = [col for col in items.columns if col.startswith('vect')]
all_data = all_data.join(items.set_index('item_id')[vect_item_cols], on='item_id', how='left')

vect_item_cat_cols = [col for col in item_cats.columns if col.startswith('vect')]
all_data = all_data.join(
    item_cats.set_index('item_category_id')[vect_item_cat_cols],
    on='item_category_id',
    how='left',
)

KeyboardInterrupt: 

# Display the resulting dataframe

In [None]:
# Preview the first five rows of the assembled feature dataframe
all_data.head(5)