# Set up packages and dataframes

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.feature import CountVectorizer, IDF

import os
import numpy as np
from nltk.stem.snowball import SnowballStemmer
import re
import regex
from itertools import product
import pandas as pd

spark = SparkSession.builder.getOrCreate()
# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [2]:
DATA_FOLDER = 'data/'

transactions = spark.read.options(    
        header=True,  
        inferSchema=True
    ).csv(
        os.path.join(DATA_FOLDER, 'sales_train.csv'), 
    )

items = spark.read.options(    
        header=True,  
        inferSchema=True
    ).csv(
        os.path.join(DATA_FOLDER, 'items.csv'), 
    )

item_categories = spark.read.options(    
        header=True,  
        inferSchema=True
    ).csv(
        os.path.join(DATA_FOLDER, 'item_categories.csv'), 
    )

shops = spark.read.options(    
        header=True,  
        inferSchema=True
    ).csv(
        os.path.join(DATA_FOLDER, 'shops.csv'), 
    )

test = spark.read.options(    
        header=True,  
        inferSchema=True
    ).csv(
        os.path.join(DATA_FOLDER, 'test.csv'), 
    )

# EDA

## Look at dataframes

Print the top 10 rows and the total number of rows.

In [3]:
print('Total number of rows: {}'.format(transactions.count()))
transactions.show(10)

print('Total number of rows: {}'.format(items.count()))
items.show(10)

print('Total number of rows: {}'.format(item_categories.count()))
item_categories.show(10)

print('Total number of rows: {}'.format(shops.count()))
shops.show(10)

print('Total number of rows: {}'.format(test.count()))
test.show(10)

Total number of rows: 2935849
+----------+--------------+-------+-------+----------+------------+
|      date|date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|02.01.2013|             0|     59|  22154|     999.0|         1.0|
|03.01.2013|             0|     25|   2552|     899.0|         1.0|
|05.01.2013|             0|     25|   2552|     899.0|        -1.0|
|06.01.2013|             0|     25|   2554|   1709.05|         1.0|
|15.01.2013|             0|     25|   2555|    1099.0|         1.0|
|10.01.2013|             0|     25|   2564|     349.0|         1.0|
|02.01.2013|             0|     25|   2565|     549.0|         1.0|
|04.01.2013|             0|     25|   2572|     239.0|         1.0|
|11.01.2013|             0|     25|   2572|     299.0|         1.0|
|03.01.2013|             0|     25|   2573|     299.0|         3.0|
+----------+--------------+-------+-------+----------+------------+
only showing top 1

We see that in the training dataset, there are 2935849 transactions, 22170 items, 84 item categories and 60 shops.

# Feature Extraction

## Join all data onto the transactions dataframe 

Upon inspection, one sees that all features appearing in the training dataframes above can be joined onto the transactions dataframe using the appropriate ID.

In [4]:
transactions.createOrReplaceTempView('transactions')
items.createOrReplaceTempView('items')

transactions = spark.sql(('SELECT transactions.*, items.item_name, items.item_category_id '
                  ' FROM transactions '
                  ' LEFT JOIN items '
                  '  ON transactions.item_id = items.item_id '))

In [5]:
transactions.createOrReplaceTempView('transactions')
item_categories.createOrReplaceTempView('item_categories')

transactions = spark.sql(('SELECT transactions.*, item_categories.item_category_name '
                  ' FROM transactions '
                  ' LEFT JOIN item_categories '
                  '  ON transactions.item_category_id = item_categories.item_category_id '))

In [6]:
transactions.createOrReplaceTempView('transactions')
shops.createOrReplaceTempView('shops')

transactions = spark.sql(('SELECT transactions.*, shops.shop_name '
                  ' FROM transactions '
                  ' LEFT JOIN shops '
                  '  ON transactions.shop_id = shops.shop_id '))

## Extract the day, month and year from the date.

In [7]:
transactions = transactions.withColumn('date', F.to_date(transactions.date, format='dd.MM.yyyy'))

In [8]:
transactions = transactions.withColumn('day', F.dayofyear(transactions.date))
transactions = transactions.withColumn('month', F.month(transactions.date))
transactions = transactions.withColumn('year', F.year(transactions.date))

## Extract text-based features

### Stem the text

Define a stemmer that can handle both Russian and English text using nltk's Snowball Stemmer.

In [9]:
en_stemmer = SnowballStemmer('english')
ru_stemmer = SnowballStemmer('russian')

cyr_regex = regex.compile('\p{Cyrillic}+', regex.UNICODE)
lat_regex = regex.compile('\p{Latin}+', regex.UNICODE)

In [10]:
def clean_text(text):
    """ Removes punctuation from string, unwanted unicode characters, and numbers. Returns in lowercase.
    
    Args:
        text (str): The text to clean.
    
    Returns:
        The cleaned text after filtered by the regex expression and made lowercase.
    
    For more information on the unicode categories used in the regex expression see here:
    https://www.regular-expressions.info/unicode.html#category
    
    >>> clean_text("!$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ Can't, - Trademark™ ...「（Punctuation）」42.32 ?")
    cant trademark punctuation
    
    """
    # remove URLs
    text = re.sub(r"http\S+", "", text)
    # remove apostrophes 
    text = text.replace("'", "")
    
    # Define regex unicode Categories and strip from string
    remove = regex.compile('[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}|\p{N}]+', regex.UNICODE)
    text = remove.sub(" ", text).strip()
    
    # make lowercase
    text = text.lower()
    
    return text

def stemmer(text):
    """Identify the words written in Cyrillic and Latin characters in a string,
    and apply a Russian or English stemmer, respectively.
    
    Args:
        text(str): The string whose Cyrillic and Latin text will be stemmed.
    
    Returns:
        A stemmed version of the text.
    """
    if text is None:
        return []
    
    text = clean_text(text)

    words = re.split('\s', text)
    stemmed_word_list = []
    for word in words:
        ru = regex.search(cyr_regex, word)
        en = regex.search(lat_regex, word)
        if ru:
            stemmed_word = ru_stemmer.stem(word)
        elif en:
            stemmed_word = en_stemmer.stem(word)
        else:
            stemmed_word = word
        stemmed_word_list.append(stemmed_word)
    
    return stemmed_word_list

Demonstrate function on sample text from the dataset.

In [11]:
text = '(Кино) - Blu-Ray'

stemmer(text)

['кин', 'blu', 'ray']

Apply stemmer to columns containing text.

In [12]:
udf_stemmer = F.udf(stemmer, ArrayType(StringType(), True))

In [13]:
items = items.withColumn('stemmed_item_name', udf_stemmer(items.item_name))
item_categories = item_categories.withColumn('stemmed_item_category_name', udf_stemmer(item_categories.item_category_name))
shops = shops.withColumn('stemmed_shop_name', udf_stemmer(shops.shop_name))

### Vectorize using TF-IDF

In [14]:
cv = CountVectorizer()
idf = IDF()

In [15]:
cv.setInputCol('stemmed_item_name')
cv.setOutputCol('bow_item_name')
cvmodel = cv.fit(items)
items = cvmodel.transform(items)

idf.setInputCol('bow_item_name')
idf.setOutputCol('tfidf_item_name')
idfmodel = idf.fit(items)
items = idfmodel.transform(items)

In [16]:
cv.setInputCol('stemmed_item_category_name')
cv.setOutputCol('bow_item_category_name')
cvmodel = cv.fit(item_categories)
item_categories = cvmodel.transform(item_categories)

idf.setInputCol('bow_item_category_name')
idf.setOutputCol('tfidf_item_category_name')
idfmodel = idf.fit(item_categories)
item_categories = idfmodel.transform(item_categories)

In [17]:
cv.setInputCol('stemmed_shop_name')
cv.setOutputCol('bow_shop_name')
cvmodel = cv.fit(shops)
shops = cvmodel.transform(shops)

idf.setInputCol('bow_shop_name')
idf.setOutputCol('tfidf_shop_name')
idfmodel = idf.fit(shops)
shops = idfmodel.transform(shops)

## Create the training dataframe

Create a base training dataframe of total monthly sales aggregated by month, item id and shop id.

In [21]:
transactions.createOrReplaceTempView('transactions')

index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
train = []
date_block_nums = transactions.select('date_block_num').distinct().toPandas()['date_block_num'].unique()
for block_num in date_block_nums:
    cur_shops = spark.sql("""SELECT DISTINCT shop_id 
                                FROM transactions
                                WHERE date_block_num = {block_num}""".format(block_num = block_num)
                            ).toPandas()['shop_id'].unique()
    cur_items = spark.sql("""SELECT DISTINCT item_id 
                                FROM transactions
                                WHERE date_block_num = {block_num}""".format(block_num = block_num)
                            ).toPandas()['item_id'].unique()
    train.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
train = pd.DataFrame(np.vstack(train), columns = index_cols, dtype=np.int32)
train = spark.createDataFrame(train)

In [22]:
# Groupby data to get shop-item-month aggregates
gb = transactions.select('item_cnt_day', *index_cols).groupby(index_cols).sum('item_cnt_day') \
        .withColumnRenamed('sum(item_cnt_day)', 'target')
# Join it to the grid
train = train.join(gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates
gb = transactions.groupby('shop_id', 'date_block_num').sum('item_cnt_day') \
        .withColumnRenamed('sum(item_cnt_day)', 'target_shop')
# Join it to the grid
train = train.join(gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = transactions.groupby('item_id', 'date_block_num').sum('item_cnt_day') \
        .withColumnRenamed('sum(item_cnt_day)', 'target_item')
# Join it to the grid
train = train.join(gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

Create new features using lags from [1, 2, 3, 4, 5, 12] months ago.

In [25]:
cols_to_rename = [col for col in train.columns if col not in index_cols]

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in shift_range:
    train_shift = train.select(*(index_cols + cols_to_rename))
    
    train_shift.withColumn('date_block_num', train_shift.date_block_num + 1)
    
    for col in cols_to_rename:
        train_shift = train_shift.withColumnRenamed(col, '{}_lag_{}'.format(col, month_shift))
        
    train = train.join(train_shift, on=index_cols, how='left').fillna(0)

Next, join TFIDF-encoded item/item category/shop names onto this dataframe.

In [27]:
train.createOrReplaceTempView('train')
items.createOrReplaceTempView('items')

train = spark.sql(('SELECT train.*, items.item_category_id, items.tfidf_item_name '
                  ' FROM train '
                  ' LEFT JOIN items '
                  '  ON train.item_id = items.item_id '))

In [28]:
train.createOrReplaceTempView('train')
item_categories.createOrReplaceTempView('item_categories')

train = spark.sql(('SELECT train.*, item_categories.tfidf_item_category_name '
                  ' FROM train '
                  ' LEFT JOIN item_categories '
                  '  ON train.item_category_id = item_categories.item_category_id '))

In [29]:
train.createOrReplaceTempView('train')
shops.createOrReplaceTempView('shops')

train = spark.sql(('SELECT train.*, shops.tfidf_shop_name '
                  ' FROM train '
                  ' LEFT JOIN shops '
                  '  ON train.shop_id = shops.shop_id '))

## Display the resulting dataframe

In [30]:
train.limit(10).toPandas()

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_shop_lag_1,target_item_lag_1,target_lag_2,...,target_lag_5,target_shop_lag_5,target_item_lag_5,target_lag_12,target_shop_lag_12,target_item_lag_12,item_category_id,tfidf_item_name,tfidf_item_category_name,tfidf_shop_name
0,0,491,1,1.0,6127.0,33.0,1.0,6127.0,33.0,1.0,...,1.0,6127.0,33.0,1.0,6127.0,33.0,73,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.72240560...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
1,0,1119,0,1.0,5578.0,6.0,1.0,5578.0,6.0,1.0,...,1.0,5578.0,6.0,1.0,5578.0,6.0,55,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.6337940411878...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.363...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
2,0,1223,1,0.0,6127.0,2.0,0.0,6127.0,2.0,0.0,...,0.0,6127.0,2.0,0.0,6127.0,2.0,59,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.363...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
3,0,1409,1,1.0,6127.0,10.0,1.0,6127.0,10.0,1.0,...,1.0,6127.0,10.0,1.0,6127.0,10.0,19,"(0.0, 0.0, 0.0, 2.4036399453874546, 0.0, 0.0, ...","(1.5522794985941517, 0.0, 0.0, 0.0, 2.14006616...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
4,0,2441,1,4.0,6127.0,98.0,4.0,6127.0,98.0,4.0,...,4.0,6127.0,98.0,4.0,6127.0,98.0,23,"(1.8578055986550412, 0.0, 0.0, 2.4036399453874...","(1.5522794985941517, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
5,0,2822,1,1.0,6127.0,12.0,1.0,6127.0,12.0,1.0,...,1.0,6127.0,12.0,1.0,6127.0,12.0,30,"(1.8578055986550412, 2.1152096499303217, 0.0, ...","(1.5522794985941517, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
6,0,3007,0,4.0,5578.0,122.0,4.0,5578.0,122.0,4.0,...,4.0,5578.0,122.0,4.0,5578.0,122.0,75,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
7,0,3528,0,1.0,5578.0,22.0,1.0,5578.0,22.0,1.0,...,1.0,5578.0,22.0,1.0,5578.0,22.0,75,"(1.8578055986550412, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
8,0,4328,0,2.0,5578.0,11.0,2.0,5578.0,11.0,2.0,...,2.0,5578.0,11.0,2.0,5578.0,11.0,40,"(0.0, 0.0, 0.0, 0.0, 0.0, 2.4835994895198397, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
9,0,4572,0,0.0,5578.0,4.0,0.0,5578.0,4.0,0.0,...,0.0,5578.0,4.0,0.0,5578.0,4.0,55,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.363...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5014359517392..."
