# Set up packages and dataframes

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.feature import CountVectorizer

import os
import numpy as np
from nltk.stem.snowball import SnowballStemmer
import re
import regex

# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [2]:
DATA_FOLDER = 'data/'


def _read_csv(file_name):
    """Load one CSV file from DATA_FOLDER into a Spark dataframe.

    The first row is treated as the header and the column types are
    inferred by Spark.

    Args:
        file_name (str): Name of the CSV file inside DATA_FOLDER.

    Returns:
        The Spark dataframe read from the file.
    """
    return spark.read.options(
        header=True,
        inferSchema=True
    ).csv(os.path.join(DATA_FOLDER, file_name))


# One dataframe per input file; all of them are read with identical options.
transactions = _read_csv('sales_train.csv')
items = _read_csv('items.csv')
item_categories = _read_csv('item_categories.csv')
shops = _read_csv('shops.csv')
test = _read_csv('test.csv')

# EDA

## Look at dataframes

For each dataframe, print the total number of rows and show its first 10 rows.

In [3]:
# Report the size of each loaded dataframe and preview its first 10 rows,
# in the order: transactions, items, item_categories, shops, test.
for frame in (transactions, items, item_categories, shops, test):
    row_count = frame.count()
    print('Total number of rows: {}'.format(row_count))
    frame.show(10)

Total number of rows: 2935849
+----------+--------------+-------+-------+----------+------------+
|      date|date_block_num|shop_id|item_id|item_price|item_cnt_day|
+----------+--------------+-------+-------+----------+------------+
|02.01.2013|             0|     59|  22154|     999.0|         1.0|
|03.01.2013|             0|     25|   2552|     899.0|         1.0|
|05.01.2013|             0|     25|   2552|     899.0|        -1.0|
|06.01.2013|             0|     25|   2554|   1709.05|         1.0|
|15.01.2013|             0|     25|   2555|    1099.0|         1.0|
|10.01.2013|             0|     25|   2564|     349.0|         1.0|
|02.01.2013|             0|     25|   2565|     549.0|         1.0|
|04.01.2013|             0|     25|   2572|     239.0|         1.0|
|11.01.2013|             0|     25|   2572|     299.0|         1.0|
|03.01.2013|             0|     25|   2573|     299.0|         3.0|
+----------+--------------+-------+-------+----------+------------+
only showing top 1

We see that in the training dataset, there are 2935849 transactions, 22170 items, 84 item categories and 60 shops.

# Feature Extraction

## Join all data onto the transactions dataframe 

Upon inspection, one sees that all features appearing in the training dataframes above can be joined onto the transactions dataframe using the appropriate ID (`item_id`, `item_category_id` or `shop_id`).

In [4]:
# Register both dataframes under the view names referenced by the query.
items.createOrReplaceTempView('items')
transactions.createOrReplaceTempView('transactions')

# Left join on item_id: adds each item's name and category id to the transactions.
item_join_sql = ('SELECT transactions.*, items.item_name, items.item_category_id '
                 ' FROM transactions '
                 ' LEFT JOIN items '
                 '  ON transactions.item_id = items.item_id ')
transactions = spark.sql(item_join_sql)

In [5]:
# Re-register 'transactions' so the view points at the dataframe that already
# carries the item columns, then register the category lookup table.
item_categories.createOrReplaceTempView('item_categories')
transactions.createOrReplaceTempView('transactions')

# Left join on item_category_id: adds the category name to the transactions.
category_join_sql = ('SELECT transactions.*, item_categories.item_category_name '
                     ' FROM transactions '
                     ' LEFT JOIN item_categories '
                     '  ON transactions.item_category_id = item_categories.item_category_id ')
transactions = spark.sql(category_join_sql)

In [6]:
# Re-register 'transactions' so the view reflects the current dataframe,
# then register the shop lookup table.
shops.createOrReplaceTempView('shops')
transactions.createOrReplaceTempView('transactions')

# Left join on shop_id: adds the shop name to the transactions.
shop_join_sql = ('SELECT transactions.*, shops.shop_name '
                 ' FROM transactions '
                 ' LEFT JOIN shops '
                 '  ON transactions.shop_id = shops.shop_id ')
transactions = spark.sql(shop_join_sql)

## Extract the day of the year, month and year from the date.

In [7]:
# Parse values such as 02.01.2013 (pattern 'dd.MM.yyyy') into dates,
# replacing the original 'date' column.
parsed_date = F.to_date(transactions['date'], format='dd.MM.yyyy')
transactions = transactions.withColumn('date', parsed_date)

In [8]:
# Derive calendar features from the parsed date.
# NOTE(review): 'day' is the day of the *year* (F.dayofyear), not the day of
# the month — confirm this is the intended feature.
transactions = (
    transactions
    .withColumn('day', F.dayofyear(F.col('date')))
    .withColumn('month', F.month(F.col('date')))
    .withColumn('year', F.year(F.col('date')))
)

## Extract text-based features

### Stem the text

Define a stemmer that can handle both Russian and English text using nltk's Snowball Stemmer.

In [9]:
# One Snowball stemmer per language handled by the notebook.
en_stemmer = SnowballStemmer('english')
ru_stemmer = SnowballStemmer('russian')

# Patterns that detect Cyrillic / Latin script in a word.
# Raw strings are required: '\p' is not a valid Python string escape, so the
# non-raw form triggers an "invalid escape sequence" warning on recent Pythons.
cyr_regex = regex.compile(r'\p{Cyrillic}+', regex.UNICODE)
lat_regex = regex.compile(r'\p{Latin}+', regex.UNICODE)

In [10]:
# Compiled once at definition time instead of on every call.
# Matches runs of characters from the Unicode general categories
# C (other/control), M (marks), P (punctuation), S (symbols),
# Z (separators, incl. spaces) and N (numbers).
# The original pattern had '|' between the properties; inside a character class
# that is a literal pipe, which is already covered by \p{S}, so it is dropped.
_UNWANTED_CHARS = regex.compile(r'[\p{C}\p{M}\p{P}\p{S}\p{Z}\p{N}]+', regex.UNICODE)


def clean_text(text):
    r""" Removes URLs, punctuation, unwanted unicode characters, and numbers. Returns in lowercase.

    Every run of removed characters is replaced by a single space, and leading
    and trailing whitespace is stripped, so the words of the result are
    separated by exactly one space.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text after filtering by the regex expression and lowercasing.

    For more information on the unicode categories used in the regex expression see here:
    https://www.regular-expressions.info/unicode.html#category

    >>> clean_text("!$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ Can't, - Trademark™ ...「（Punctuation）」42.32 ?")
    'cant trademark punctuation'

    """
    # remove URLs
    text = re.sub(r"http\S+", "", text)
    # remove apostrophes (deleted outright so that "can't" becomes "cant", not "can t")
    text = text.replace("'", "")

    # replace each run of unwanted characters with one space, then trim the ends
    text = _UNWANTED_CHARS.sub(" ", text).strip()

    # make lowercase
    return text.lower()

def stemmer(text):
    """Clean a string, split it into words, and stem each word according to its script.

    A word containing Cyrillic characters is stemmed with the Russian stemmer;
    otherwise, a word containing Latin characters is stemmed with the English
    stemmer; any other word is kept unchanged.

    Args:
        text (str or None): The string whose Cyrillic and Latin words will be stemmed.

    Returns:
        list of str: The stemmed words in their original order. The list is
        empty when ``text`` is None or has no words left after cleaning.
    """
    if text is None:
        return []

    # clean_text separates words with single spaces. str.split() returns [] for
    # an empty string, whereas re.split(r'\s', '') returns [''] and would add a
    # spurious empty token to the result (and to the downstream vocabulary).
    words = clean_text(text).split()

    stemmed_word_list = []
    for word in words:
        # Cyrillic takes precedence when a word mixes both scripts.
        if cyr_regex.search(word):
            stemmed_word = ru_stemmer.stem(word)
        elif lat_regex.search(word):
            stemmed_word = en_stemmer.stem(word)
        else:
            stemmed_word = word
        stemmed_word_list.append(stemmed_word)

    return stemmed_word_list

Demonstrate function on sample text from the dataset.

In [11]:
# Sample string mixing Cyrillic and Latin words with punctuation.
text = '(Кино) - Blu-Ray'

stemmer(text)

['кин', 'blu', 'ray']

Apply stemmer to columns containing text.

In [12]:
# Wrap the Python stemmer as a Spark UDF that returns an array of strings.
udf_stemmer = F.udf(stemmer, ArrayType(StringType(), True))

In [13]:
# Add a 'stemmed_<column>' column of stemmed words for each text column.
for text_col in ('item_name', 'item_category_name', 'shop_name'):
    transactions = transactions.withColumn(
        'stemmed_' + text_col,
        udf_stemmer(transactions[text_col]),
    )

### Vectorize using bag of words

In [14]:
# Bag-of-words estimator with default parameters; its input and output columns are set before each fit.
cv = CountVectorizer()

In [15]:
cols = ['item_name', 'item_category_name', 'shop_name']

# Fit a separate vocabulary on each stemmed column and append the resulting
# count vectors as 'vect_<column>'.
for col in cols:
    stemmed_col = 'stemmed_{}'.format(col)
    vect_col = 'vect_{}'.format(col)

    model = cv.setInputCol(stemmed_col).setOutputCol(vect_col).fit(transactions)
    model.setInputCol(stemmed_col)
    transactions = model.transform(transactions)

## Display the resulting dataframe.

In [16]:
# Convert only the first 10 rows to a pandas dataframe for display.
transactions.limit(10).toPandas()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name,shop_name,day,month,year,stemmed_item_name,stemmed_item_category_name,stemmed_shop_name,vect_item_name,vect_item_category_name,vect_shop_name
0,2013-01-02,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,"""Ярославль ТЦ """"Альтаир""""""",2,1,2013,"[явлен, bd]","[кин, blu, ray]","[ярославл, тц, альтаир]","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2013-01-03,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"""Москва ТРК """"Атриум""""""",3,1,2013,"[deep, purpl, the, hous, of, blue, light, lp]","[музык, вин]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
2,2013-01-05,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"""Москва ТРК """"Атриум""""""",5,1,2013,"[deep, purpl, the, hous, of, blue, light, lp]","[музык, вин]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,2013-01-06,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,"""Москва ТРК """"Атриум""""""",6,1,2013,"[deep, purpl, who, do, you, think, we, are, lp]","[музык, вин]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,2013-01-15,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,"""Москва ТРК """"Атриум""""""",15,1,2013,"[deep, purpl, veri, best, of, cd, фирм]","[музык, cd, фирмен, производств]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
5,2013-01-10,0,25,2564,349.0,1.0,DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео,"""Москва ТРК """"Атриум""""""",10,1,2013,"[deep, purpl, perihelion, live, in, concert, d...","[музык, музыкальн, виде]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
6,2013-01-02,0,25,2565,549.0,1.0,DEEP PURPLE Stormbringer (фирм.),56,Музыка - CD фирменного производства,"""Москва ТРК """"Атриум""""""",2,1,2013,"[deep, purpl, stormbring, фирм]","[музык, cd, фирмен, производств]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
7,2013-01-04,0,25,2572,239.0,1.0,DEFTONES Koi No Yokan,55,Музыка - CD локального производства,"""Москва ТРК """"Атриум""""""",4,1,2013,"[defton, koi, no, yokan]","[музык, cd, локальн, производств]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
8,2013-01-11,0,25,2572,299.0,1.0,DEFTONES Koi No Yokan,55,Музыка - CD локального производства,"""Москва ТРК """"Атриум""""""",11,1,2013,"[defton, koi, no, yokan]","[музык, cd, локальн, производств]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
9,2013-01-03,0,25,2573,299.0,3.0,DEL REY LANA Born To Die,55,Музыка - CD локального производства,"""Москва ТРК """"Атриум""""""",3,1,2013,"[del, rey, lana, born, to, die]","[музык, cd, локальн, производств]","[москв, трк, атриум]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
