# Setup

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import os

# Import specific packages
import re
from collections import Counter
from scipy.sparse import csr_matrix

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Load data

In [2]:
item_cat = pd.read_csv('./data/item_categories.csv')
items = pd.read_csv('./data/items.csv')
sales_train = pd.read_csv('./data/sales_train.csv')
shops = pd.read_csv('./data/shops.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
item_cat.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [4]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [5]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [6]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [7]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [8]:
sample_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


# Data wrangling

First of all, let's join the `items` and `item_cat` together.

In [9]:
items = pd.merge(items, item_cat, how='left', on=['item_category_id'])

Now that we have the new `items` table, we can join it to the `sales_train` to create a flatten `order_item` table.

In [10]:
order_item = pd.merge(sales_train, items, how='left', on=['item_id'])

In [11]:
order_item.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,03.01.2013,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,05.01.2013,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
3,06.01.2013,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
4,15.01.2013,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства


In [12]:
print('num_records in order_item:', len(order_item))
order_item.describe()

num_records in order_item: 2935849


Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641,40.00138
std,9.422988,16.22697,6324.297,1729.8,2.618834,17.10076
min,0.0,0.0,0.0,-1.0,-22.0,0.0
25%,7.0,22.0,4476.0,249.0,1.0,28.0
50%,14.0,31.0,9343.0,399.0,1.0,40.0
75%,23.0,47.0,15684.0,999.0,1.0,55.0
max,33.0,59.0,22169.0,307980.0,2169.0,83.0


As we notice that there are some negative item_price and negative item_cnt_day. We will filter out those records.

In [13]:
order_item = order_item[(order_item['item_price']>0) & (order_item['item_cnt_day']>0)]

In [14]:
print('num_records in order_item:', len(order_item))

num_records in order_item: 2928492


Let's check if the (date, shop_id, item_id) are unique.

In [15]:
a = order_item.groupby(['date', 'shop_id', 'item_id', 'item_category_id'], as_index=False)['item_cnt_day'].count().rename(columns={'item_cnt_day': 'count'}).sort_values(by=['count'], ascending=False)
a.head()

Unnamed: 0,date,shop_id,item_id,item_category_id,count
2674113,29.01.2013,25,12133,62,2
669764,07.10.2015,12,21619,37,2
2778335,30.03.2014,31,16875,37,2
1073110,12.02.2015,42,21619,37,2
1976537,21.11.2014,31,16587,40,2


In [16]:
order_item[(order_item['date']=='29.01.2013') & (order_item['shop_id']==25) & (order_item['item_id']==12133)]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name
6959,29.01.2013,0,25,12133,889.0,1.0,Игровой набор с микророботами Вориорз для поед...,62,"Подарки - Гаджеты, роботы, спорт"
6960,29.01.2013,0,25,12133,1389.0,1.0,Игровой набор с микророботами Вориорз для поед...,62,"Подарки - Гаджеты, роботы, спорт"


Now that we know the same item could be sold at different price at the same day. Therefore there are two records. As in the test dataset, there is no item_price. Hence let's assume that the item_price to be the latest item_price.

In [17]:
item_price = order_item.sort_values(by=['date'], ascending=False).groupby(['item_id', 'shop_id'], as_index=False)['item_price'].first()#.rename(columns={'item_price': 'item_price'})
item_price.head()

Unnamed: 0,item_id,shop_id,item_price
0,0,54,58.0
1,1,55,4490.0
2,2,54,58.0
3,3,54,58.0
4,4,54,58.0


Let's try some stemming! For this to work, we need the Natural Language Toolkit ([NLTK](http://www.nltk.org/)). 

In [18]:
try:
    import nltk
    
    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, '=>', stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


Now let's create a `custom transformer` that we will use to convert sentense to word counters.

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin


class TextToWordCounterTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, 
                 lower_case=True, 
                 remove_punctuation=True, 
                 replace_numbers=True, 
                 stemming=True):
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_numbers = replace_numbers
        self.stemming = stemming
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        for text in X:
            if self.lower_case:
                text = text.lower()
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text)
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.stemming and stemmer is not None:
                text = stemmer.stem(text)
            word_counts = Counter(text.split())
            X_transformed.append(word_counts)
        return np.array(X_transformed)

Let's try the transformer on a few item_names.

In [20]:
X_few = np.array(order_item['item_name'][:2])
X_few_wordcounts = TextToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'явление': 1, 'number': 1, 'bd': 1}),
       Counter({'deep': 1, 'purple': 1, 'the': 1, 'house': 1, 'of': 1, 'blue': 1, 'light': 1, 'lp': 1})],
      dtype=object)

This looks quite good.

Now we need to convert them to vectors. For this, let's build a another transformer whose `fit()` method will build the vocabulary and whose `transform()` method will use the vocabulary to convert word counts to vectors. 

In [25]:
class WordCounterToVector(BaseEstimator, TransformerMixin):
    
    def __init__(self, len_vocabulary=100):
        self.len_vocabulary = len_vocabulary
        
    def fit(self, X, y=None):
        vocabulary_counter = Counter({}) 
        for counter in X:
            vocabulary_counter = vocabulary_counter + counter
        vocabulary_counter = vocabulary_counter.most_common(self.len_vocabulary)
        self.vocabulary = {word: index for index, (word, cnt) in enumerate(vocabulary_counter, start=1)}
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        row = 0
        for row, counter in enumerate(X):
            for (word, cnt) in counter.items():
                rows.append(row)
                cols.append(self.vocabulary.get(word,0))
                data.append(cnt)
        a = csr_matrix((data, (rows, cols)), shape=(len(X), self.len_vocabulary))
        return a

In [27]:
WordCounterToVector().fit_transform(X_few_wordcounts).toarray()

array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)