In [93]:
import sklearn 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import random
from sklearn.ensemble import GradientBoostingClassifier as gbc

In [2]:
items = pd.read_csv('mch_categories.tsv', sep='\t')
items.head(10)

Unnamed: 0,code,name
0,M02,Produce
1,M0227,Produce
2,M022701,Fruit
3,M02270101,Apples
4,M02270102,Bananas
5,M02270103,Berries/Cherries
6,M02270104,Citrus
7,M02270105,Grapes
8,M02270106,Melons
9,M02270107,Pears


In [3]:
hierarchy_dict = {}
for code in items['code']:
    if len(code) not in hierarchy_dict:
        hierarchy_dict[len(code)] = []
    hierarchy_dict[len(code)].append(code)
hierarchy_dict.keys()

dict_keys([3, 5, 7, 9])

In [4]:
for length, keys in hierarchy_dict.items():
    for key in keys:
        items[key] = items['code'].apply(lambda row: int(row[:length] == key))
items.dropna()

In [278]:
dict_3, dict_5, dict_7 = {}, {}, {}
i,j,k = 1,1,1
for key in hierarchy_dict[3]:
    dict_3[key] = i
    i += 1
for key in hierarchy_dict[5]:
    dict_5[key] = j
    j += 1
for key in hierarchy_dict[3]:
    dict_7[key] = k
    k += 1

In [12]:
products = pd.read_csv('products.txt', sep='\t', header=None)
transactions = pd.read_json('transactions.txt', lines=True)

In [302]:
items['name'].nunique()

791

In [91]:
def reciept2items(reciept_list, id_exchange):
    items = {}
    for ix, item_list in enumerate(reciept_list['itemList']):
        items[ix] = []
        for item_dict in item_list:
            item_df = item_dict['item']
            item_name = (id_exchange[id_exchange[0] == item_df][2].values[0])
            item_id = (id_exchange[id_exchange[0] == item_df][1].values[0])
            if item_name != 'Plastic Bags': items[ix].append(item_id)
        if ix > 1000: break
    return items

In [92]:
item_dict = reciept2items(transactions, products)

In [213]:
def load_data(items, item_dict, batch_size=50, maxlen=8):
    train, test = [], []
    while True:
        for reciept in list(item_dict.values()):
            result = items[items['code'].isin(reciept)].loc[:,'M02':].to_numpy()[::-1]
            if result.shape[0] < maxlen+1: continue
            train.append(result[:maxlen])
            test.append(result[1:maxlen+1])
            if len(train)%batch_size == 0: 
                yield np.stack(train, axis=0), np.stack(test, axis=0)
                train, test = [], []
m = load_data(items, item_dict, 50)
next(m)[0].shape

(50, 8, 916)

In [309]:
def load_data_emb(item_dict, batch_size=8, maxlen=8):
    while True:
        i = 0
        res_1, res_2, res_3 = [], [], []
        for lst in list(item_dict.values()):
            if len(lst) < maxlen+1:
                continue
            else:
                lst = lst[:maxlen]
            r = np.zeros(len(lst))
            for ix, itm in enumerate(lst):
                for pos, key in enumerate(dict_3.keys()):
                    if key == itm[:3]:
                        r[ix] = dict_3[key]
            res_1.append(r)
            r = np.zeros(len(lst))
            for ix, itm in enumerate(lst):
                for pos, key in enumerate(dict_5.keys()):
                    if key == itm[:5]:
                        r[ix] = dict_5[key]
            res_2.append(r)
            for ix, itm in enumerate(lst):
                for pos, key in enumerate(dict_7.keys()):
                    if key == itm[:7]:
                        r[ix] = dict_7[key]
            res_3.append(r)
            
            if i%batch_size == 0 and i:
                res_1 = np.stack(res_1, axis=0)
                res_2 = np.stack(res_2, axis=0)
                res_3 = np.stack(res_3, axis=0)
                yield (res_1[:maxlen], res_1[1:]), (res_2[:maxlen], res_2[1:]), (res_3[:maxlen], res_3[1:])
                res_1, res_2, res_3 = [], [], []
            i += 1
m = load_data_emb(item_dict)
next(m)

((array([[ 9.,  9.,  9.,  1.,  9.,  9.,  2.,  9.],
         [ 9.,  9.,  9.,  4.,  9.,  9.,  3.,  1.],
         [ 9.,  9.,  9.,  6.,  4.,  9.,  9.,  9.],
         [ 1.,  9.,  1.,  1.,  9.,  1.,  9.,  9.],
         [ 9.,  9.,  9.,  2.,  1.,  9.,  9.,  9.],
         [ 1.,  1.,  3.,  1.,  9.,  1.,  1.,  9.],
         [ 1.,  9., 15.,  1.,  9.,  9.,  9.,  9.],
         [ 9.,  1.,  9.,  9.,  9.,  9.,  1.,  1.]]),
  array([[ 9.,  9.,  9.,  4.,  9.,  9.,  3.,  1.],
         [ 9.,  9.,  9.,  6.,  4.,  9.,  9.,  9.],
         [ 1.,  9.,  1.,  1.,  9.,  1.,  9.,  9.],
         [ 9.,  9.,  9.,  2.,  1.,  9.,  9.,  9.],
         [ 1.,  1.,  3.,  1.,  9.,  1.,  1.,  9.],
         [ 1.,  9., 15.,  1.,  9.,  9.,  9.,  9.],
         [ 9.,  1.,  9.,  9.,  9.,  9.,  1.,  1.],
         [ 1.,  1.,  1.,  9.,  1.,  9.,  1.,  9.]])),
 (array([[21., 21., 22.,  1., 22., 21.,  3., 21.],
         [24., 21., 21.,  7., 24., 22.,  6.,  2.],
         [22., 22., 21., 14.,  7., 21., 24., 24.],
         [ 1., 22.,  1.,  

In [303]:
def LSTM_model(maxlen=8):
    # Sequential approach to product prediction
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=(maxlen, 916)))
    model.add(tf.keras.layers.LSTM(256, activation='relu', dropout=0.2, return_sequences=True))
    model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'))
    model.add(tf.keras.layers.LSTM(256, activation='relu', recurrent_dropout=0.2, return_sequences=True))
    model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'))
    model.add(tf.keras.layers.LSTM(256, activation='relu', recurrent_dropout=0.2, return_sequences=True))
    model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'))
    model.add(tf.keras.layers.LSTM(256, activation='relu', recurrent_dropout=0.2, return_sequences=True))
    model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(916, activation='softmax')))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    return model
model = get_model()
model.summary()

Model: "sequential_52"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_64 (LSTM)               (None, 8, 256)            1201152   
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 8, 64)             81984     
_________________________________________________________________
lstm_65 (LSTM)               (None, 8, 256)            328704    
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 8, 64)             81984     
_________________________________________________________________
lstm_66 (LSTM)               (None, 8, 256)            328704    
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 8, 64)             81984     
_________________________________________________________________
lstm_67 (LSTM)               (None, 8, 256)          

In [311]:
model.fit_generator(load_data(items, item_dict, batch_size=4), steps_per_epoch=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x208777d2780>