In [1]:
import pandas as pd
import numpy as np
import jsonlines

from scipy.sparse import dok_matrix, csr_matrix, lil_matrix

In [2]:
# Load the category table (one code/name pair per row) from a tab-separated file
# and preview its first rows.
MCH = pd.read_csv('mch_categories.tsv', sep="\t")
MCH.head()

Unnamed: 0,code,name
0,M02,Produce
1,M0227,Produce
2,M022701,Fruit
3,M02270101,Apples
4,M02270102,Bananas


In [3]:
# Keep only the categories whose code is exactly 7 characters long and strip the
# 'Z_DNU ' marker from their names.
# Built in one vectorised pass: appending rows one at a time with .loc re-allocates
# the frame on every insert, and the .str accessor tolerates missing names.
MCH7 = (
    MCH.loc[MCH['code'].str.len() == 7, ['code', 'name']]
    .rename(columns={'code': 'category_code', 'name': 'category_name'})
    .assign(category_name=lambda df: df['category_name'].str.replace('Z_DNU ', '', regex=False))
    .reset_index(drop=True)  # 0..n-1 labels, as the row-by-row version produced
)

In [4]:
# Preview the first ten 7-character categories.
# Note that not every category is a food item (e.g. 'Hard Goods').
MCH7.head(10)

Unnamed: 0,category_code,category_name
0,M022701,Fruit
1,M022702,Hard Goods
2,M022703,Vegetables
3,M022705,Other
4,M022801,Salad Bar
5,M033101,Processed
6,M033102,Frozen Boxed Meat
7,M033103,Fresh Beef
8,M033104,Fresh-Lamb/Veal/Sausage
9,M033105,Other


In [5]:
# The column labels are supplied explicitly, so the first line of products.txt
# is read as data rather than as a header.
PRODUCTS = pd.read_csv(
    'products.txt',
    sep="\t",
    names=['product_code', 'mch', 'product_name'],
)

# Lookup tables: product code -> row position, and row position -> name / category code.
code_to_index = {code: position for position, code in enumerate(PRODUCTS['product_code'])}
index_to_name = dict(enumerate(PRODUCTS['product_name']))
index_to_mch = dict(enumerate(PRODUCTS['mch']))

In [6]:
# Preview the product table to check the column labels assigned on load.
PRODUCTS.head()

Unnamed: 0,product_code,mch,product_name
0,20000002_EA,M10210701,Tuna Chunks in Broth
1,20000005_EA,M02270201,Fresh-Pressed Sweet Apple Cider
2,20000053_EA,M10210901,French Dijon Mustard
3,20000056001_KG,M02270304,Anaheim Peppers
4,20000068_KG,M05350101,Swiss Cheese


In [54]:
# Scan transactions.txt and keep only the receipts that mention a demo product.
test_product_codes = ['20330687001_EA', '20309646001_EA', '20666167007_EA']

test_list = []  # item lists of the receipts containing at least one demo product
count = 0       # number of receipts read from the file
with jsonlines.open('transactions.txt') as reader:
    for count, transaction in enumerate(reader, start=1):
        items = transaction['itemList']
        # one matching line is enough to keep the whole receipt
        if any(entry['item'] in test_product_codes for entry in items):
            test_list.append(items)

num_transactions = len(test_list)

In [57]:
# co-occurrence counts from test_list receipts
# sparse matrix, one row and one column per product;
# int32 rather than int16 so a pair seen more than 32,767 times cannot overflow
conf_matrix = dok_matrix((len(PRODUCTS), len(PRODUCTS)), dtype=np.int32)

In [58]:
# For every receipt, add one to the (symmetric) co-occurrence count of each pair
# of item lines on that receipt.
for transaction in test_list:
    keys = [code_to_index[entry['item']] for entry in transaction]

    for pos, key1 in enumerate(keys):
        for key2 in keys[pos + 1:]:
            if key1 == key2:
                # The same product listed twice on one receipt is not a co-occurrence;
                # counting it would inflate the diagonal and let a product be
                # ranked as its own best companion.
                continue

            conf_matrix[key1, key2] += 1
            conf_matrix[key2, key1] += 1

In [59]:
# Row vector holding, per product, the number of item lines naming it across
# all receipts in test_list.
trans_vec = lil_matrix((1, len(PRODUCTS)), dtype=np.float32)

for receipt in test_list:
    for entry in receipt:
        trans_vec[0, code_to_index[entry['item']]] += 1

In [None]:
# calculate lift matrix
# Start from an empty float64 CSR matrix with one row and one column per product.
lift_matrix = csr_matrix((len(PRODUCTS), len(PRODUCTS)), dtype=np.float64)

In [None]:
# Group the category codes by their length.
# Shorter codes are the more general product types, longer codes the more specific ones.
hierarchy_dict = {}
for code in MCH['code']:
    hierarchy_dict.setdefault(len(code), []).append(code)

In [None]:
# one-hot encode the hierarchy features for each item
# calculate a cosine similarity as penalty for items belonging to different categories
def category_penalty(i, j, hierarchy, products):
    """Cosine similarity between the hierarchy vectors of products ``i`` and ``j``.

    Both vectors come from ``get_row``. The result is used as a multiplier, so a
    smaller value penalises pairs whose categories differ.

    Returns:
        The cosine similarity, or ``0`` when either vector has zero norm.
    """
    vec_i = get_row(i, hierarchy, products)
    vec_j = get_row(j, hierarchy, products)

    norm_i = np.linalg.norm(vec_i)
    norm_j = np.linalg.norm(vec_j)

    # An all-zero vector has no direction, so the similarity is defined as 0.
    if norm_i == 0 or norm_j == 0:
        return 0

    return np.dot(vec_i, vec_j) / (norm_i * norm_j)
    
def get_row(i, hierarchy, products):
    """Build the one-hot hierarchy vector for the product at row label ``i``.

    The vector is the concatenation of four one-hot segments, one per code
    length (3, 5, 7, 9), each as long as ``hierarchy[length]``.

    Args:
        i: Row label in ``products``.
        hierarchy: Mapping from code length to the list of category codes of
            that length; must contain the keys 3, 5, 7 and 9.
        products: DataFrame with an ``'mch'`` column holding the category code
            of each product.

    Returns:
        A list of ints with four 1s when the product's code is known at every
        level, otherwise all zeros.
    """
    mch = products.at[i, 'mch']

    # Value looked up at each level; the deepest level must match the full code.
    lookups = [(3, mch[:3]), (5, mch[:5]), (7, mch[:7]), (9, mch[:])]

    # All lookups go through the `hierarchy` argument, so the function no longer
    # depends on a module-level `hierarchy_dict` being defined.
    row_vec = [0] * sum(len(hierarchy[level]) for level, _ in lookups)

    hot_positions = []
    offset = 0  # start of the current level's segment inside row_vec
    for level, prefix in lookups:
        codes = hierarchy[level]
        if prefix not in codes:
            # A code missing at any level yields the all-zero vector.
            return row_vec
        hot_positions.append(offset + codes.index(prefix))
        offset += len(codes)

    for position in hot_positions:
        row_vec[position] = 1

    return row_vec

In [None]:
row, col = conf_matrix.nonzero()
print(len(row))

# Dense float64 copy of the per-product counts: an ndarray lookup is much cheaper
# than indexing the lil_matrix once per pair.
item_counts = trans_vec.toarray().ravel().astype(np.float64)

# Collect (row, col, value) triplets and build the CSR matrix once at the end.
# Assigning into a CSR matrix element by element rebuilds its index arrays on
# every insert and triggers SparseEfficiencyWarning.
lift_rows, lift_cols, lift_vals = [], [], []
for i, j in zip(row, col):
    if item_counts[i] > 0 and item_counts[j] > 0:
        lift_rows.append(i)
        lift_cols.append(j)
        # co-occurrence count, scaled by the category similarity and normalised
        # by the two products' individual counts
        lift_vals.append(
            conf_matrix[i, j] * category_penalty(i, j, hierarchy_dict, PRODUCTS)
            / (item_counts[i] * item_counts[j])
        )

lift_matrix = csr_matrix(
    (lift_vals, (lift_rows, lift_cols)),
    shape=(len(PRODUCTS), len(PRODUCTS)),
    dtype=np.float64,
)

1259460


In [None]:
# get some outputs for the demo products:
# for each one, print the product it co-occurs with most often
for code in test_product_codes:
    ix = code_to_index[code]

    row = conf_matrix.getrow(ix).toarray()
    row[0, ix] = 0  # a product must never be recommended for itself

    print("test product = ", index_to_name[ix])
    if row.max() > 0:
        # argmax returns the first column holding the maximum
        print("recommendation = ", index_to_name[int(row.argmax())])
    else:
        # no co-occurrence at all: report that instead of defaulting to product 0
        print("recommendation = ", None)

In [None]:
# get the top k indices, save this list of list as output for the model
def top_k_idx_sparse(matrix, k):
    """Return, for every row, the column indices of its ``k`` largest stored values.

    Args:
        matrix: A scipy sparse matrix; it is converted to CSR if it is in
            another format.
        k: Maximum number of indices to return per row.

    Returns:
        A list with one integer array per row. Each array holds at most ``k``
        column indices (fewer when the row stores fewer values) in no
        particular order. Rows with no stored values, and every row when
        ``k <= 0``, give an empty array.
    """
    matrix = matrix.tocsr()  # no-op for CSR input; lets dok/lil/coo inputs work too

    top_k_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        k_picks = min(k, ri - le)
        if k_picks <= 0:
            # Guard needed because a slice of [-0:] would select the whole row.
            top_k_idx.append(matrix.indices[le:le])
            continue

        # positions (within this row's slice) of the k_picks largest values
        picks = np.argpartition(matrix.data[le:ri], -k_picks)[-k_picks:]
        top_k_idx.append(matrix.indices[le + picks])
    return top_k_idx