In [1]:
import pandas as pd
import numpy as np
from time import time
from IPython.display import display, HTML
from scipy.sparse import dok_matrix
from scipy.sparse import save_npz

In [2]:
class SimpleLogger:
    def __init__(self):
        self.last_time = time()
        self.c_time = time()
  
    def VerboseLog(self, _str, show_time = False):
        self.last_time = self.c_time
        self.c_time = time()
        if show_time:
            _str += " [{:.2f}s]".format(self.c_time-self.last_time)
        print(_str, flush = True)
  
    def log(self, _str, show_time = False):
        self.last_time = self.c_time
        self.c_time = time()
        if show_time:
            _str += " [{:.2f}s]".format(self.c_time-self.last_time)    
        print(_str, flush = True)

logger = SimpleLogger()

In [4]:
def write_df(df, fname):
    import os
    if not os.path.isfile(fname):
        df.to_csv(fname)
    else:
        df.to_csv(fname, mode = 'a', header = False)
    
    return

def add_vect_to_mco(mco, vect, window = 2, max_count = 64000):
    DEBUG = False
    for i, prod_id in enumerate(vect):
        nr_left_prods = min(vect[:i].shape[0], window)
        nr_right_prods = min(vect[i+1:].shape[0], window)
        for j in range(nr_left_prods):
            l_ind = i - (j + 1)
            l_prod = vect[l_ind]
            if l_prod != prod_id:
                current_mco_val = mco[prod_id, l_prod]
                mco[prod_id, l_prod] = min(current_mco_val + 1 / (2 ** (j + 1)), max_count)
                if DEBUG:
                    print('MCO[{}][{}] = {}'.format(prod_id, l_prod, mco[prod_id, l_prod]))

        for j in range(nr_right_prods):
            r_ind = i + j + 1
            r_prod = vect[r_ind]
            if r_prod != prod_id:
                current_mco_val = mco[prod_id, r_prod]
                mco[prod_id, r_prod] = min(current_mco_val + 1 / (2 ** (j + 1)), max_count)
                if DEBUG:
                    print('MCO[{}][{}] = {}'.format(prod_id, r_prod, mco[prod_id, r_prod]))

    return mco           

#unique_trans_grouped = None
def process_chunk(df_chunk, tran_field, tran_det_field, mco, prod_name_field):
    #global unique_trans_grouped
    import itertools
    
    start = time()
    unique_trans_grouped = df_chunk.groupby(tran_field).apply(lambda x: [x[prod_name_field].values, x[tran_det_field].values])
    unique_trans_grouped = np.array(unique_trans_grouped)
    logger.log('  Groupby operation finished in {:.2f}s'.format(time() - start))
    
    start = time()
    for i, l in enumerate(unique_trans_grouped):
        real_order_mask = np.argsort(l[1])
        market_basket = l[0][real_order_mask]
        market_basket = np.array([k for k,g in itertools.groupby(market_basket)]) # collapse adjacent identic elements
        if market_basket.shape[0] == 1:
            continue
            
        mco = add_vect_to_mco(mco, market_basket)

    logger.log('  Unique market baskets processed in {:.2f}s'.format(time() - start))
    
    return mco

In [5]:
keep_cols = ['TRAN_ID', 'TRAN_DET_ID', 'SITE_ID', 'TIMESTAMP', 'NEW_ID']
all_cols = ['TRAN_ID', 'TRAN_DET_ID', 'SITE_ID', 'CUST_ID', 'ITEM_ID', 'QTY', 'AMOUNT', 'TIMESTAMP', 'NEW_ID']
chunksize = 10e6
nr_products = 28377

reader = pd.read_csv('full_dataset_transactions.csv',
                     names = all_cols,
                     chunksize = chunksize)

logger.log('Creating mco ...')
mco = np.zeros((nr_products + 1, nr_products + 1), dtype = np.float32)
logger.log('Created mco.', show_time = True)

Creating mco ...
Created mco. [0.02s]


In [6]:
for i, batch in enumerate(reader):
    logger.log('Preprocessing batch {} with {:,} entries...'.format(i+1, batch.shape[0]))
    df = pd.DataFrame(batch[keep_cols])
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
    df.loc[:, 'FTRAN'] = df['SITE_ID'].astype(str) + df['TRAN_ID'].astype(str)
    df.drop(['TRAN_ID', 'SITE_ID'], axis = 1, inplace = True)
    logger.log('Done preprocessing batch.', show_time = True)
    logger.log('Processing the batch #{} ...'.format(i+1))
    mco = process_chunk(df, 'FTRAN', 'TRAN_DET_ID', mco, 'NEW_ID')
    logger.log('Done processing the batch.')

mco.dump('mco_f32.npy')

Preprocessing batch 1 with 10,000,000 entries...
Done preprocessing batch. [53.67s]
Processing the batch #1 ...
  Groupby operation finished in 372.04s
  Unique market baskets processed in 91.18s
Done processing the batch.
Preprocessing batch 2 with 10,000,000 entries...
Done preprocessing batch. [56.02s]
Processing the batch #2 ...
  Groupby operation finished in 390.13s
  Unique market baskets processed in 92.30s
Done processing the batch.
Preprocessing batch 3 with 10,000,000 entries...
Done preprocessing batch. [56.66s]
Processing the batch #3 ...
  Groupby operation finished in 381.90s
  Unique market baskets processed in 97.57s
Done processing the batch.
Preprocessing batch 4 with 10,000,000 entries...
Done preprocessing batch. [55.09s]
Processing the batch #4 ...
  Groupby operation finished in 386.18s
  Unique market baskets processed in 95.45s
Done processing the batch.
Preprocessing batch 5 with 10,000,000 entries...
Done preprocessing batch. [55.03s]
Processing the batch #5 

<h3> Sanity check </h3>

In [None]:
# DO THESE OPERATIONS WITH chunksize=2e3 and DEBUG set to True in add_vect_to_mco

df_test = pd.DataFrame(reader.get_chunk()[keep_cols])
df_test.loc[:, 'FTRAN'] = df_test['SITE_ID'].astype(str) + df_test['TRAN_ID'].astype(str)
df_test.drop(['TRAN_ID', 'SITE_ID', 'TIMESTAMP'], axis = 1, inplace = True)

logger.log('Start grouping {:,} entries ... '.format(df_test.shape[0]))
unique_trans_grouped = df_test.groupby('FTRAN').apply(lambda x: [x['NEW_ID'].values, x['TRAN_DET_ID'].values])
logger.log('Finished grouping.', show_time = True)



In [None]:
# DO THESE OPERATIONS WITH chunksize=2e3 and DEBUG set to True in add_vect_to_mco

mco = np.zeros((nr_products + 10, nr_products + 10), dtype = np.float16)
tqdm_works=True
import itertools
for i, l in enumerate(unique_trans_grouped):
    real_order_mask = np.argsort(l[1])
    market_basket = l[0][real_order_mask]
    market_basket = np.array([k for k,g in itertools.groupby(market_basket)]) # collapse adjacent identic elements
    if market_basket.shape[0] == 1:
        continue

    print(market_basket)
        
    mco = add_vect_to_mco(mco, market_basket)
    if tqdm_works:
        one_percent = int(unique_trans_grouped.shape[0] * 0.01)
        if i % one_percent == 0:
            logger.log('  Processed {:.2f}%.'.format((i / unique_trans_grouped.shape[0]) * 100))

<h3> Sanity check nr. 2 - for the first 10M chunk </h3>

In [None]:
start = time()
unique_trans_grouped = df.groupby('FTRAN').apply(lambda x: [x['NEW_ID'].values, x['TRAN_DET_ID'].values])
unique_trans_grouped = np.array(unique_trans_grouped)
logger.log('  Groupby operation finished in {:.2f}s'.format(time() - start))

In [None]:
print(mco[1387][397])
print(mco[1387][342])
print(mco[1387][62])
print(mco[1387][2249])
print(mco[1387][183])
print(mco[1387][1300])
print(mco[1387][43])

In [None]:
import itertools
d = []
for i, l in enumerate(unique_trans_grouped):
    real_order_mask = np.argsort(l[1])
    market_basket = l[0][real_order_mask]
    market_basket = np.array([k for k,g in itertools.groupby(market_basket)]) # collapse adjacent identic elements
    if market_basket.shape[0] == 1:
        continue
        
    if (1387 in market_basket) and (187 in market_basket):
        d.append(market_basket)

In [None]:
sc_mco = np.zeros((29000,29000), dtype = np.float16)
for v in d:
    sc_mco = add_vect_to_mco(sc_mco, v)
print(sc_mco[5,12])
print(sc_mco[12,5])

In [None]:
import itertools
d = []
for i, l in enumerate(unique_trans_grouped):
    real_order_mask = np.argsort(l[1])
    market_basket = l[0][real_order_mask]
    market_basket = np.array([k for k,g in itertools.groupby(market_basket)]) # collapse adjacent identic elements
    if market_basket.shape[0] == 1:
        continue
        
    if (1387 in market_basket) and (87 in market_basket):
        d.append(market_basket)

In [None]:
sc_mco = np.zeros((29000,29000))
for v in d:
    sc_mco = add_vect_to_mco(sc_mco, v)
print(sc_mco[2,9])
print(sc_mco[9,2])

In [None]:
d

In [None]:
mco[1387].argsort()[::-1][:20]

In [None]:
mco2 = np.load('mco.npy')

In [None]:
mco2[1387].argsort()[::-1][:20]

In [None]:
mco2[1387,43]

In [10]:
mco[1387].max() / sum(mco[1387])

0.13029197080291971

14385.0