# ***Please use Kaggle Env to Load this notebook*** 
* https://www.kaggle.com/code/gambitwister/co-visitation-matrix-model
# ***Please add the following dataset:***
* https://www.kaggle.com/datasets/columbia2131/otto-chunk-data-inparquet-format

# Introduction
This notebook shows how to build three co-visitation matrices: the click/cart/order to cart/order matrix, the cart/order to cart/order matrix, and the click/cart/order to click matrix.

One thing to note is that this notebook uses RAPIDS (cuDF) to speed up the calculation of the matrices.

And the dataset is the chunk data in parquet format instead of the jsonl files. The aim for this is to decrease the size of the dataset and generate a more readable one.

As a result, several ".pqt" files are generated as the output, which together represent the three matrices.

To see the prediction part, please go to this notebook: https://www.kaggle.com/code/gambitwister/co-visitation-matrix-pred

Let's start :)

In [1]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools

#### Use RAPIDs to speed up the Co-visitation Matrix calculation:
#### Remember to use the GPU accelerator.

In [2]:
import cudf
cudf.__version__



'23.06.01'

#### Load the train and test files: 
##### modified from https://www.kaggle.com/code/cdeotte/candidate-rerank-model-lb-0-575/notebook

In [3]:
# Every parquet chunk found under the dataset's *_parquet directories.
files = glob.glob('../input/otto-chunk-data-inparquet-format/*_parquet/*')
# Integer code used for each event type.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

# Pre-process every chunk once on the host and keep it in memory, keyed by
# its path, so the matrix passes below never have to re-read the disk.
data_cache = {}
for path in files:
    frame = pd.read_parquet(path)
    # timestamps arrive in milliseconds; store them as int32 seconds
    frame['ts'] = (frame['ts'] / 1000).astype('int32')
    # replace the event-type names by their compact int8 codes
    frame['type'] = frame['type'].map(type_labels).astype('int8')
    data_cache[path] = frame

# INTERVAL is the number of files that are concatenated and processed together
INTERVAL = 5
# CHUNK is the number of files handled in one of the 6 steps
# (integer ceiling of len(files) / 6)
CHUNK = -(-len(files) // 6)

def read_file(f):
  """Return the cached chunk stored under path ``f`` as a cuDF DataFrame.

  The chunk is looked up in ``data_cache``; a path that was never cached
  raises ``KeyError``.
  """
  cached_frame = data_cache[f]
  return cudf.DataFrame(cached_frame)

# Weight given to a co-visited pair, keyed by the event-type code of the
# second item (codes as defined in type_labels: 0=clicks, 1=carts, 2=orders).
type_weight = {
    0: 1,
    1: 6,
    2: 3,
}
# Number of aid ranges each matrix is split into.
DISK_PIECES = 4
# Width of the aid range covered by one part.
SIZE = 1.86e6 / DISK_PIECES

DISK_PIECES is the number of parts each matrix is split into, and SIZE is the width of the aid range handled by one part (1.86e6 divided by DISK_PIECES).

# click/cart/order to cart/order Co-visitation Matrix:
* this part will generate 4 files: clicks_carts_orders_0.pqt to clicks_carts_orders_3.pqt

In [4]:
# Build the click/cart/order -> cart/order co-visitation matrix.
# The aid_x axis is split into DISK_PIECES ranges of width SIZE; each range
# ("part") is computed in its own pass over all files and saved separately.
for part in range(DISK_PIECES):
  # running total over all steps of this part
  temp2 = None
  # CHUNK is sized so that 6 steps cover every file
  for step in range(6):
    start_file_idx = step * CHUNK
    end_file_idx = min( (step+1)*CHUNK, len(files) )
    # a step that starts past the last file has nothing to process
    if start_file_idx >= end_file_idx:
      continue

    # running total over all batches of this step
    temp1 = None
    for idx in range(start_file_idx, end_file_idx, INTERVAL):
      # load files[idx], files[idx+1], ... (at most INTERVAL distinct files,
      # never past the end of this step) and stack them into one frame
      batch = files[idx:min(idx+INTERVAL, end_file_idx)]
      df = cudf.concat([read_file(path) for path in batch],ignore_index=True,axis=0)
      # sort the dataframe by the session-index (ascending) and ts (descending)
      df = df.sort_values(['session','ts'],ascending=[True,False])
      df = df.reset_index(drop=True)
      df['n'] = df.groupby('session').cumcount()
      # keep only the 20 most recent events of every session
      df = df.loc[df.n<20].drop('n',axis=1)

      # pair every event with every other event of the same session
      df = df.merge(df, on='session')
      # keep pairs of different aids whose events are less than 1 day apart
      df = df.loc[((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y)]
      # restrict aid_x to this part's range [part*SIZE, (part+1)*SIZE)
      df = df.loc[(df.aid_x >= part*SIZE)&(df.aid_x < (part+1)*SIZE)]
      # count each (aid_x, aid_y) pair at most once per session
      df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])

      # weight each pair by the event type of the second item
      df['weight'] = df.type_y.map(type_weight)

      df = df[['aid_x','aid_y','weight']]
      df.weight = df.weight.astype('float32')
      df = df.groupby(['aid_x', 'aid_y']).weight.sum()
      # fill_value=0 keeps pairs that appear in only one of the two operands
      temp1 = df if temp1 is None else temp1.add(df, fill_value=0)
    # temp1 now holds the summed weights of every file of this step;
    # fold it into the total for this part
    temp2 = temp1 if temp2 is None else temp2.add(temp1, fill_value=0)
    del temp1, df
    gc.collect()

  if temp2 is None:
    raise RuntimeError('no input files were processed, so there is no matrix to save')

  # turn the (aid_x, aid_y) -> weight series back into a flat table
  temp2 = temp2.reset_index()
  # order by ascending aid_x and, within each aid_x, by descending weight
  temp2 = temp2.sort_values(['aid_x', 'weight'], ascending=[True, False])
  temp2 = temp2.reset_index(drop=True)
  # keep only the 15 highest-weighted aid_y for every aid_x
  temp2['n'] = temp2.groupby('aid_x').aid_y.cumcount()
  temp2 = temp2.loc[temp2.n < 15].drop('n', axis=1)
  # save this part of the matrix to clicks_carts_orders_{part}.pqt
  temp2.to_pandas().to_parquet(f'clicks_carts_orders_{part}.pqt')

# cart/order to cart/order Co-visitation matrix:
* this part will generate one file: carts_orders_0.pqt

In [5]:
# Build the cart/order -> cart/order co-visitation matrix.
# Only one part is used here, so a single pass covers the whole aid_x range.
DISK_PIECES = 1
SIZE = 1.86e6/DISK_PIECES

for part in range(DISK_PIECES):
  # running total over all steps of this part
  temp2 = None
  # CHUNK is sized so that 6 steps cover every file
  for step in range(6):
    start_file_idx = step * CHUNK
    end_file_idx = min( (step+1)*CHUNK, len(files) )
    # a step that starts past the last file has nothing to process
    if start_file_idx >= end_file_idx:
      continue

    # running total over all batches of this step
    temp1 = None
    for idx in range(start_file_idx, end_file_idx, INTERVAL):
      # load files[idx], files[idx+1], ... (at most INTERVAL distinct files,
      # never past the end of this step) and stack them into one frame
      batch = files[idx:min(idx+INTERVAL, end_file_idx)]
      df = cudf.concat([read_file(path) for path in batch],ignore_index=True,axis=0)
      # only keep the "cart" (1) and "order" (2) events
      df = df.loc[df['type'].isin([1,2])]
      # sort the dataframe by the session-index (ascending) and ts (descending)
      df = df.sort_values(['session','ts'],ascending=[True,False])
      df = df.reset_index(drop=True)
      df['n'] = df.groupby('session').cumcount()
      # keep only the 20 most recent events of every session
      df = df.loc[df.n<20].drop('n',axis=1)

      # pair every event with every other event of the same session
      df = df.merge(df, on='session')
      # keep pairs of different aids whose events are less than 14 days apart
      df = df.loc[((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y)]
      # restrict aid_x to this part's range [part*SIZE, (part+1)*SIZE)
      df = df.loc[(df.aid_x >= part*SIZE)&(df.aid_x < (part+1)*SIZE)]
      # count each (aid_x, aid_y) pair at most once per session
      df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])

      # every pair counts the same, regardless of event type
      df['weight'] = 1

      df = df[['aid_x','aid_y','weight']]
      df.weight = df.weight.astype('float32')
      df = df.groupby(['aid_x', 'aid_y']).weight.sum()
      # fill_value=0 keeps pairs that appear in only one of the two operands
      temp1 = df if temp1 is None else temp1.add(df, fill_value=0)
    # temp1 now holds the summed weights of every file of this step;
    # fold it into the total for this part
    temp2 = temp1 if temp2 is None else temp2.add(temp1, fill_value=0)
    del temp1, df
    gc.collect()

  if temp2 is None:
    raise RuntimeError('no input files were processed, so there is no matrix to save')

  # turn the (aid_x, aid_y) -> weight series back into a flat table
  temp2 = temp2.reset_index()
  # order by ascending aid_x and, within each aid_x, by descending weight
  temp2 = temp2.sort_values(['aid_x', 'weight'], ascending=[True, False])
  temp2 = temp2.reset_index(drop=True)
  # keep only the 15 highest-weighted aid_y for every aid_x
  temp2['n'] = temp2.groupby('aid_x').aid_y.cumcount()
  temp2 = temp2.loc[temp2.n < 15].drop('n', axis=1)
  # save this part of the matrix to carts_orders_{part}.pqt
  temp2.to_pandas().to_parquet(f'carts_orders_{part}.pqt')

# click/cart/order to click Co-visitation matrix:
* this part will generate 4 files: clicks_0.pqt to clicks_3.pqt

In [6]:
# Build the click/cart/order -> click co-visitation matrix.
# The aid_x axis is split into DISK_PIECES ranges of width SIZE; each range
# ("part") is computed in its own pass over all files and saved separately.
DISK_PIECES = 4
SIZE = 1.86e6/DISK_PIECES

for part in range(DISK_PIECES):
  # running total over all steps of this part
  temp2 = None
  # CHUNK is sized so that 6 steps cover every file
  for step in range(6):
    start_file_idx = step * CHUNK
    end_file_idx = min( (step+1)*CHUNK, len(files) )
    # a step that starts past the last file has nothing to process
    if start_file_idx >= end_file_idx:
      continue

    # running total over all batches of this step
    temp1 = None
    for idx in range(start_file_idx, end_file_idx, INTERVAL):
      # load files[idx], files[idx+1], ... (at most INTERVAL distinct files,
      # never past the end of this step) and stack them into one frame
      batch = files[idx:min(idx+INTERVAL, end_file_idx)]
      df = cudf.concat([read_file(path) for path in batch],ignore_index=True,axis=0)
      # sort the dataframe by the session-index (ascending) and ts (descending)
      df = df.sort_values(['session','ts'],ascending=[True,False])
      df = df.reset_index(drop=True)
      df['n'] = df.groupby('session').cumcount()
      # keep only the 20 most recent events of every session
      df = df.loc[df.n<20].drop('n',axis=1)

      # pair every event with every other event of the same session
      df = df.merge(df, on='session')
      # keep pairs of different aids whose events are less than 1 day apart
      df = df.loc[((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y)]
      # restrict aid_x to this part's range [part*SIZE, (part+1)*SIZE)
      df = df.loc[(df.aid_x >= part*SIZE)&(df.aid_x < (part+1)*SIZE)]
      # count each (aid_x, aid_y) pair at most once per session
      df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])

      # time-based weight: grows linearly with ts_x, from 1 at ts 1659304800
      # to 4 at ts 1662328791 (presumably the first and last timestamps of
      # the data, in seconds -- TODO confirm)
      df['weight'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)

      df = df[['aid_x','aid_y','weight']]
      df.weight = df.weight.astype('float32')
      df = df.groupby(['aid_x', 'aid_y']).weight.sum()
      # fill_value=0 keeps pairs that appear in only one of the two operands
      temp1 = df if temp1 is None else temp1.add(df, fill_value=0)
    # temp1 now holds the summed weights of every file of this step;
    # fold it into the total for this part
    temp2 = temp1 if temp2 is None else temp2.add(temp1, fill_value=0)
    del temp1, df
    gc.collect()

  if temp2 is None:
    raise RuntimeError('no input files were processed, so there is no matrix to save')

  # turn the (aid_x, aid_y) -> weight series back into a flat table
  temp2 = temp2.reset_index()
  # order by ascending aid_x and, within each aid_x, by descending weight
  temp2 = temp2.sort_values(['aid_x', 'weight'], ascending=[True, False])
  temp2 = temp2.reset_index(drop=True)
  # keep only the 20 highest-weighted aid_y for every aid_x
  temp2['n'] = temp2.groupby('aid_x').aid_y.cumcount()
  temp2 = temp2.loc[temp2.n < 20].drop('n', axis=1)
  # save this part of the matrix to clicks_{part}.pqt
  temp2.to_pandas().to_parquet(f'clicks_{part}.pqt')

In [8]:
# Drop the last matrix and the host-side chunk cache, then ask the garbage
# collector to reclaim the memory.
del data_cache
del temp2
gc.collect()

52

# Finally, I've uploaded the output files as a data set in kaggle:
https://www.kaggle.com/datasets/gambitwister/co-visitation-matrix-9417

# Please go to this notebook to see the prediction part:
https://www.kaggle.com/code/gambitwister/co-visitation-matrix-pred