In [None]:
#
# The MIT License (MIT)

# Copyright (c) 2021, NVIDIA CORPORATION

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

In [1]:
import pandas as pd
import numpy as np
import cudf
import os
from tqdm import tqdm
from pyarrow.parquet import ParquetFile
import pickle

In [2]:
DATA_PATH = '/workspace/coveo_task1_v4'
FOLDS = 5
MAX_SESSION_LENGTH = 30
MAX_LENGTH_FOR_F1 = 20
ROW_GROUP_SIZE = 10000
MAX_QUERIES_PER_SESSION = 10
MAX_QUERY_CLICKS_PER_SESSION = 10
QUERY_VECTOR_DIM = 50

In [3]:
UNFREQUENT_PRODUCT_URL_ID = 1 #Chose an arbitrary unfrequent product_url to replace unfrequent item ids

In [4]:
SEQ_COLS = ['product_url_hash_list', 'has_been_removed_from_cart-list',
       'has_been_added_to_cart-list', 'has_been_purchased-list',
       'has_been_detailed-list', 'event_type-list',
       'product_action_filled-list', 'nb_interactions-list',
       'category_hash-list', 'main_category-list', 'price_bucket-list',
       'mean_price_hierarchy-list', 'mean_price_main-list',
       'timestamp_hour_cos-list',
       'timestamp_hour_sin-list', 'timestamp_wd_sin-list',
       'timestamp_wd_cos-list', 'timestamp_age_days-list',
       'timestamp_age_days_norm-list'] + ['product_url_hash_list_unfreq']

In [5]:
SEARCH_COLS = ['flat_query_vector', 'flat_product_skus_hash', 
               'flat_clicked_skus_hash', 'impressions_size',  'nb_queries', 'clicks_size', 'clicked-flag' ]

In [7]:
# To check the id of product events and page views, which is important info for our preproc and training
event_type_df = pd.read_parquet(os.path.join(DATA_PATH, 'categories/unique.event_type.parquet'))
event_type_df

Unnamed: 0,event_type,event_type_count
0,,0
1,event_product,8351362
2,pageview,14878349
3,search,273792


In [8]:
EVENT_TYPE_PRODUCT = event_type_df[event_type_df['event_type'] == 'event_product'].index[0]
EVENT_TYPE_PRODUCT

1

In [9]:
product_url_mapping = pd.read_parquet(os.path.join(DATA_PATH, 'categories/unique.product_url_hash.parquet'))
product_url_mapping

Unnamed: 0,product_url_hash
0,
1,000030cd61db73a62d77d1bcb125d056cd1597002157a7...
2,000041873101672ca69cca675690d55359edb48a0646d0...
3,000082c13a39a85caafc0b435f37b8cccb3aa3b0c63d56...
4,0000afb33216a9250c473b0947ec868e56b835428a17d5...
...,...
508159,ffff30bd98877992cb2e4b7d2d74bc28c098b009e03ea8...
508160,ffff42b4e6f8f2ce03fef362723669d566f62d93339418...
508161,ffff8470067b395ca714a48f34ac523d32b4aefde1e9a9...
508162,ffffe31ef79fb330d6a6624a13f37dc623172b960aab79...


## Checking categorical features cardinalities

In [11]:
categories_mapping_path = os.path.join(DATA_PATH, 'categories')
for fname in os.listdir(categories_mapping_path):
    df = pd.read_parquet(os.path.join(categories_mapping_path, fname))
    print(fname, df.index.max()+1)

unique.product_action_filled.parquet 7
unique.price_bucket.parquet 11
unique.product_url_hash.parquet 508164
unique.hashed_url.parquet 510101
unique.main_category.parquet 10
mapping_product_sku_without_urls.parquet 60560
unique.session_id_hash.parquet 5266973
unique.event_type.parquet 4
unique.category_hash.parquet 174
unique.product_sku_hash.parquet 59752


## Frequency capping of encoded item ids

In [12]:
dfs = []
for dataset in ['train', 'valid', 'test']:
    for fold in range(1,6):
        path = os.path.join(DATA_PATH, f'{dataset}-{fold}.parquet')
        df = pd.read_parquet(path, columns=['product_url_hash_list'])
        df = df.explode('product_url_hash_list')
        dfs.append(df)
product_urls_concat = pd.concat(dfs, axis=0)        

In [14]:
product_urls_count = product_urls_concat.groupby('product_url_hash_list').size().sort_values(ascending=False)
product_urls_count

product_url_hash_list
134080    1076557
131003     396942
113524     184454
33255      182282
21375      121193
           ...   
271598          1
271599          1
271600          1
475986          1
1               1
Length: 467799, dtype: int64

In [15]:
product_urls_count[UNFREQUENT_PRODUCT_URL_ID]

1

In [16]:
product_urls_count.describe(percentiles=np.arange(0.0,1.02, 0.02))

count    4.677990e+05
mean     4.539761e+01
std      1.820554e+03
min      1.000000e+00
0%       1.000000e+00
2%       1.000000e+00
4%       1.000000e+00
6%       1.000000e+00
8%       1.000000e+00
10%      1.000000e+00
12%      1.000000e+00
14%      1.000000e+00
16%      1.000000e+00
18%      1.000000e+00
20%      1.000000e+00
22%      1.000000e+00
24%      1.000000e+00
26%      1.000000e+00
28%      1.000000e+00
30%      1.000000e+00
32%      1.000000e+00
34%      1.000000e+00
36%      1.000000e+00
38%      1.000000e+00
40%      1.000000e+00
42%      1.000000e+00
44%      1.000000e+00
46%      1.000000e+00
48%      1.000000e+00
50%      1.000000e+00
52%      1.000000e+00
54%      1.000000e+00
56%      1.000000e+00
58.0%    1.000000e+00
60%      2.000000e+00
62%      2.000000e+00
64%      2.000000e+00
66%      2.000000e+00
68%      2.000000e+00
70%      2.000000e+00
72%      3.000000e+00
74%      3.000000e+00
76%      4.000000e+00
78%      4.000000e+00
80%      5.000000e+00
82%      7

In [17]:
product_urls_min_freq_series = product_urls_count[product_urls_count >= 5]
product_urls_min_freq_series

product_url_hash_list
134080    1076557
131003     396942
113524     184454
33255      182282
21375      121193
           ...   
268379          5
493863          5
432394          5
142732          5
142960          5
Length: 101022, dtype: int64

In [18]:
product_urls_min_freq_set = set(product_urls_min_freq_series.index)
product_urls_min_freq_set

{262144,
 2,
 262148,
 7,
 11,
 262156,
 13,
 14,
 12,
 20,
 262169,
 262172,
 262176,
 262178,
 37,
 39,
 40,
 44,
 46,
 262192,
 262194,
 262196,
 54,
 262200,
 262201,
 262202,
 58,
 70,
 262217,
 262221,
 83,
 84,
 89,
 262241,
 262243,
 100,
 262245,
 262252,
 110,
 262258,
 262260,
 262264,
 124,
 262273,
 262275,
 262276,
 262277,
 262278,
 143,
 262296,
 155,
 262300,
 262302,
 159,
 158,
 262304,
 262305,
 262308,
 165,
 166,
 262312,
 262315,
 174,
 262319,
 262318,
 177,
 178,
 262325,
 262328,
 186,
 190,
 262335,
 193,
 195,
 205,
 262356,
 262357,
 262358,
 217,
 262369,
 262370,
 262371,
 231,
 262375,
 262377,
 262378,
 262380,
 262383,
 244,
 262390,
 253,
 256,
 262410,
 262415,
 272,
 273,
 262418,
 262419,
 262420,
 277,
 281,
 262429,
 286,
 262431,
 289,
 262434,
 262440,
 297,
 299,
 262444,
 262443,
 307,
 308,
 262454,
 318,
 262465,
 262473,
 333,
 336,
 346,
 348,
 262492,
 262494,
 355,
 262500,
 357,
 262502,
 262506,
 262507,
 362,
 368,
 262513,
 262518,


In [19]:
#Check if the new unfrequent product url id is not in the
assert UNFREQUENT_PRODUCT_URL_ID not in product_urls_min_freq_set

### Checking how many products have at least 5 interactions

In [23]:
categories_mapping = pd.read_parquet(os.path.join(DATA_PATH, 'categories/mapping_product_sku_without_urls.parquet'))

In [24]:
len(categories_mapping)

60560

In [25]:
len(categories_mapping.merge(product_urls_min_freq_series.to_frame('count'), left_on='encoded_product_sku', right_index=True,
                         how='inner'))

30158

## Processing validation set for training and evaluation

In [26]:
def replace_unfreq_items(df, item_id_seq_col):
    df[item_id_seq_col+"_unfreq"] = df[item_id_seq_col].apply(lambda seq: [2 if x in product_urls_min_freq_set else 1 for x in seq])
    df[item_id_seq_col] = df[item_id_seq_col].apply(lambda seq: [x if x in product_urls_min_freq_set else UNFREQUENT_PRODUCT_URL_ID for x in seq])    

In [27]:
def process_search_features(df, impressions_size_moments, clicks_size_moments):
    #replace_unfreq_items(df, 'flat_clicked_skus_hash')
    #Truncating
    df['flat_clicked_skus_hash_trunc'] = df['flat_clicked_skus_hash'].apply(lambda x: x[-MAX_QUERY_CLICKS_PER_SESSION:])   
    
    df['flat_query_vector_trunc'] = df['flat_query_vector'].apply(lambda x: x[-MAX_QUERIES_PER_SESSION*QUERY_VECTOR_DIM:])
    #Truncating and apply standardization to impressions_size and clicks_size
    df['impressions_size_trunc_norm'] = df['impressions_size'] \
            .apply(lambda x: [(y-impressions_size_moments['mean']) / impressions_size_moments['std'] for y in x[-MAX_QUERIES_PER_SESSION:]])
    df['clicks_size_trunc_norm'] = df['clicks_size'] \
            .apply(lambda x: [(y-clicks_size_moments['mean']) / clicks_size_moments['std'] for y in x[-MAX_QUERIES_PER_SESSION:]])
    return df

In [28]:
def keep_only_first_item_interaction(values):
    result = []
    for v in values:
        if v not in result:
            result.append(v)
    return result

In [29]:
def process_valid_sessions_for_train(path):
    df = pd.read_parquet(path)

    #Truncating validation sessions, keeping the end
    for col in SEQ_COLS:
        if col != 'product_url_hash_list_unfreq':
            df[col] = df[col].apply(lambda x: x[-MAX_SESSION_LENGTH:])

    return df

In [30]:
def process_valid_sessions_for_eval(path):
    df = pd.read_parquet(path)    
    #Filling nulls to make validation set compatible with train and test sets (temporary)
    #df['product_sku_hash_list'] = df['product_sku_hash_list'].apply(lambda d: d if d is not None else [])
    
    session_items_eval_df = pd.DataFrame(df[['product_url_hash_list', 'event_type-list']])
    # Reserving the last session items for validation (labels)
    sessions_product_skus = []
    for idx, row in session_items_eval_df[['product_url_hash_list', 'event_type-list']].iterrows():
        # Keeping only product sku events in the labels (removing page views form labels)
        session_product_skus = list(map(lambda y: y[0], filter(lambda x: x[1] == EVENT_TYPE_PRODUCT, 
                                              zip(row['product_url_hash_list'][-(len(row['product_url_hash_list'])//2):], 
                                                  row['event_type-list'][-(len(row['product_url_hash_list'])//2):]))))
        # Removing from the labels items that appear in the first half of the session (keep only the unseen ones), but if the label is an unfrequent item keeps it (as predictions will almost always be wrong in this case)
        # and also removing repeated items in the labels
        session_beginning = row['product_url_hash_list'][:-(len(row['product_url_hash_list'])//2)][-MAX_SESSION_LENGTH:]
        session_product_skus = keep_only_first_item_interaction([item for item in session_product_skus \
                      if (item not in session_beginning) or (item == UNFREQUENT_PRODUCT_URL_ID)]) \
                                [:MAX_LENGTH_FOR_F1]
        sessions_product_skus.append(session_product_skus)        
    session_items_eval_df['labels'] = sessions_product_skus
    del(session_items_eval_df['product_url_hash_list'])
    del(session_items_eval_df['event_type-list'])

    #Keeps in the validation set only those that have at least one label
    valid_labels_mask = session_items_eval_df['labels'].apply(len).astype(bool)
    session_items_eval_df = session_items_eval_df[valid_labels_mask]
    df = df[valid_labels_mask]
    

    replace_unfreq_items(df, 'product_url_hash_list')
    for col in SEQ_COLS:
        df[col] = df[col].apply(lambda x: x[:-(len(x)//2)][-MAX_SESSION_LENGTH:])

    return df, session_items_eval_df

In [31]:
def get_num_row_groups(path):
    return ParquetFile(path).num_row_groups

In [33]:
for fold in range(1,FOLDS+1):
    print(f"Fold {fold}")
    
    #Ensuring parquet files are split into more row groups
    train_path = os.path.join(DATA_PATH, f'train-{fold}.parquet')
    train_df = pd.read_parquet(train_path)
        
    impressions_size_moments = train_df[['impressions_size']].explode('impressions_size').agg(['mean', 'std'])['impressions_size'].to_dict()
    
    # Filling null clicks_size
    train_df.loc[train_df['clicks_size'].isna(), 'clicks_size'] = train_df[train_df['clicks_size'].isna()]['impressions_size']
    clicks_size_moments = train_df[['clicks_size']].explode('clicks_size').agg(['mean', 'std'])['clicks_size'].to_dict()    
    print('Train set - impressions_size_moments', impressions_size_moments, ' - clicks_size_moments', clicks_size_moments)
    
    replace_unfreq_items(train_df, 'product_url_hash_list')
    train_df = process_search_features(train_df, impressions_size_moments, clicks_size_moments)

    train_df.to_parquet(train_path.replace('train-', 'train-freqcap-'), row_group_size=ROW_GROUP_SIZE)    
    print("- train (row groups)", get_num_row_groups(train_path))
    test_path = os.path.join(DATA_PATH, f'test-{fold}.parquet')
    test_df = pd.read_parquet(test_path)
    # Filling null clicks_size
    test_df.loc[test_df['clicks_size'].isna(), 'clicks_size'] = test_df[test_df['clicks_size'].isna()]['impressions_size']
    replace_unfreq_items(test_df, 'product_url_hash_list')
    test_df = process_search_features(test_df, impressions_size_moments, clicks_size_moments)
    test_df.to_parquet(test_path.replace('test-', 'test-freqcap-'), row_group_size=ROW_GROUP_SIZE)
    print("- test (row groups)", get_num_row_groups(test_path))
    
    #Processing validation set
    # - valid-train: truncate the begining of sessions up to length 30
    # - valid-eval: keeps the first half of the session for inference
    # - valid-eval-labels keeps the second half of the session (only product skus -> labels)
    valid_path = os.path.join(DATA_PATH, f'valid-{fold}.parquet')    
    valid_sessions_for_train_df = process_valid_sessions_for_train(valid_path)
    # Filling null clicks_size
    valid_sessions_for_train_df.loc[valid_sessions_for_train_df['clicks_size'].isna(), 'clicks_size'] = valid_sessions_for_train_df[valid_sessions_for_train_df['clicks_size'].isna()]['impressions_size']        
    valid_sessions_for_train_df = process_search_features(valid_sessions_for_train_df, impressions_size_moments, clicks_size_moments)
    replace_unfreq_items(valid_sessions_for_train_df, 'product_url_hash_list')
    #Just to ensure the dtypes are the same for all datasets
    #sessions_for_train_cdf['product_sku_hash_count'] = sessions_for_train_cdf['product_sku_hash_count'].astype('int32')
    valid_train_path = os.path.join(DATA_PATH, f"valid-train-freqcap-{fold}.parquet")
    valid_sessions_for_train_df.to_parquet(valid_train_path, row_group_size=ROW_GROUP_SIZE)
    print("- valid-train (row groups)", get_num_row_groups(valid_train_path))
    
    valid_sessions_for_eval_df, valid_sessions_for_eval_label_df = process_valid_sessions_for_eval(valid_path)
    # Filling null clicks_size
    valid_sessions_for_eval_df.loc[valid_sessions_for_eval_df['clicks_size'].isna(), 'clicks_size'] = valid_sessions_for_eval_df[valid_sessions_for_eval_df['clicks_size'].isna()]['impressions_size']
    valid_sessions_for_eval_df = process_search_features(valid_sessions_for_eval_df, impressions_size_moments, clicks_size_moments)
    valid_sessions_for_eval_df.to_parquet(os.path.join(DATA_PATH, f"valid-eval-freqcap-{fold}.parquet"), row_group_size=ROW_GROUP_SIZE)
    valid_sessions_for_eval_label_df.to_parquet(os.path.join(DATA_PATH, f"valid-eval-labels-freqcap-{fold}.parquet"), row_group_size=ROW_GROUP_SIZE)

Fold 1
Train set - impressions_size_moments {'mean': 2.709584751920021, 'std': 7.352337650808475}  - clicks_size_moments {'mean': 0.10859004928207908, 'std': 1.5545850842091755}
- train (row groups) 56
- test (row groups) 7
- valid-train (row groups) 12
Fold 2
Train set - impressions_size_moments {'mean': 2.7371175847315548, 'std': 7.38135499810115}  - clicks_size_moments {'mean': 0.11366166391716627, 'std': 1.572499360648172}
- train (row groups) 56
- test (row groups) 7
- valid-train (row groups) 12
Fold 3
Train set - impressions_size_moments {'mean': 2.758913523712323, 'std': 7.4101361314168015}  - clicks_size_moments {'mean': 0.11233557822078681, 'std': 1.2762469689320457}
- train (row groups) 56
- test (row groups) 7
- valid-train (row groups) 12
Fold 4
Train set - impressions_size_moments {'mean': 2.7527935422468186, 'std': 7.400888641299158}  - clicks_size_moments {'mean': 0.11178701097274188, 'std': 1.2905269307050415}
- train (row groups) 56
- test (row groups) 7
- valid-train

In [34]:
len(valid_sessions_for_train_df)

115004

In [35]:
len(valid_sessions_for_eval_df)

61459

In [36]:
valid_sessions_for_train_df['product_url_hash_list'].apply(lambda x: set(x) == set([UNFREQUENT_PRODUCT_URL_ID])).mean()

0.0009043163716044659

In [37]:
valid_sessions_for_eval_df['product_url_hash_list'].apply(lambda x: set(x) == set([UNFREQUENT_PRODUCT_URL_ID])).mean()

0.000894905546787289

In [38]:
valid_sessions_for_eval_df['product_url_hash_list'].apply(lambda x: UNFREQUENT_PRODUCT_URL_ID in x).mean()

0.05496347158268114

In [39]:
valid_sessions_for_eval_label_df['labels'].apply(lambda x: UNFREQUENT_PRODUCT_URL_ID in x).mean()

0.0

In [40]:
valid_sessions_for_train_df['product_url_hash_list'].apply(len).describe()

count    115004.000000
mean          5.736009
std           5.699338
min           2.000000
25%           2.000000
50%           4.000000
75%           7.000000
max          30.000000
Name: product_url_hash_list, dtype: float64

In [41]:
valid_sessions_for_eval_df['product_url_hash_list'].apply(len).describe()

count    61459.000000
mean         4.044078
std          4.209953
min          1.000000
25%          1.000000
50%          3.000000
75%          5.000000
max         30.000000
Name: product_url_hash_list, dtype: float64

In [42]:
valid_sessions_for_eval_label_df['labels'].apply(len).describe()

count    61459.000000
mean         1.697896
std          1.573882
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         20.000000
Name: labels, dtype: float64