### Import libraries

In [1]:
%%time
import os
import sys
import copy
from datetime import datetime
import gc
import pickle as pkl
import shelve
import warnings

import pandas as pd
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
import numpy as np
    
from hnm_utils.load_data import  load_articles, load_customers, load_transactions, load_submission
from hnm_utils import cv
from hnm_utils import fe
from hnm_utils import modeling
from hnm_utils import candidates_retrieve
from config import sub_params, cv_params

CPU times: total: 2.12 s
Wall time: 11.7 s


In [2]:
# All three input CSVs are read from the same directory.
DATA_DIR = 'reduced_memory_data'

customers_path = f'{DATA_DIR}/customers.csv'
articles_path = f'{DATA_DIR}/articles.csv'
transactions_path = f'{DATA_DIR}/transactions_train.csv'

### Load and convert data

In [3]:
%%time
# Read the three input tables with the project loaders.
transactions_df = load_transactions(transactions_path)
articles_df = load_articles(articles_path)
customers_df = load_customers(customers_path)

# Order matters: 'week_number' is derived from the loaded 't_dat' column, and only
# afterwards is 't_dat' itself overwritten with the output of fe.cal_day_numbers.
# Swapping these two lines would feed the converted values into cal_week_numbers.
transactions_df['week_number'] = fe.cal_week_numbers(transactions_df['t_dat'])
transactions_df['t_dat'] = fe.cal_day_numbers(transactions_df['t_dat'])

CPU times: total: 36.6 s
Wall time: 38.1 s


### Get item pairs

In [4]:
%%time
pairs_per_item = 5

# Weeks for which item pairs are required (96..104 inclusive). generate_candidates
# looks up week_number_pairs[label_week - 1], so every week used there must be present.
pair_weeks = range(96, 105)
pairs_cache_path = 'week_number_pairs.pkl'

week_number_pairs = {}
if os.path.exists(pairs_cache_path):
    # NOTE(security): pickle.load can execute arbitrary code. Only load a cache
    # file that this notebook wrote itself; never one from an untrusted source.
    with open(pairs_cache_path, 'rb') as f:
        week_number_pairs = pkl.load(f)

# A cache written with a shorter week list would otherwise be used as-is and fail
# later with a KeyError, so compute whatever is still missing.
# NOTE(review): the cache is not keyed on pairs_per_item; delete the file after
# changing that value, otherwise pairs built with the old value are reused.
missing_weeks = [week for week in pair_weeks if week not in week_number_pairs]
for week_number in missing_weeks:
    print(f"Creating pairs for week number {week_number}")
    week_number_pairs[week_number] = fe.create_pairs(transactions_df, week_number, pairs_per_item)

# Only rewrite the cache when something new was computed.
if missing_weeks:
    with open(pairs_cache_path, 'wb') as f:
        pkl.dump(week_number_pairs, f)

CPU times: total: 156 ms
Wall time: 177 ms


### Main retrieval/features function

In [5]:
def generate_candidates(t, c, a, customer_batch=None, **kwargs):
    """Build candidate (customer, article) rows and their features for one label week.

    The transactions are split into a feature part and a label part with
    ``cv.feature_label_split``. Candidates from several retrieval strategies are
    concatenated, de-duplicated and filtered, and the features produced along the
    way are parked in a temporary on-disk shelve ("features_db") before being
    joined onto the candidates.

    Parameters
    ----------
    t : DataFrame
        Transactions. The feature part returned by ``cv.feature_label_split``
        must carry ``t_dat`` and ``week_number`` columns; both are re-based so
        that their maximum becomes 0.
    c : DataFrame
        Customers, passed through to the candidate/feature helpers.
    a : DataFrame
        Articles. Must have an ``article_id`` column and every column listed in
        ``kwargs["article_columns"]``.
    customer_batch : optional
        Customer ids to restrict candidates to. Only used when the label part is
        empty; otherwise the customers present in the labels are used.
    **kwargs
        Required keys: ``label_week``, ``feature_periods``, ``ca_num_weeks``,
        ``clw_num_weeks``, ``clw_num_pair_weeks``, ``pa_num_weeks``, ``hier_col``,
        ``num_recent_candidates``, ``num_recent_articles``, ``num_age_buckets``,
        ``hier_cols``, ``lag_days``, ``cv``, ``article_columns`` and
        ``selected_features`` (may be ``None``). The whole mapping is also
        forwarded to ``candidates_retrieve.filter_candidates``.

    Returns
    -------
    tuple
        ``(cand_with_f_df, label_df)``: the candidates with features attached and
        the label part of the split.

    Raises
    ------
    KeyError
        If a required ``kwargs`` key is missing, or the module-level
        ``week_number_pairs`` has no entry for ``label_week - 1``.
    AssertionError
        If attaching features changed the number of candidate rows.

    Notes
    -----
    Reads the module-level ``week_number_pairs`` mapping. The shelve files live
    in the current working directory under a fixed name, so two concurrent calls
    in the same directory would clobber each other.
    """
    features_df, label_df = cv.feature_label_split(
        t, kwargs["label_week"], kwargs["feature_periods"]
    )

    # converting relative day_number (latest day/week in the feature window becomes 0)
    features_df["t_dat"] = features_df["t_dat"] - features_df['t_dat'].max()
    features_df["week_number"] = features_df["week_number"] - features_df['week_number'].max()

    # pull out the cv week: pairs are taken for the week before the label week
    article_pairs_df = week_number_pairs[kwargs["label_week"]-1]

    # check if we can limit customers
    if len(label_df) > 0:
        customers = label_df["customer_id"].unique()
    elif customer_batch is not None:
        customers = customer_batch
    else:
        customers = None

    ############################################
    # creating candidates (and adding features)
    ###########################################

    # flag="n" always starts from an empty database, so features left behind by an
    # interrupted earlier run can never be joined onto this run's candidates.
    features_db = shelve.open("features_db", flag="n")
    try:
        # Uses the `a` argument (not the notebook-level articles frame) so this
        # call agrees with every other helper call in the function.
        (candidates_with_same_product_code,
        features_db["candidates_same_product_code"]) = candidates_retrieve.create_candidates_with_same_product_code(
            features_df,
            a,
            week_length=3,
            customers=customers,
            )

        # creating candidate (and saving features created)
        recent_customer_cand, features_db["customer_article"] = (
            candidates_retrieve.create_recent_customer_candidates(
                features_df,
                kwargs["ca_num_weeks"],
                customers=customers,
            )
        )

        (cust_last_week_cand,
         cust_last_week_pair_cand,
         features_db["clw"],
         features_db["clw_pairs"]) = candidates_retrieve.create_last_customer_weeks_and_pairs(
            features_df,
            article_pairs_df,
            kwargs["clw_num_weeks"],
            kwargs["clw_num_pair_weeks"],
            customers=customers,
        )

        popular_candidates, features_db["popular_articles"] = candidates_retrieve.create_popular_article_cand(
            features_df,
            c,
            a,
            kwargs["pa_num_weeks"],
            kwargs["hier_col"],
            num_candidates=kwargs["num_recent_candidates"],
            num_articles=kwargs["num_recent_articles"],
            customers=customers,
        )
        # the two extra return values of the age-bucket helper are not used here
        age_bucket_can, _, _ = candidates_retrieve.create_age_bucket_candidates(
            features_df,
            c,
            kwargs["num_age_buckets"],
            articles=kwargs["num_recent_articles"],
            customers=customers,
        )

        cand = [recent_customer_cand, cust_last_week_cand, cust_last_week_pair_cand, age_bucket_can, popular_candidates, candidates_with_same_product_code]
        cand = pd.concat(cand).drop_duplicates()
        cand = cand.sort_values(["customer_id", "article_id"]).reset_index(drop=True)

        # free the per-strategy frames; only the combined frame is needed from here on
        del recent_customer_cand, cust_last_week_cand, cust_last_week_pair_cand, age_bucket_can, popular_candidates, candidates_with_same_product_code

        cand = candidates_retrieve.filter_candidates(cand, t, **kwargs)

        # creating other features (each helper receives features_db to store its output)
        fe.create_cust_hier_features(features_df, a, kwargs["hier_cols"], features_db)
        fe.create_price_features(features_df, features_db)
        fe.create_cust_features(c, features_db)
        fe.create_article_cust_features(features_df, c, features_db)
        fe.create_lag_features(features_df, a, kwargs["lag_days"], features_db)
        fe.create_rebuy_features(features_df, features_db)
        fe.create_cust_t_features(features_df, a, features_db)
        fe.create_art_t_features(features_df, features_db)

        del features_df

        # another filter at the end, for the ones that didn't get filtered earlier
        if customers is not None:
            cand = cand[cand["customer_id"].isin(customers)]

        # report on recall/precision of candidates
        if kwargs["cv"]:
            ground_truth_candidates = label_df[["customer_id", "article_id"]].drop_duplicates()
            cv.report_candidates(cand, ground_truth_candidates)
            del ground_truth_candidates

        # adding features to candidates
        cand_with_f_df = candidates_retrieve.add_features_to_candidates(
            cand, features_db, c, a
        )
    finally:
        # Always close and delete the temporary shelve, even when a step above
        # failed. Which files exist depends on the dbm backend shelve picked
        # (dbm.dumb writes .bak/.dir/.dat; others use the bare name, .db, or
        # .dir/.pag), so remove whichever of them are present.
        features_db.close()
        for suffix in ("", ".bak", ".dir", ".dat", ".db", ".pag"):
            shelve_file = "features_db" + suffix
            if os.path.exists(shelve_file):
                os.remove(shelve_file)

    # manually adding article features (couldn't use shelve for some reason)
    for article_col in kwargs["article_columns"]:
        art_col_map = a.set_index("article_id")[article_col]
        cand_with_f_df[article_col] = cand_with_f_df["article_id"].map(art_col_map)

    # limiting features
    if kwargs["selected_features"] is not None:
        cand_with_f_df = cand_with_f_df[
            ["customer_id", "article_id"] + kwargs["selected_features"]
        ]

    # a row-count change means one of the feature frames had duplicate join keys
    assert len(cand) == len(cand_with_f_df), "seem to have duplicates in the feature dfs"
    del cand

    return cand_with_f_df, label_df

In [6]:
def report_model_performance(ids_df, preds, truth_df):
    """Score model output against the ground truth.

    ``ids_df`` and ``preds`` are turned into per-customer predictions by
    ``modeling.create_predictions``; ``truth_df`` is turned into per-customer
    true labels by ``cv.ground_truth``. Returns the value of ``cv.cal_mapk``
    for the two, rounded to 5 decimal places.
    """
    model_predictions = modeling.create_predictions(ids_df, preds)
    truth_by_customer = cv.ground_truth(truth_df).set_index("customer_id")["prediction"]
    return round(cv.cal_mapk(truth_by_customer, model_predictions), 5)

### Cross validation

In [7]:
%%time
# Week(s) to cross-validate on; the remaining settings come from config.cv_params.
cv_weeks = [104]

results = modeling.run_all_cvs(
    transactions_df,
    customers_df,
    articles_df,
    generate_candidates,
    report_model_performance,
    cv_weeks=cv_weeks,
    **cv_params,
)

preparing training modeling dfs for 103...
######################### Retrieval Report ##############################
Number of hit candidates: 18793
Number of actual articles: 214824
Number of candidates: 2917628
Precision: 0.006441191269072
Recall: 0.08748091460916843
########################################################################
preparing evaluation modeling dfs...
######################### Retrieval Report ##############################
Number of hit candidates: 17836
Number of actual articles: 189896
Number of candidates: 2425844
Precision: 0.007352492575779811
Recall: 0.09392509584193454
########################################################################
[LightGBM] [Info] Total groups: 14066, total data: 765898
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.128995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8035
[LightGBM] [Info] Number of data points in the train set: 76589

### Train

In [8]:
%%time
# Release whatever the previous cells left behind before the training run.
gc.collect()

# Train for the submission, configured by config.sub_params.
modeling.full_sub_train_run(
    transactions_df,
    customers_df,
    articles_df,
    generate_candidates,
    report_model_performance,
    **sub_params,
)

preparing training modeling dfs for 104...
preparing training modeling dfs for 103...
preparing training modeling dfs for 102...
preparing training modeling dfs for 101...
preparing training modeling dfs for 100...
concatenating all weeks together
[LightGBM] [Info] Total groups: 73707, total data: 3931373
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.268251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8108
[LightGBM] [Info] Number of data points in the train set: 3931373, number of used features: 45
[10]	train's map@12: 0.262413	train's ndcg@12: 0.352122
[20]	train's map@12: 0.26937	train's ndcg@12: 0.360169
[30]	train's map@12: 0.274092	train's ndcg@12: 0.365779
[40]	train's map@12: 0.277274	train's ndcg@12: 0.369337
[50]	train's map@12: 0.280009	train's ndcg@12: 0.372759
[60]	train's map@12: 0.283167	train's ndcg@12: 0.376047
[70

### Predict

In [9]:
%%time
# Generate candidates and predict for the submission, configured by config.sub_params.
predictions = modeling.full_sub_predict_run(
    transactions_df, customers_df, articles_df,
    generate_candidates,
    **sub_params,
)

Generating candidates/features for batch #1 of 3
candidate/features shape of batch: (13,734,598, 45)
predicting with 'model_104'
predicting with 'model_105'
Generating candidates/features for batch #2 of 3
candidate/features shape of batch: (13,740,094, 45)
predicting with 'model_104'
predicting with 'model_105'
Generating candidates/features for batch #3 of 3
candidate/features shape of batch: (13,715,746, 45)
predicting with 'model_104'
predicting with 'model_105'


In [None]:
# Create the output directory first: without it open() raises FileNotFoundError
# and the predictions from the long-running cell above would not be saved.
os.makedirs('predictions', exist_ok=True)

# One file per calendar day; re-running on the same day overwrites that day's file.
with open(f'predictions/predictions_{datetime.now().date()}.pkl', 'wb') as f:
    pkl.dump(predictions, f)

### Make submission

In [None]:
# Build the submission frame in one chain:
#   1. add an int64 join key parsed from the last 16 hex characters of customer_id,
#   2. left-join the predictions on that key (their customer_id is renamed to match),
#   3. blank out missing values,
#   4. drop the sample file's placeholder 'prediction' column and the join key,
#   5. expose the joined 'article_id' column as the new 'prediction'.
sub = (
    load_submission('raw_data/sample_submission.csv')
    .assign(
        customer_id_2=lambda frame: frame['customer_id']
        .str[-16:]
        .apply(lambda x: int(x, 16))
        .astype('int64')
    )
    .merge(
        predictions.reset_index().rename({'customer_id': 'customer_id_2'}, axis=1),
        on='customer_id_2',
        how='left',
    )
    .fillna('')
    .drop(['prediction', 'customer_id_2'], axis=1)
    .rename({'article_id': 'prediction'}, axis=1)
)

In [None]:
def remove_duplicates(orig_list: list) -> list:
    """Return the items of ``orig_list`` with repeats dropped, first occurrence kept.

    Items must be hashable. Any iterable is accepted; an empty one gives ``[]``.
    """
    # dict keys are unique and remember insertion order, so this keeps the first
    # occurrence of each item in its original position.
    return list(dict.fromkeys(orig_list))

# Per customer: drop repeated article ids (keeping order), keep the first 12,
# and join them into one space-separated string with "0" prefixed to each id
# (presumably restoring a leading zero lost when ids were stored as integers - confirm).
sub["prediction"] = sub["prediction"].apply(
    lambda article_ids: " ".join(
        "0" + str(article_id) for article_id in remove_duplicates(article_ids)[:12]
    )
)

In [None]:
# Create the output directory first; to_csv raises if 'submissions/' is missing.
os.makedirs('submissions', exist_ok=True)

# One file per calendar day; the index is not written.
sub.to_csv(f'submissions/submission_{datetime.now().date()}.csv', index=False)