# Importing & Defining Utility Functions

In [None]:
!pip install -q --no-deps ../input/tfrs-whl-installation/scann-1.2.3-cp37-cp37m-manylinux2014_x86_64.whl
!pip install -q --no-deps ../input/tfrs-whl-installation/tensorflow_recommenders-0.6.0-py3-none-any.whl

In [None]:
import os
import sys
# SECURITY: the W&B API key must never be committed together with the notebook.
# Supply it from outside the notebook through a pre-set WANDB_API_KEY environment
# variable. The key that used to be hardcoded on this line has been published with
# the notebook and should be revoked.
if not os.environ.get("WANDB_API_KEY"):
    print("WARNING : WANDB_API_KEY is not set - wandb logging will not be authenticated")
# let TensorFlow allocate GPU memory on demand instead of grabbing it all up front
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
# reduce TensorFlow C++ log verbosity
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import wandb
from wandb.keras import WandbCallback
import pickle
import shutil
from glob import glob
from IPython.display import Image, display
import psutil

from multiprocessing import cpu_count, Process
import copy
import warnings
from datetime import datetime, timedelta
from time import time, sleep, mktime
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm, rc, rcParams
from tqdm import tqdm
import re
import random as rnd
from collections import Counter
import gc

import numpy as np
from numpy import array, nan, random as np_rnd, where
import pandas as pd
from pandas import DataFrame as dataframe, Series as series, isna, read_csv
from pandas.tseries.offsets import DateOffset

from sklearn.model_selection import train_test_split as tts, GridSearchCV as GridTuner, StratifiedKFold, KFold, ShuffleSplit, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, KBinsDiscretizer
from sklearn.impute import KNNImputer
from sklearn import metrics

from optuna import distributions as optuna_dist, visualization as optuna_plt, Trial, create_study, pruners
from optuna.integration import OptunaSearchCV
from optuna.samplers import TPESampler
from optuna.logging import set_verbosity as optuna_set_verbose
from optuna.logging import ERROR as optuna_error_verbose

# ===== tensorflow =====
import tensorflow as tf
from tensorflow import random as tf_rnd
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import activations
from tensorflow.keras import optimizers
from tensorflow.keras import metrics as tf_metrics
from tensorflow.keras import callbacks as tf_callbacks
from tqdm.keras import TqdmCallback
import tensorflow_addons as tfa
from tensorflow.keras.utils import plot_model

import tensorflow_recommenders as tfrs
import scann


warnings.filterwarnings(action='ignore')
rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 200)

# # GPU check
# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#   print('GPU device not found')
# print('Found GPU at: {}'.format(device_name))

gpus = tf.config.list_physical_devices('GPU')
# if gpus:
#   try:
#     tf.config.experimental.set_memory_growth(gpus[0], True)
#   except RuntimeError as e:
#     print(e)

import cudf
import cupy as cp
from cuml.neighbors import KNeighborsRegressor, KNeighborsClassifier

from numba import cuda 
device = cuda.get_current_device()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ===== utility functions =====
# label encoding for categorical column with excepting na value
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED and then seeds, in order: Python's `random`,
    numpy's global generator, TensorFlow and cupy (RAPIDS).

    Parameters
    ----------
    seed : int
        Seed value shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        rnd.seed,            # python random module
        np_rnd.seed,         # numpy random
        tf_rnd.set_seed,     # tf random
        cp.random.seed,      # RAPIDS random
    )
    for apply_seed in seeders:
        apply_seed(seed)
def which(bool_list):
    """Return the positions (along the first axis) where `bool_list` is truthy."""
    hit_positions = where(bool_list)
    return hit_positions[0]
def easyIO(x=None, path=None, op="r", keras_inspection=False):
    """Read a pickle file, or write one after an interactive confirmation.

    Parameters
    ----------
    x : object
        Object to write (only used when ``op == "w"``).
    path : str
        File to read from / write to.
    op : str
        ``"r"`` to load and return the pickled object, ``"w"`` to dump ``x``.
        Any other value only prints a message.
    keras_inspection : bool
        When True and ``x`` is a dict, every entry whose key contains ``"MLP"``
        is copied without its ``"model"`` component before writing. Any other
        kind of ``x`` is written unchanged (it used to be replaced by ``{}``).

    Returns
    -------
    The unpickled object for ``op == "r"``; otherwise None.
    """
    if op == "r":
        # NOTE: pickle.load can execute arbitrary code - only read trusted files.
        with open(path, "rb") as f:
            return pickle.load(f)
    if op == "w":
        # show what is about to be written before asking for confirmation
        print(x)
        payload = x
        # only dictionaries can carry keras models that need stripping
        if keras_inspection and type(x) is dict:
            payload = {}
            for k in x.keys():
                if "MLP" in k:
                    payload[k] = {
                        comp: copy.deepcopy(value)
                        for comp, value in x[k].items()
                        if comp != "model"
                    }
                    print(F"INFO : {k} model is removed (keras)")
                else:
                    payload[k] = x[k]
        if input("Write [y / n]: ") == "y":
            with open(path, "wb") as f:
                pickle.dump(payload, f)
            print("operation success")
        else:
            print("operation fail")
        return None
    print("Unknown operation type")
    return None
def diff(first, second):
    """Return the items of `first` (order and duplicates kept) that are not in `second`."""
    excluded = set(second)
    kept = []
    for candidate in first:
        if candidate in excluded:
            continue
        kept.append(candidate)
    return kept
def findIdx(data_x, col_names):
    """Return the integer positions of the elements of `data_x` that appear in `col_names`."""
    positions = []
    for position, name in enumerate(data_x):
        if name in col_names:
            positions.append(int(position))
    return positions
def orderElems(for_order, using_ref):
    """Return the elements of `using_ref`, in its order, that are also in `for_order`."""
    ordered = []
    for reference_item in using_ref:
        if reference_item in for_order:
            ordered.append(reference_item)
    return ordered
# concatenate by row
def cbr(df1, df2):
    """Concatenate two objects row-wise, dispatching on the exact type of `df1`.

    Series and DataFrame results get a fresh 0..n-1 index; numpy arrays are
    joined along axis 0. For any other type a message is printed and `df1`
    is returned unchanged.
    """
    kind = type(df1)
    if kind == series:
        stacked = pd.concat([dataframe(df1), dataframe(df2)], axis=0, ignore_index=True)
        joined = series(stacked.iloc[:, 0])
        joined.reset_index(drop=True, inplace=True)
        return joined
    if kind == dataframe:
        joined = pd.concat([df1, df2], axis=0, ignore_index=True)
        joined.reset_index(drop=True, inplace=True)
        return joined
    if kind == np.ndarray:
        return np.concatenate([df1, df2], axis=0)
    print("Unknown Type: return 1st argument")
    return df1
def change_width(ax, new_value):
    """Set every patch (bar) of `ax` to width `new_value`, keeping each bar centred."""
    for bar in ax.patches:
        shrink = bar.get_width() - new_value
        # apply the new bar width
        bar.set_width(new_value)
        # shift the left edge by half of the removed width so the centre stays put
        bar.set_x(bar.get_x() + shrink * .5)
def week_of_month(date):
    """Return the 1-based week of the month for `date`.

    Counts how many 7-day steps backwards it takes to leave the month of `date`.
    """
    anchor_month = date.month
    cursor = date
    week = 0
    while cursor.month == anchor_month:
        week = week + 1
        cursor = cursor - timedelta(days=7)
    return week
def getSeason(date):
    """Map the month of `date` to "Spring", "Summer", "Fall" or "Winter"."""
    season_by_month = {
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Fall", 10: "Fall", 11: "Fall",
    }
    # every month not listed above (12, 1, 2) is winter
    return season_by_month.get(date.month, "Winter")
def createFolder(directory):
    """Create `directory` (including parents) if it does not exist yet.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is no
    window between "check" and "create" in which another process could create
    the directory. Any OSError is reported with a printed message rather than raised.
    """
    try:
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error: Creating directory. ' + directory)
def softmax(x, axis=1):
    """Numerically stable softmax along `axis`.

    Parameters
    ----------
    x : np.ndarray
        Input scores.
    axis : int
        Axis to normalise over. Defaults to 1 (row-wise for 2-D input), which is
        the behaviour this function always had; pass ``axis=-1`` for 1-D input.

    Returns
    -------
    np.ndarray of the same shape as `x` whose values sum to 1 along `axis`.
    """
    # subtracting the per-slice maximum keeps np.exp from overflowing
    row_max = np.max(x, axis=axis, keepdims=True)
    exp_shifted = np.exp(x - row_max)
    normaliser = np.sum(exp_shifted, axis=axis, keepdims=True)
    return exp_shifted / normaliser
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x))."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def dispPerformance(result_dic):
    """Build, print and return a table of model performances.

    Parameters
    ----------
    result_dic : dict
        ``{model_name: {"performance": {metric_name: value, ...}, ...}}``.

    Returns
    -------
    DataFrame with one row per model (indexed by model name), sorted ascending
    by the first metric column. An empty input yields an empty DataFrame.
    """
    index_names = list(result_dic.keys())
    rows = [series(v["performance"]).to_frame().T for v in result_dic.values()]
    if not rows:
        # nothing to sort; the original indexed columns[0] here and crashed
        perf_table = dataframe()
        print(perf_table)
        return perf_table
    # one concat for all rows instead of growing the frame inside the loop
    perf_table = pd.concat(rows, ignore_index=True, axis=0)
    perf_table.index = index_names
    perf_table = perf_table.sort_values(perf_table.columns[0])
    print(perf_table)
    return perf_table
def powspace(start, stop, power, num):
    """Return `num` points from `start` to `stop` spaced evenly in the 1/`power` root domain.

    The endpoints are mapped through x**(1/power), linearly interpolated, and
    mapped back with x**power.
    """
    inverse_power = 1 / float(power)
    root_start = np.power(start, inverse_power)
    root_stop = np.power(stop, inverse_power)
    root_grid = np.linspace(root_start, root_stop, num=num)
    return np.power(root_grid, power)
def xgb_custom_lossfunction(alpha = 1):
    """Create an asymmetric squared-error objective.

    The returned callable takes (label, pred) and returns (grad, hess). Where
    the residual ``label - pred`` is positive (under-prediction) both gradient
    and hessian are scaled by `alpha`.
    """
    def support_under_mse(label, pred):
        # grad : first derivative
        # hess : second derivative
        residual = (label - pred).astype("float")
        under_predicted = residual > 0
        grad = np.where(under_predicted, -2 * alpha * residual, -2 * residual)
        hess = np.where(under_predicted, 2 * alpha, 2.0)
        return grad, hess
    return support_under_mse
def pd_flatten(df):
    """Unstack `df` into a flat Series whose labels are "<outer>_<inner>" strings."""
    flat = df.unstack()
    flat_labels = []
    for outer, inner in flat.index:
        flat_labels.append("_".join((str(outer), str(inner))))
    flat.index = flat_labels
    return flat
def tf_loss_rmse(y_true, y_pred, sample_weight=False):
    """Root-mean-squared error, optionally weighted per sample.

    Parameters
    ----------
    y_true, y_pred : tensors
        Targets and predictions.
    sample_weight : tensor, None or False
        Weights multiplied into the squared errors before averaging. ``False``
        (the default) or ``None`` means unweighted.

    Notes
    -----
    "No weights" is detected by identity (``is None`` / ``is False``) rather than
    truthiness: evaluating ``not sample_weight`` on a multi-element array or
    tensor is ambiguous and raises, which made real weights unusable.
    """
    squared_error = (y_true - y_pred) ** 2
    if sample_weight is None or sample_weight is False:
        return tf.sqrt(tf.reduce_mean(squared_error))
    return tf.sqrt(tf.reduce_mean(squared_error * sample_weight))
def text_extractor(string, lang="eng", spacing=True):
    """Strip every character that is not part of the selected alphabet.

    Parameters
    ----------
    string : str
        Text to filter.
    lang : str
        "eng" keeps A-Z/a-z, "kor" keeps Hangul and '+'; any other value keeps
        both sets.
    spacing : bool
        When True, spaces are kept as well.
    """
    # # regex removing parentheses together with the text inside them
    # re.sub(r'\([^)]*\)', '', remove_text)
    # # regex removing <> together with the text inside them
    # re.sub(r'\<[^)]*\>', '', remove_text)
    # (pattern keeping spaces, pattern dropping spaces) per language
    patterns = {
        "eng": ('[^ A-Za-z]', '[^A-Za-z]'),
        "kor": ('[^ ㄱ-ㅣ가-힣+]', '[^ㄱ-ㅣ가-힣+]'),
    }
    # default : kor + eng
    fallback = ('[^ A-Za-zㄱ-ㅣ가-힣+]', '[^A-Za-zㄱ-ㅣ가-힣+]')
    with_space, without_space = patterns.get(lang, fallback)
    chosen = with_space if spacing else without_space
    return re.compile(chosen).sub('', string)
def memory_usage(message='debug'):
    """Print and return the resident memory (RSS) of the current process in MB.

    `message` is used as a tag in the printed line.
    """
    # RSS of the current process, as reported by psutil
    resident_bytes = psutil.Process().memory_info().rss
    rss = resident_bytes / 2 ** 20 # Bytes to MB
    print(f"[{message}] memory usage: {rss: 10.3f} MB")
    return rss
class MyLabelEncoder:
    """Label encoder for DataFrame columns that keeps missing values as NaN.

    ``dic_cat`` has the format ``{"col_name": {"value": code}}``. Values that are
    missing, or not present in the mapping, are encoded as NaN. Encoded columns
    are returned as float.
    """
    def __init__(self, preset=None):
        # A fresh dict per instance: a mutable default argument would be shared
        # (and mutated by fit_transform) across every encoder created without a preset.
        self.dic_cat = {} if preset is None else preset
    def _encode(self, data_x, col_names):
        """Return a deep copy of `data_x` with `col_names` mapped through dic_cat."""
        tmp_x = copy.deepcopy(data_x)
        for col in col_names:
            # map() yields NaN for anything that is not a key of the mapping,
            # independent of the frame's index labels
            tmp_x[col] = tmp_x[col].astype("object").map(self.dic_cat[col]).astype("float")
        return tmp_x
    def fit_transform(self, data_x, col_names):
        """Learn a mapping for each column in `col_names` not yet known, then encode.

        Categories are the sorted non-missing unique values, numbered from 0.
        The input frame is not modified.
        """
        for col in col_names:
            # if key is not in dic, update dic
            if col not in self.dic_cat.keys():
                categories = sorted(data_x[col].dropna().unique())
                self.dic_cat[col] = {category: code for code, category in enumerate(categories)}
        return self._encode(data_x, col_names)
    def transform(self, data_x):
        """Encode every column that has a learned mapping; unknown values become NaN."""
        return self._encode(data_x, list(self.dic_cat.keys()))
    def clear(self):
        """Forget all learned mappings."""
        self.dic_cat = {}
class MyOneHotEncoder:
    """One-hot encoder for selected DataFrame columns.

    ``dic_cat`` maps each encoded column to its fitted sklearn OneHotEncoder.
    ``label_preset`` has the format ``{"col_name": {"label": code}}``; when a
    column is listed there, output columns are named after the label whose code
    equals the category instead of after the raw category value.
    """
    def __init__(self, label_preset=None):
        self.dic_cat = {}
        # a fresh dict per instance instead of a shared mutable default
        self.label_preset = {} if label_preset is None else label_preset
    def _column_names(self, col):
        """Return the output column names for the fitted encoder of `col`."""
        col_list = []
        for category in self.dic_cat[col].categories_[0]:
            if col in self.label_preset.keys():
                for label, code in self.label_preset[col].items():
                    if code == category:
                        col_list.append(str(col) + "_" + str(label))
            else:
                col_list.append(str(col) + "_" + str(category))
        return col_list
    def _assemble(self, data_x, encode_cols, fit):
        """Encode `encode_cols` (fitting first when `fit`) and pass other columns through."""
        pieces = []
        for col in data_x:
            if col not in encode_cols:
                pieces.append(dataframe(data_x[col]))
                continue
            if data_x[col].dtype.name not in ("object", "category"):
                print(F"WARNING : {col} is not object or category")
            if fit:
                # NOTE(review): `sparse=False` matches the scikit-learn version this
                # notebook is pinned to; newer releases call it `sparse_output`.
                self.dic_cat[col] = OneHotEncoder(sparse=False, handle_unknown="ignore")
                conv = self.dic_cat[col].fit_transform(dataframe(data_x[col]))
            else:
                conv = self.dic_cat[col].transform(dataframe(data_x[col]))
            # keep the caller's index so the encoded columns stay aligned on concat
            pieces.append(dataframe(conv.astype("int"), columns=self._column_names(col), index=data_x.index))
        if not pieces:
            return dataframe()
        return pd.concat(pieces, axis=1)
    def fit_transform(self, data_x, col_names):
        """Fit one encoder per column in `col_names` and return the encoded frame."""
        return self._assemble(data_x, col_names, fit=True)
    def transform(self, data_x):
        """Encode `data_x` with the already fitted encoders."""
        return self._assemble(data_x, list(self.dic_cat.keys()), fit=False)
    def clear(self):
        """Drop all fitted encoders and the label preset."""
        self.dic_cat = {}
        self.label_preset = {}
class MyKNNImputer:
    """KNN imputer that snaps imputed categorical columns back to valid codes.

    ``dic_cat`` maps each categorical column to the sorted list of values that
    were observed (non-missing) when fitting.
    """
    def __init__(self, k=5):
        self.imputer = KNNImputer(n_neighbors=k)
        self.dic_cat = {}
    def _snap_categoricals(self, x_imp, na_masks):
        """Round categorical columns to int and clamp the imputed rows into range.

        `na_masks` maps column name -> boolean array marking rows that were
        missing before imputation. Only those rows are clamped to
        [min category, max category]; each row is handled independently
        (the previous code overwrote every imputed row as soon as one was out
        of range, and compared the upper bound against the minimum category).
        """
        for col, mask in na_masks.items():
            categories = self.dic_cat[col]
            rounded = x_imp[col].round().astype("int64").to_numpy().copy()
            if len(categories) > 0:
                rounded[mask] = np.clip(rounded[mask], categories[0], categories[-1])
            x_imp[col] = rounded
        return x_imp
    def fit_transform(self, x, cat_vars=None):
        """Fit the imputer on `x` and return the imputed frame.

        `cat_vars` lists the (already numerically encoded) categorical columns
        whose imputed values must be whole numbers within the observed range.
        """
        cat_vars = [] if cat_vars is None else list(cat_vars)
        na_masks = {}
        for col in cat_vars:
            self.dic_cat[col] = sorted(x[col].dropna().unique().tolist())
            na_masks[col] = x[col].isna().to_numpy()
        x_imp = dataframe(self.imputer.fit_transform(x), columns=x.columns)
        return self._snap_categoricals(x_imp, na_masks)
    def transform(self, x):
        """Impute `x` with the fitted imputer, snapping the known categorical columns."""
        na_masks = {col: x[col].isna().to_numpy() for col in self.dic_cat.keys()}
        x_imp = dataframe(self.imputer.transform(x), columns=x.columns)
        return self._snap_categoricals(x_imp, na_masks)
    def clear(self):
        """Drop the fitted imputer and the recorded categories."""
        self.imputer = None
        self.dic_cat = {}

In [None]:
class RandomCropAndResize(tf.keras.layers.Layer):
    """Training-time augmentation: random square crop followed by a resize.

    Parameters
    ----------
    crop_ratio : float
        Side of the crop as a fraction of `resize_img_size`.
    resize_img_size : int
        Side length the crop is resized to.
    resize_channels : int
        Number of image channels kept by the crop.

    At inference time (``training=False``) the input is returned unchanged.
    """
    def __init__(self, crop_ratio=0.8, resize_img_size=128, resize_channels=3, **kwargs):
        super(RandomCropAndResize, self).__init__(**kwargs)
        # crop_ratio and resize_img_size were read later but never stored,
        # which raised AttributeError in __init__ and in call()
        self.crop_ratio = crop_ratio
        self.resize_img_size = resize_img_size
        self.img_size = resize_img_size
        self.channels = resize_channels
        self.crop_size = int(self.img_size * self.crop_ratio)
    def call(self, input_image, training=False):
        if not training:
            return input_image
        crop_shape = [self.crop_size, self.crop_size, self.channels]
        # tf.image.random_crop needs one size entry per input dimension, so a
        # batched (rank-4) input keeps its whole batch dimension
        if input_image.shape.rank == 4:
            crop_shape = [tf.shape(input_image)[0]] + crop_shape
        input_image = tf.image.random_crop(input_image, crop_shape)
        input_image = tf.image.resize(input_image, [self.resize_img_size, self.resize_img_size])
        return input_image
def RandomAngleDistortion(rotation_factor=0.2, flip=False):
    """Build a Sequential augmentation model.

    Always contains a RandomRotation with factor (-rotation_factor,
    rotation_factor); a horizontal RandomFlip is appended when `flip` is truthy.
    """
    augmentations = [
        layers.RandomRotation(factor=(-rotation_factor, rotation_factor)),
    ]
    if flip:
        augmentations.append(layers.RandomFlip("horizontal"))
    return Sequential(augmentations)
class RandomColorDistortion(tf.keras.layers.Layer):
    """Training-time colour augmentation layer.

    Applies, in order: random saturation, random contrast, random brightness
    and (when `hue_flag` is set) random hue. With ``training=False`` the input
    is returned unchanged.
    """
    def __init__(self, saturation_factor=(0.8, 1.2), contrast_factor=(0.8, 1.2), brightness_factor=0.2, hue_factor=0.2, hue_flag=True, **kwargs):
        super(RandomColorDistortion, self).__init__(**kwargs)
        # (lower, upper) bounds for saturation / contrast; max deltas for brightness / hue
        self.saturation_factor = saturation_factor
        self.contrast_factor = contrast_factor
        self.brightness_factor = brightness_factor
        self.hue_factor = hue_factor
        self.hue_flag = hue_flag
    def call(self, input_image, training=False):
        if not training:
            return input_image
        distorted = tf.image.random_saturation(input_image, self.saturation_factor[0], self.saturation_factor[1])
        distorted = tf.image.random_contrast(distorted, self.contrast_factor[0], self.contrast_factor[1])
        distorted = tf.image.random_brightness(distorted, self.brightness_factor)
        if self.hue_flag:
            distorted = tf.image.random_hue(distorted, self.hue_factor)
        return distorted

# Preprocessing & Feature Engineering

In [None]:
seed_everything()

In [None]:
user_conv_rf = dict.fromkeys(["customer_id"], "object")
content_conv_ref = dict.fromkeys(["article_id"], "object")
rating_conv_rf = dict.fromkeys(["customer_id", "article_id", "sales_channel_id"], "object")

In [None]:
user = cudf.read_csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")
content = cudf.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv", dtype=["object"])
rating = cudf.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype=["object"], parse_dates=["t_dat"])

In [None]:
# filtering no image data
content_mask = read_csv("../input/h-m-image-exists/content_exists_mask.csv")
rating_mask = read_csv("../input/h-m-image-exists/rating_exist_mask.csv")
content = content[content_mask.iloc[:,0]==1]
content.reset_index(drop=True, inplace=True)
rating = rating[rating_mask.iloc[:,0]==1]
rating.reset_index(drop=True, inplace=True)
del content_mask, rating_mask

**Reducing memory**

In [None]:
# Shrink the three tables before any heavy processing.
# user: drop postal_code and downcast the numeric columns to float32
user.drop("postal_code", axis=1, inplace=True)
# keep only the last 16 hex characters of the customer id and store them as int64
user['customer_id'] = user['customer_id'].str[-16:].str.hex_to_int().astype('int64')
user[["FN", "Active", "age"]] = user[["FN", "Active", "age"]].astype("float32")
# content: keep only the columns used below and store the code columns as int32
content = content[["article_id", "product_type_no", "graphical_appearance_no", "colour_group_code", "index_name"]]
content[["product_type_no", "graphical_appearance_no", "colour_group_code"]] = \
    content[["product_type_no", "graphical_appearance_no", "colour_group_code"]].astype("int32")
# rating.drop(["t_dat", "sales_channel_id", "price"], axis=1, inplace=True)
# rating: apply the same customer id conversion as for `user`
rating['customer_id'] = rating['customer_id'].str[-16:].str.hex_to_int().astype('int64')

**Users Table**

In [None]:
user.info()

In [None]:
# for i in ["FN", "Active", "club_member_status", "fashion_news_frequency", "postal_code"]:
#    print(user[i].value_counts(dropna=False), "\n")

**User Table**

1. FN

nan을 0으로 변환

2. Active

nan을 0으로 변환

3. club_member_status

nan을 "Prospective" 로 변환

4. fashion_news_frequency

"None" 및 nan 을 "NONE" 으로 변환

5. age

nan을 imputation

In [None]:
user["FN"].fillna(0.0, inplace=True)
user["Active"].fillna(0.0, inplace=True)
user["club_member_status"].fillna("Prospective", inplace=True)
user["fashion_news_frequency"].replace("None", "NONE", inplace=True)
user["fashion_news_frequency"].fillna("NONE", inplace=True)

In [None]:
user.info()

In [None]:
user.head()

**Label Encoding for Imputing**

In [None]:
user_label_encoder = copy.deepcopy(MyLabelEncoder())
user_label_encoder.fit_transform(user.to_pandas(), ["club_member_status", "fashion_news_frequency"])

In [None]:
optuna_set_verbose(optuna_error_verbose)
def optuna_objective_function(trial: Trial, train_x, train_y, val_x, val_y):
    """Optuna objective: validation MAE of a KNN regressor.

    Samples an odd `n_neighbors` in [3, 99], fits a KNeighborsRegressor on the
    training split and returns the mean absolute error on the validation split.
    """
    neighbor_count = trial.suggest_int("n_neighbors", 3, 99, step=2)
    knn = KNeighborsRegressor(n_neighbors=neighbor_count)
    knn.fit(train_x, train_y)
    observed = val_y.to_array()
    predicted = knn.predict(val_x).to_array()
    return metrics.mean_absolute_error(observed, predicted)

In [None]:
seed_everything()
def imputing_age():
    """Impute missing `user["age"]` values with a tuned KNN regressor.

    Works on the module-level `user` frame and `user_label_encoder`, and writes
    the predictions back into `user["age"]`. Returns None.

    Steps:
      1. draw a 100,000-row sample of the users with a known age, stratified by age bin;
      2. split it 80/20 into train/validation and tune `n_neighbors` with optuna (100 trials);
      3. refit on the whole sample with the best parameters and predict the missing ages.
    """
    # Only the "test" part of this split is kept: the stratified 100,000-row sample.
    _, full_x, _, full_y = tts(user[user["age"].notna()],
                               user["age"][user["age"].notna()],
                               test_size=100000, random_state=42,
                               stratify=pd.cut(user["age"][user["age"].notna()].to_array(), bins=[0,20,30,40,50,60,100], right=False))
    train_x, val_x, train_y, val_y = tts(full_x, full_y,
                                         test_size=0.2, random_state=42,
                                         stratify=pd.cut(full_x["age"].to_array(), bins=[0,20,30,40,50,60,100], right=False))
    # rows whose age has to be predicted
    test_x = user[~user["age"].notna()]

    # label-encode the categorical columns, then drop the id and the target itself
    train_x = user_label_encoder.transform(train_x.to_pandas()); train_x.drop(["customer_id", "age"], axis=1, inplace=True)
    val_x = user_label_encoder.transform(val_x.to_pandas()); val_x.drop(["customer_id", "age"], axis=1, inplace=True)

    # scaler for the tuning phase is fitted on the training split only
    minmax_scaler = MinMaxScaler()
    train_x = dataframe(minmax_scaler.fit_transform(train_x))
    val_x = dataframe(minmax_scaler.transform(val_x))

    optuna_set_verbose(0)
    optuna_study = create_study(direction='minimize', sampler=TPESampler())
    # NOTE(review): train_y / val_y come straight from the split of a cudf Series;
    # confirm that cudf.Series.from_pandas accepts them in the installed cudf version.
    optuna_study.optimize(lambda trial: optuna_objective_function(
        trial, cudf.DataFrame.from_pandas(train_x), cudf.Series.from_pandas(train_y),
        cudf.DataFrame.from_pandas(val_x), cudf.Series.from_pandas(val_y)), n_trials=100)

    # same preprocessing for the final fit (whole sample) and for the rows to impute
    full_x = user_label_encoder.transform(full_x.to_pandas()); full_x.drop(["customer_id", "age"], axis=1, inplace=True)
    test_x = user_label_encoder.transform(test_x.to_pandas()); test_x.drop(["customer_id", "age"], axis=1, inplace=True)

    # a new scaler, this time fitted on the whole sample
    minmax_scaler = MinMaxScaler()
    full_x = dataframe(minmax_scaler.fit_transform(full_x))
    test_x = dataframe(minmax_scaler.transform(test_x))

    print("tuned params --->", optuna_study.best_params)
    knn = KNeighborsRegressor(**optuna_study.best_params)
    knn.fit(cudf.DataFrame.from_pandas(full_x), cudf.Series.from_pandas(full_y))
    # NOTE(review): this write-back uses chained indexing (user["age"].iloc[...] = ...);
    # confirm it updates `user` in place rather than a temporary copy.
    user["age"].iloc[which(user["age"].isna().to_array())] = knn.predict(cudf.DataFrame.from_pandas(test_x)).to_array()

imputing_age()
gc.collect()

**'Age' feature imputing**

In [None]:
user.info()

In [None]:
user["age"] = user["age"].round().astype("float32")
if (user["age"] <= 0).sum() > 0:
    raise ValueError("minus value detected")

In [None]:
user["age_cat"] = pd.cut(user["age"].to_array(), bins=[0,20,30,40,50,60,100], labels=range(6), right=False, ordered=False).astype("float32")
print(user["age"].head(5))
print(user["age_cat"].head(5))

In [None]:
user.info()

In [None]:
user_ref = pd.Index(user["customer_id"].copy().to_array())

**Contents Table**

In [None]:
content.info()

In [None]:
content.isna().sum()

In [None]:
content.head()

In [None]:
# for i in content:
#     print(content[i].value_counts(), "\n")

In [None]:
# # re-organization on categories
# content["product_group_name"][content["product_group_name"] == "Underwear/nightwear"] = "Nightwear"
# content["product_group_name"][content["product_group_name"] == "Cosmetic"] = "Unknown"
# content["product_group_name"][content["product_group_name"] == "Bags"] = "Accessories"
# content["product_group_name"][content["product_group_name"] == "Items"] = "Unknown"
# content["product_group_name"][content["product_group_name"] == "Furniture"] = "Unknown"
# content["product_group_name"][content["product_group_name"] == "Garment and Shoe care"] = "Shoes"
# content["product_group_name"][content["product_group_name"] == "Stationery"] = "Accessories"
# content["product_group_name"][content["product_group_name"] == "Interior textile"] = "Unknown"
# content["product_group_name"][content["product_group_name"] == "Fun"] = "Unknown"

In [None]:
# content["product_group_name"].value_counts()

In [None]:
# Capture the raw article ids as a pandas Index *before* the column is overwritten
# with image paths at the end of this cell.
content_ref = pd.Index(content["article_id"].copy().to_array())
# # original
# image_rootPath = "/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"
# resized
image_rootPath = "/kaggle/input/handm-dataset-128x128/images_128_128/"
# Replace each article id with its image file path: <root>/<first 3 chars of the id>/<id>.jpg
content["article_id"] = content["article_id"].to_pandas().apply(lambda x: image_rootPath + x[:3] + "/" + x  + ".jpg").to_numpy()

**Ratings table**

In [None]:
# rating["month"] = rating["t_dat"].dt.month - 1
# rating["weekday"] = rating["t_dat"].dt.weekday
# rating["holiday"] = rating["weekday"].apply(lambda x: 1 if x in [5,6] else 0)
# day_to_sec = 24 * 60 * 60
# year_to_sec = (365.2425) * day_to_sec
# rating["yts_norm"] = rating["t_dat"].apply(lambda x: (x - pd.Timestamp(year=x.year, month=1, day=1)).total_seconds() / year_to_sec)
# rating["price_grouped"] = pd.cut(rating["price"], 7, labels=range(7), ordered=False).astype("int")

In [None]:
rating.info()

In [None]:
rating.isna().sum()

In [None]:
rating["t_dat"] = cudf.to_datetime(rating["t_dat"])
rating["year_month"] = cudf.to_datetime(rating["t_dat"].dt.strftime('%Y-%m-1'))

In [None]:
# rating.drop_duplicates(subset=["customer_id", "article_id", "month"], inplace=True, ignore_index=True)
rating.drop_duplicates(subset=["customer_id", "article_id", "year_month"], inplace=True, ignore_index=True)

In [None]:
rating["cnt"] = 1
tmp_groupby = rating.groupby("year_month").sum()
tmp_groupby.sort_index(inplace=True)

In [None]:
fig = plt.figure(figsize=(20,9))
ax = fig.add_subplot(1, 1, 1)
plt.xticks(rotation=45)
plt.title("Transaction volume by timeseries", fontsize=16, fontweight="bold", pad=20)
plt.plot(tmp_groupby.index.to_array(), tmp_groupby["cnt"].to_array())
ax.set_xticks(tmp_groupby.index.to_array())

**Train timeseries : 2018-09 ~ 2019-09**

**Validation timeseries : 2019-10 ~ 2020-09**

In [None]:
# shuffle the transactions reproducibly
rating = rating.sample(frac=1, random_state=114).reset_index(drop=True)
# Row position of every purchased article inside `content`, looked up in one
# vectorised call; get_indexer returns -1 for ids that are not in content_ref.
stratIdx = content_ref.get_indexer(rating["article_id"].to_array())
if (stratIdx < 0).any():
    # Fail with a clear message: the previous bare `except:` stored "Unknown",
    # which only surfaced later as an obscure positional-indexing error.
    raise KeyError(F"{int((stratIdx < 0).sum())} transactions reference an article_id that is missing from content")
# product type of each transaction's article
stratVec = content["product_type_no"].iloc[stratIdx].to_array()
# rating["stratVec"] = stratVec

In [None]:
# # for ranking task

# from tqdm.contrib import tzip
# tmp_groupby = rating.groupby(["year_month", "stratVec"]).sum()["cnt"]
# tmp_target = []
# for i, j in tzip(rating["year_month"].to_array(), rating["stratVec"].to_array()):
#     try:
#         tmp_target.append(tmp_groupby.loc[(i, j)])
#     except:
#         tmp_target.append(0)
# rating["ranking_target"] = tmp_target

# tmp_groupby = rating[rating["year_month"] <= datetime(year=2019, month=9, day=1)]
# tmp_groupby = tmp_groupby.groupby(["customer_id", "stratVec"]).sum()
# tmp_groupby = tmp_groupby["cnt"]

# tmp_target = []
# for i, j in tzip(rating["customer_id"].to_array(), rating["stratVec"].to_array()):
#     try:
#         tmp_target.append(tmp_groupby.loc[(i, j)])
#     except:
#         tmp_target.append(0)
# rating["ranking_target"] = tmp_target

In [None]:
rating.info()

In [None]:
rating.head()

In [None]:
train_mask = (rating["year_month"] <= datetime(year=2019, month=9, day=1)).to_array()

In [None]:
rating.drop(["t_dat", "sales_channel_id", "price", "year_month", "cnt"], axis=1, inplace=True)

In [None]:
del tmp_groupby; gc.collect()

# Modeling - Retrieval

**Retrieval - Users Tower & Contents Tower**

In [None]:
dropoutRate = 0.5

In [None]:
def create_model_user():
    """Build the user tower as a Keras functional model.

    Reads the module-level `user` frame (for the lookup vocabularies) and
    `dropoutRate`.

    Inputs, in order:
      1. customer id (int64)
      2. club_member_status (string)
      3. fashion_news_frequency (string)
      4. age category (int32)
      5. three float32 features (presumably FN, Active and age - TODO confirm against the input pipeline)

    Returns
    -------
    Model mapping these inputs to a 4-dimensional embedding.
    """
    input_list = []
    concat_list = []
    dcn_list = []
    
    # categorical feature
    # customer id: lookup -> 64-d embedding -> Dense(16) -> Dense(4)
    input_list.append(layers.Input(shape=1, dtype=tf.int64))
    tmp_unique = user["customer_id"].unique()
    x = layers.IntegerLookup(vocabulary=tmp_unique[tmp_unique != -1].to_array(), output_mode="int")(input_list[-1])
    x = layers.Embedding(len(user["customer_id"].unique())+1, 64, embeddings_initializer="glorot_normal")(x)
    x = layers.Dropout(dropoutRate)(x)
    x = layers.Dense(16, activation="relu")(x) 
    x = layers.Dropout(dropoutRate)(x)
    dcn_list.append(layers.Dense(4)(x))
    
    # club_member_status: string lookup -> 4-d embedding
    input_list.append(layers.Input(shape=1, dtype=tf.string))
    x = layers.StringLookup(vocabulary=user["club_member_status"].unique().to_array(), output_mode="int")(input_list[-1])
    dcn_list.append(layers.Embedding(len(user["club_member_status"].unique())+1, 4, embeddings_initializer="glorot_normal")(x))

    # fashion_news_frequency: string lookup -> 4-d embedding
    input_list.append(layers.Input(shape=1, dtype=tf.string))
    x = layers.StringLookup(vocabulary=user["fashion_news_frequency"].unique().to_array(), output_mode="int")(input_list[-1])
    dcn_list.append(layers.Embedding(len(user["fashion_news_frequency"].unique())+1, 4, embeddings_initializer="glorot_normal")(x))
    
    # age category: integer lookup -> 4-d embedding
    # NOTE(review): user["age_cat"] is stored as float32 upstream while this input is int32 - confirm the vocabulary dtype is accepted
    input_list.append(layers.Input(shape=1, dtype=tf.int32))
    x = layers.IntegerLookup(vocabulary=user["age_cat"].unique().to_array(), output_mode="int")(input_list[-1])
    dcn_list.append(layers.Embedding(len(user["age_cat"].unique())+1, 4, embeddings_initializer="glorot_normal")(x))
    
    # stack the four 4-d categorical embeddings along axis 1 and apply two cross layers
    x = layers.Concatenate(axis=1)(dcn_list)
    dcn1 = tfrs.layers.dcn.Cross()(x, layers.Dropout(dropoutRate)(x))
    dcn2 = tfrs.layers.dcn.Cross()(x, layers.Dropout(dropoutRate)(dcn1))
    
    # DCN (Deep Cross Network for feature interaction)
    concat_list.append(layers.Flatten()(x))
    concat_list.append(layers.Flatten()(dcn1))
    concat_list.append(layers.Flatten()(dcn2))

    # numeric and binary feature
    input_list.append(layers.Input(shape=3, dtype=tf.float32))
    concat_list.append(input_list[-1])
    
    # dense trunk: each block receives the previous block's output concatenated with that block's input
    x = layers.Concatenate()(concat_list)
    block1_input = tfa.layers.NoisyDense(128, activity_regularizer="l2")(x)
    x = tfa.layers.WeightNormalization(
        layers.Dense(64, activation="relu")
    )(block1_input)
    x = layers.Dropout(dropoutRate)(x)

    block2_input = layers.Concatenate()([x, block1_input])
    x = tfa.layers.WeightNormalization(
        layers.Dense(64, activation="relu")
    )(block2_input)
    x = layers.Dropout(dropoutRate)(x)

    block3_input = layers.Concatenate()([x, block2_input])
    x = tfa.layers.WeightNormalization(
        layers.Dense(64, activation="relu")
    )(block3_input)
    x = layers.Dropout(dropoutRate)(x)
    
    x = tfa.layers.WeightNormalization(
        layers.Dense(32, activation="swish")
    )(x)

    # final 4-d user embedding
    final_embeddings = layers.Dense(4)(x)
    return Model(input_list, final_embeddings)

In [None]:
# plot_model(create_model_user(), show_shapes=True)

In [None]:
# Side length (pixels) and channel count of the image input of the content tower.
img_size = 128
channels = 3

def create_model_content():
    """Build the content (article) tower of the two-tower retrieval model.

    The tower runs a CNN over the article image, embeds four categorical
    article attributes, mixes the embeddings through two cross layers and
    finally maps everything to a 4-dimensional embedding through a stack of
    dense blocks whose inputs are concatenated with the previous block input.

    Reads the module-level names `content`, `dropoutRate`, `img_size` and
    `channels`.

    Returns:
        A Keras `Model`. Its inputs are, in order: image, product_type_no,
        graphical_appearance_no, colour_group_code, index_name. Its output is
        the final `Dense(4)` layer.
    """
    input_list = []
    concat_list = []
    dcn_list = []
    
    # image feature
    # CNN encoder: two conv/conv/pool/dropout stages followed by a dense stack
    # that ends in a 128-unit representation.
    image_cnn_model = Sequential([
        layers.Conv2D(16, (8, 8), activation="relu", padding="same", input_shape=(img_size, img_size, channels)),
        layers.Conv2D(16, (8, 8), activation="relu", padding="same"),
        layers.MaxPooling2D(4, 4),
        layers.Dropout(dropoutRate),
        layers.Conv2D(32, (4, 4), activation="relu", padding="same"),
        layers.Conv2D(32, (4, 4), activation="relu", padding="same"),
        layers.MaxPooling2D(4, 4),
        layers.Dropout(dropoutRate),
        layers.Flatten(),
        layers.Dense(1024, activation="relu"),
        layers.Dropout(dropoutRate),
        layers.Dense(512, activation="relu"),
        layers.Dropout(dropoutRate),
        layers.Dense(256, activation="relu"),
        layers.Dropout(dropoutRate),
        layers.Dense(128, activation="relu"),
        layers.Dropout(dropoutRate)
    ])
        
    input_list.append(layers.Input(shape=(img_size, img_size, channels), dtype=tf.float32))
    # Augmentation layers (defined elsewhere in the notebook), then rescaling.
    x = RandomAngleDistortion(flip=False)(input_list[-1])
    x = RandomColorDistortion(hue_flag=False)(x)
    # NOTE(review): read_image decodes with dtype=tf.float32; if that already
    # yields values in [0, 1], this rescale shrinks them a second time - confirm.
    x = layers.Rescaling(1.0/255.0)(x)
    concat_list.append(image_cnn_model(x))
    
    # categorical feature
    # Each attribute: lookup layer (vocabulary without the -1 / "Unknown"
    # placeholder value) followed by a 4-dimensional embedding.
    input_list.append(layers.Input(shape=1, dtype=tf.int32))
    tmp_unique = content["product_type_no"].unique()
    x = layers.IntegerLookup(vocabulary=tmp_unique[tmp_unique != -1].to_array(), output_mode="int")(input_list[-1])
    dcn_list.append(layers.Embedding(len(content["product_type_no"].unique())+1, 4, embeddings_initializer="glorot_normal")(x))
    
#     input_list.append(layers.Input(shape=1, dtype=tf.string))
#     tmp_unique = content["product_group_name"].unique()
#     x = layers.StringLookup(vocabulary=tmp_unique[tmp_unique != "Unknown"].to_array(), output_mode="int")(input_list[-1])
#     dcn_list.append(layers.Embedding(len(content["product_group_name"].unique())+1, 4, embeddings_initializer="glorot_normal")(x))

    input_list.append(layers.Input(shape=1, dtype=tf.int32))
    tmp_unique = content["graphical_appearance_no"].unique()
    x = layers.IntegerLookup(vocabulary=tmp_unique[tmp_unique != -1].to_array(), output_mode="int")(input_list[-1])
    dcn_list.append(layers.Embedding(len(content["graphical_appearance_no"].unique())+1, 4, embeddings_initializer="glorot_normal")(x))
    
    input_list.append(layers.Input(shape=1, dtype=tf.int32))
    tmp_unique = content["colour_group_code"].unique()
    x = layers.IntegerLookup(vocabulary=tmp_unique[tmp_unique != -1].to_array(), output_mode="int")(input_list[-1])
    dcn_list.append(layers.Embedding(len(content["colour_group_code"].unique())+1, 4, embeddings_initializer="glorot_normal")(x))
    
    input_list.append(layers.Input(shape=1, dtype=tf.string))
    tmp_unique = content["index_name"].unique()
    x = layers.StringLookup(vocabulary=tmp_unique[tmp_unique != "Unknown"].to_array(), output_mode="int")(input_list[-1])
    dcn_list.append(layers.Embedding(len(content["index_name"].unique())+1, 4, embeddings_initializer="glorot_normal")(x))
    
    # DCN (Deep Cross Network for feature interaction)
    # The embeddings are joined along axis 1; each Cross layer receives the
    # joined embeddings as its first argument and a dropout-regularised tensor
    # (the embeddings, then the first cross output) as its second.
    x = layers.Concatenate(axis=1)(dcn_list)
    dcn1 = tfrs.layers.dcn.Cross()(x, layers.Dropout(dropoutRate)(x))
    dcn2 = tfrs.layers.dcn.Cross()(x, layers.Dropout(dropoutRate)(dcn1))

    # Raw embeddings and both cross outputs are flattened and kept as features.
    concat_list.append(layers.Flatten()(x))
    concat_list.append(layers.Flatten()(dcn1))
    concat_list.append(layers.Flatten()(dcn2))  
  
    # Fuse image and categorical features, then three weight-normalised dense
    # blocks; blocks 2 and 3 take the previous output concatenated with the
    # previous block input.
    x = layers.Concatenate()(concat_list)
    block1_input = tfa.layers.NoisyDense(256, activity_regularizer="l2")(x)
    
    x = tfa.layers.WeightNormalization(
        layers.Dense(128, activation="relu")
    )(block1_input)
    x = layers.Dropout(dropoutRate)(x)

    block2_input = layers.Concatenate()([x, block1_input])
    x = tfa.layers.WeightNormalization(
        layers.Dense(128, activation="relu")
    )(block2_input)
    x = layers.Dropout(dropoutRate)(x)

    block3_input = layers.Concatenate()([x, block2_input])
    x = tfa.layers.WeightNormalization(
        layers.Dense(128, activation="relu")
    )(block3_input)
    x = layers.Dropout(dropoutRate)(x)
    
    x = tfa.layers.WeightNormalization(
        layers.Dense(64, activation="swish")
    )(x)
    
    # Linear projection to the 4-dimensional content embedding.
    final_embeddings = layers.Dense(4)(x)
    return Model(input_list, final_embeddings)

In [None]:
# create_model_content().summary()

In [None]:
# plot_model(create_model_content(), show_shapes=True)

**Defining Retrieval Model**

In [None]:
def _gather_elements_along_row(data: tf.Tensor,
                               column_indices: tf.Tensor) -> tf.Tensor:
  """Pick, for every row of `data`, the entries at that row's column indices.

  Instead of `tf.gather_nd`, the 2D lookup is turned into a 1D one: every
  (row, column) pair is converted to its offset in the flattened data, a
  single `tf.gather` reads those offsets, and the result is reshaped back.

  Args:
    data: A [N, M] 2D `Tensor`.
    column_indices: A [N, K] 2D `Tensor`; row i lists the K columns to read
      from row i of `data`.
  Returns:
    A [N, K] `Tensor` holding the gathered elements.
  Raises:
    ValueError: if the first dimensions of data and column_indices don't match.
  """
  same_row_count = tf.assert_equal(
      tf.shape(data)[0], tf.shape(column_indices)[0])
  with tf.control_dependencies([same_row_count]):
    n_rows = tf.shape(data)[0]
    n_cols = tf.shape(data)[1]
    n_picked = tf.shape(column_indices)[1]
    # [N, K] tensor whose entry (i, j) is the row number i.
    row_ids = tf.expand_dims(tf.range(n_rows), -1)
    row_ids = tf.tile(row_ids, [1, n_picked])
    # Offset of element (row, col) inside the flattened data is row * M + col.
    offsets = tf.reshape(row_ids * n_cols + column_indices, [-1])
    picked = tf.gather(tf.reshape(data, [-1]), offsets)
    return tf.reshape(picked, [n_rows, n_picked])

In [None]:
class Modified_Retrieval(tfrs.tasks.Retrieval):
  """Retrieval task with an optional "top-k mean" reshaping of the logits.

  Differences from the parent task that are visible in this class:
    * `call` takes an extra `metric_candidates_embeddings` argument which is
      forwarded to the factorized metrics' `update_state`.
    * when `loss_topk_mean` is set, every row of in-batch scores is sorted and
      its leading `loss_topk_mean + 1` entries are averaged into a single logit
      before the loss is computed.
  """
  def __init__(self,
               loss=None,
               sample_weight=None,
               loss_topk_mean=None,
               metrics=None,
               batch_metrics=None,
               temperature=None,
               num_hard_negatives=None,
               name=None):
    """Store the extra options and delegate the rest to the parent task.

    Args:
      loss: forwarded to `tfrs.tasks.Retrieval`.
      sample_weight: stored on the instance; NOTE(review): `call` uses its own
        `sample_weight` argument and never reads this attribute.
      loss_topk_mean: number of entries (besides the leading one) averaged into
        the first logit; `None` disables the reshaping.
      metrics, batch_metrics, temperature, num_hard_negatives, name: forwarded
        to `tfrs.tasks.Retrieval`.
    """
    
    super().__init__(loss=loss, metrics=metrics, batch_metrics=batch_metrics,
                    temperature=temperature, num_hard_negatives=num_hard_negatives, name=name)
    self.sample_weight = sample_weight
    self.loss_topk_mean = loss_topk_mean
    # Largest float32; added to the positive logit so it sorts first.
    self.MAX_FLOAT = np.finfo('float32').max

  def call(self,
           query_embeddings,
           candidates_embeddings,
           metric_candidates_embeddings,
           sample_weight=None,
           candidate_sampling_probability=None,
           candidate_ids=None,
           compute_metrics=True):
    """Compute the in-batch retrieval loss and optionally update metrics.

    Args:
      query_embeddings: query-side embeddings, one row per example.
      candidates_embeddings: candidate-side embeddings; row i is treated as
        the positive for query row i (labels are an identity matrix).
      metric_candidates_embeddings: passed through to the factorized metrics.
      sample_weight: forwarded to the loss.
      candidate_sampling_probability: if given, scores are corrected with
        `SamplingProbablityCorrection`.
      candidate_ids: if given, `RemoveAccidentalHits` is applied.
      compute_metrics: when False no metric is updated.

    Returns:
      The loss tensor; when metrics were updated, it is wrapped so that the
      update ops are control dependencies of the returned value.
    """

    # === Retrieval Model ===
    # matmul for softmax (in batch sample)
    scores = tf.linalg.matmul(query_embeddings, candidates_embeddings, transpose_b=True)
    # tf.eye gives the one-hot labels: the i-th candidate is the positive of the i-th query
    scores_shape = tf.shape(scores)
    labels = tf.eye(scores_shape[0], scores_shape[1])
    
    metric_update_ops = []
    if compute_metrics:
        if self._factorized_metrics:
            metric_update_ops.append(
                self._factorized_metrics.update_state(query_embeddings, candidates_embeddings, metric_candidates_embeddings)
            )
        if self._batch_metrics:
            metric_update_ops.extend([
                batch_metric.update_state(labels, scores)
                for batch_metric in self._batch_metrics
            ])

    # Metrics above see the raw scores; the adjustments below only affect the loss.
    if self._temperature is not None:
        scores = scores / self._temperature

    if candidate_sampling_probability is not None:
        scores = tfrs.layers.loss.SamplingProbablityCorrection()(scores, candidate_sampling_probability)

    if candidate_ids is not None:
        scores = tfrs.layers.loss.RemoveAccidentalHits()(labels, scores, candidate_ids)

    if self._num_hard_negatives is not None:
        scores, labels = tfrs.layers.loss.HardNegativeMining(self._num_hard_negatives)(scores, labels)
    
    # Optional top-k mean: collapse the leading (loss_topk_mean + 1) sorted
    # logits of each row into their mean.
    if self.loss_topk_mean is not None:
        # Sort each row in descending order; adding MAX_FLOAT to the labelled
        # entry forces the positive into column 0. The gathered values are the
        # unmodified scores.
        sorted_scores = _gather_elements_along_row(scores, tf.argsort(scores + labels * self.MAX_FLOAT, direction="DESCENDING"))
        # Column 0 becomes the mean of the first loss_topk_mean + 1 sorted
        # entries (the positive plus the highest-scoring others); the remaining
        # columns are kept as they are.
        scores = tf.concat([tf.math.reduce_mean(sorted_scores[:, :(self.loss_topk_mean+1)], axis=-1, keepdims=True), sorted_scores[:, (self.loss_topk_mean+1):]], axis=1)

        # After the reshaping the positive is always column 0: labels are [1, 0, ..., 0].
        labels = tf.concat(
            [tf.ones((tf.shape(scores)[0], 1)),
             tf.zeros((tf.shape(scores)[0], tf.shape(scores)[1] - 1))], axis=1
        )

    # update loss
    loss = self._loss(y_true=labels, y_pred=scores, sample_weight=sample_weight)

    if not metric_update_ops:
        return loss

    # Tie the metric updates to the returned loss so they run with it.
    with tf.control_dependencies(metric_update_ops):
        return tf.identity(loss)

In [None]:
class Modified_FactorizedTopK(tfrs.metrics.FactorizedTopK):
  """Computes metrics across the top K candidates surfaced by a retrieval model.
  The default metric is top K categorical accuracy: how often the true candidate
  is in the top K candidates for a given query.

  Unlike the parent class, candidates are not indexed by this metric: the top K
  scores come from the callable passed as `raw_scann`.
  """
#   def __init__(self, metrics=None, k=100, all_candidates_embeddings=None, scann_leaves=1000, name="factorized_top_k"):
  def __init__(self, metrics=None, k=100, raw_scann=None, name="factorized_top_k"):
    """Initialise the parent without candidates and keep `raw_scann`.

    Args:
      metrics: forwarded to `tfrs.metrics.FactorizedTopK`.
      k: forwarded to `tfrs.metrics.FactorizedTopK`.
      raw_scann: callable used in `update_state`; called with the query
        embeddings, its first return value is used as the top-K scores.
      name: metric name.
    """
    super().__init__(candidates=None, metrics=metrics, k=k, name=name)
#     self._candidates = tfrs.layers.factorized_top_k.Streaming(sorted_order=False)
#     self.candidates_extractor = tfrs.layers.factorized_top_k.ScaNN(
#         num_leaves=scann_leaves,
#         num_leaves_to_search=max(int(scann_leaves*0.2), 10),
#         num_reordering_candidates=target_top_k*5,
#         k=target_top_k*3
#     )
#     self.candidates_extractor.index_from_dataset(all_candidates_embeddings)
    # NOTE(review): nothing in this class re-indexes the extractor, so the
    # scores it returns depend on how it was built by the caller - confirm
    # that this is intended.
    self.candidates_extractor = raw_scann

  def update_state(self,
                   query_embeddings,
                   candidates_embeddings,
                   metric_candidates_embeddings):
    """Update every top-K metric with one batch.

    Args:
      query_embeddings: query-side embeddings of the batch.
      candidates_embeddings: embeddings of the true candidates, row-aligned
        with `query_embeddings`.
      metric_candidates_embeddings: accepted but not used by the active code
        (only the commented-out line below refers to it).

    Returns:
      A grouped op containing the update ops of all metrics.
    """

    # Score of each query against its own true candidate (row-wise dot product).
    positive_scores = tf.reduce_sum(query_embeddings * candidates_embeddings, axis=1, keepdims=True)
    # The k given to the retrieval layer is the number of candidate scores it returns per query.
    # The k given to TopKCategoricalAccuracy() is the cut-off used to decide whether the true candidate ranks in the top k.
#     self._candidates.index_from_dataset(metric_candidates_embeddings)
    top_k_predictions, _ = self.candidates_extractor(query_embeddings)
    
    # Column 0 (the true candidate) is labelled 1, every retrieved score is labelled 0
    # tf.shape(positive_scores) : [batch_size, 1]
    # tf.shape(top_k_predictions) : [batch_size, K]
    y_labels = tf.concat(
        [tf.ones(tf.shape(positive_scores)),
         tf.zeros(tf.shape(top_k_predictions))],
        axis=1)
    y_scores = tf.concat([positive_scores, top_k_predictions], axis=1)

    update_ops = []
    for metric in self._top_k_metrics:
      update_ops.append(metric.update_state(y_true=y_labels, y_pred=y_scores))

    return tf.group(update_ops)

In [None]:
# Build the model by subclassing tfrs.Model
class RetrievalModel(tfrs.Model):
    """Two-tower retrieval model: a user tower, a content tower and a task.

    `compute_metrics` is a plain attribute that is forwarded to the task on
    every `compute_loss` call.
    """

    def __init__(self, user_model, content_model, task):
        super().__init__()
        self.user_model = user_model
        self.content_model = content_model
        self.task = task
        self.compute_metrics = True

    def compute_loss(self, features, training=False):
        """Embed both sides of the batch and return the task loss.

        `features[0]` holds the model inputs as (user inputs, content inputs).
        No separate metric candidates are embedded here, so the task receives
        `None` in that position.
        """
        tower_inputs = features[0]
        user_vectors = self.user_model(tower_inputs[0])
        content_vectors = self.content_model(tower_inputs[1])
        return self.task(
            user_vectors,
            content_vectors,
            None,
            compute_metrics=self.compute_metrics,
        )

In [None]:
# Creating the dataset
def create_dataset(x, y=None, batch_size=None, shuffle=False):
    """Build the batched `tf.data` pipeline that feeds the two-tower model.

    Args:
        x: pair `(user_features, content_features)`. `content_features` must
            have five components; the first one is an image file path that is
            replaced by the decoded image (see `read_image`).
        y: optional labels sliced alongside `x`.
        batch_size: number of examples per batch (required).
        shuffle: when True, shuffle with a buffer of `2 * batch_size`
            elements, reshuffled on every iteration.

    Returns:
        A `tf.data.Dataset` yielding
        `((user_features, (image, c1, c2, c3, c4)), y)` batches.

    Raises:
        ValueError: if `batch_size` is None.
    """
    if batch_size is None:
        raise ValueError("batch_size must be provided")

    def _decode_image(features, label):
        # Only the first content component (the image path) is transformed.
        user_part, content_part = features[0], features[1]
        decoded = (read_image(content_part[0]), content_part[1], content_part[2],
                   content_part[3], content_part[4])
        return (user_part, decoded), label

    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    # num_parallel_calls expects an integer or tf.data.AUTOTUNE; the boolean
    # that used to be passed here did not request any real parallelism.
    dataset = dataset.map(_decode_image, num_parallel_calls=tf.data.AUTOTUNE)
    if shuffle:
        dataset = dataset.shuffle(int(batch_size * 2), reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(2)
    return dataset
def tfrs_rating_search_index(users_ref, rating_users, contents_ref=None, rating_contents=None):
    """Translate raw ids into their positions inside the reference indexes.

    Args:
        users_ref: index object exposing `get_loc`, used to locate user ids.
        rating_users: iterable of user ids, or None to skip the user side.
        contents_ref: index object exposing `get_loc`, used to locate content ids.
        rating_contents: iterable of content ids, or None to skip the content side.

    Returns:
        `(user_positions, content_positions)`. Each element is None when the
        corresponding ids were None, otherwise a list with one entry per id:
        the result of `get_loc`, or NaN when the id is not in the reference.

    Raises:
        ValueError: if ids are given for a side whose reference index is None.
    """
    def _locate(ref, keys):
        # One position per key; ids absent from `ref` become NaN.
        if keys is None:
            return None
        if ref is None:
            raise ValueError("a reference index is required when ids are given")
        positions = []
        for key in tqdm(keys):
            try:
                positions.append(ref.get_loc(key))
            except KeyError:
                # Only "id not found" is tolerated; any other error propagates.
                positions.append(nan)
        return positions

    tmp_user = _locate(users_ref, rating_users)
    tmp_content = _locate(contents_ref, rating_contents)
    return tmp_user, tmp_content
def read_image(input_image, img_size=128, channels=3, resize=False):
    """Read an image file and return it as a float32 tensor of fixed shape.

    Args:
        input_image: path of the image file (string tensor or str).
        img_size: expected (or, with `resize=True`, target) height and width.
        channels: number of colour channels requested from the decoder.
        resize: when True the decoded image is resized to
            `img_size x img_size`; when False the image must already have
            that size.

    Returns:
        A tensor whose static shape is `[img_size, img_size, channels]`.
    """
    image = tf.io.read_file(input_image)
    # expand_animations=False keeps the result rank-3 with a known rank, which
    # tf.image.resize needs inside a tf.data pipeline.
    image = tf.image.decode_image(image, channels=channels, dtype=tf.float32, expand_animations=False)
    if resize:
        # The previous code called `tf.resize`, which does not exist.
        image = tf.image.resize(image, [img_size, img_size])
    # channels == 0 lets the decoder choose, so that dimension stays unconstrained.
    return tf.ensure_shape(image, [img_size, img_size, channels if channels else None])
def get_top_k_accuracy(val_ds, model_retrieval, model_scann, top_k_seq):
    """Top-k categorical accuracy of the true candidate against ScaNN results.

    For every validation example the score of the true (user, content) pair is
    placed in column 0 and followed by the scores `model_scann` returns for the
    user; `TopKCategoricalAccuracy` is then evaluated for every k in
    `top_k_seq`.

    Returns:
        dict mapping each k of `top_k_seq` to its accuracy.
    """
    result_score = {k: 0 for k in top_k_seq}
    n_retrieved = np.max(top_k_seq) - 1

    # Score of each query with its own candidate.
    user_vectors = val_ds.map(lambda x, y: model_retrieval.user_model(x[0]))
    content_vectors = val_ds.map(lambda x, y: model_retrieval.content_model(x[1]))
    paired = tf.data.Dataset.zip((user_vectors, content_vectors))
    pair_scores = paired.map(lambda x1, x2: tf.reduce_sum(x1 * x2, axis=1, keepdims=True))
    positive_scores = array(list(pair_scores.unbatch().as_numpy_iterator()))

    # Scores of the candidates retrieved for each query.
    retrieved = val_ds.map(lambda x, y: model_scann(x[0], k=n_retrieved)[0])
    top_k_scores = array(list(retrieved.unbatch().as_numpy_iterator()))

    y_scores = tf.concat([positive_scores, top_k_scores], axis=1)
    y_labels = tf.concat(
        [tf.ones(positive_scores.shape), tf.zeros(top_k_scores.shape)],
        axis=1)

    for k in top_k_seq:
        metric = tf.keras.metrics.TopKCategoricalAccuracy(k=k, name="top_" + str(k) + "_acc")
        metric.update_state(y_true=y_labels, y_pred=y_scores)
        result_score[k] = metric.result().numpy()
    return result_score
class EpochsMetricCallback(tf.keras.callbacks.Callback):
    """Switch the model's `compute_metrics` flag around training and evaluation.

    The flag is turned off when a training epoch begins and turned on when an
    evaluation pass begins.
    """

    def __init__(self):
        super().__init__()

    def _set_compute_metrics(self, enabled):
        self.model.compute_metrics = enabled

    def on_epoch_begin(self, epoch, logs=None):
        self._set_compute_metrics(False)

    def on_test_begin(self, logs=None):
        self._set_compute_metrics(True)

In [None]:
# rating = rating.sample(frac=1, random_state=114).reset_index(drop=True)
# stratVec = []
# for i in tqdm(rating["article_id"].to_array()):
#     try:
#         stratVec.append(content_ref.get_loc(i))
#     except:
#         stratVec.append("Unknown")
# stratVec = content["product_type_no"].iloc[stratVec].to_array()

In [None]:
# rating = rating.sample(frac=1, random_state=114).reset_index(drop=True)
# stratVec = []
# for i in tqdm(rating["article_id"].to_array()):
#     try:
#         stratVec.append(content_ref.get_loc(i))
#     except:
#         stratVec.append("Unknown")
# stratVec = content["index_name"].iloc[stratVec].to_array()

**Get the target column that is used as the ranking label**

In [None]:
# rating["target"] = rating
# rating.drop(["index_name", "cnt"], axis=1, inplace=True)

In [None]:
#### learning parameter setting
# Maximum number of epochs passed to fit() for every fold.
epochs = 20
# Patience (in epochs) of the EarlyStopping callback.
patient_epochs = 10
# Patience (in epochs) of the ReduceLROnPlateau callback.
patient_lr = 1
# Learning rate given to the Adam optimizer.
eta = 1e-3
# Only referenced by the commented-out AdamW optimizer in do_fold_training.
weight_decay = 1e-4
# Cut-off of the main top-k metric and number of articles written per customer.
target_top_k = 12
# num_leaves of the ScaNN index; 20% of it (at least 10) is searched per query.
scann_leaves = 1000

# NOTE(review): folder_path is not referenced in the visible cells.
folder_path = "./"
# Where ModelCheckpoint stores the best weights of the fold being trained.
checkpoint_filepath = './tmp_checkpoint/fold_checkpoint'

n_folds = 5
base_train_size = 1024
# Rows sampled for training in each fold; validation uses half of this.
tr_size = base_train_size * 125
batch_size = 256
# NOTE(review): only the commented-out fold loops use this splitter.
kfolds_spliter = StratifiedShuffleSplit(n_folds, train_size=tr_size, test_size=int(tr_size * 0.5), random_state=1)

# Training - 5 Folds

In [None]:
# def create_raw_scann():
#     raw_content_model = create_model_content()
    
#     tmp_ds = content.to_pandas()
#     tmp_ds = tf.data.Dataset.from_tensor_slices((
#         tmp_ds["article_id"],
#         tmp_ds[["product_type_no"]],
#         tmp_ds[["graphical_appearance_no"]],
#         tmp_ds[["colour_group_code"]],
#         tmp_ds[["index_name"]])
#     ).map(lambda x1, x2, x3, x4, x5: (read_image(x1), x2, x3, x4, x5), num_parallel_calls=True).batch(batch_size).prefetch(2)  
#     tmp_ds = tmp_ds.map(lambda x1, x2, x3, x4, x5: raw_content_model((x1, x2, x3, x4, x5)), num_parallel_calls=True)
    
#     model_scann = tfrs.layers.factorized_top_k.ScaNN(
#         num_leaves=scann_leaves,
#         num_leaves_to_search=max(int(scann_leaves*0.2), 10),
#         num_reordering_candidates=target_top_k*5,
#         k=target_top_k*3
#     )
#     model_scann.index_from_dataset(tmp_ds)
#     return model_scann
# raw_scann = create_raw_scann()
# Load a previously saved model from the dataset directory; it is handed to
# Modified_FactorizedTopK as the top-K score extractor used by the metrics.
raw_scann = tf.keras.models.load_model("../input/h-m-recommendation-raw-scann/models_scann/raw_scann_10240/")

In [None]:
def do_fold_training(fold, train_idx, val_idx):
        """Train one fold of the two-tower retrieval model and return it.

        Args:
            fold: fold number, only used in the timing message.
            train_idx: rating rows used for training. Despite the name this is
                a data frame, and it is modified in place (dropna,
                reset_index, id columns overwritten with positions).
            val_idx: rating rows used for validation; handled like `train_idx`.

        Returns:
            The trained `RetrievalModel` with the best checkpointed weights
            loaded.

        Side effects:
            Appends the fitted scaler to `scaler_list` and the Keras history
            to `model_history_list`; writes weights to `checkpoint_filepath`.
        """
        tmp_time = time()

#         rating_train = rating.iloc[train_idx]
        rating_train = train_idx
        rating_train.dropna(inplace=True)
        rating_train.reset_index(drop=True, inplace=True)
        # Replace raw ids by their positions in user_ref / content_ref.
        # NOTE(review): ids missing from the reference come back as NaN and are
        # not filtered afterwards (dropna runs before the lookup) - confirm that
        # every id is guaranteed to be present.
        rating_train["customer_id"], rating_train["article_id"] = tfrs_rating_search_index(
            user_ref, rating_train["customer_id"].to_array(), content_ref, rating_train["article_id"].to_array()
        )    

        print("complete searching index on train")

#         rating_val = rating.iloc[val_idx]
        rating_val = val_idx
        rating_val.dropna(inplace=True)
        rating_val.reset_index(drop=True, inplace=True)
        rating_val["customer_id"], rating_val["article_id"] = tfrs_rating_search_index(
            user_ref, rating_val["customer_id"].to_array(), content_ref, rating_val["article_id"].to_array()
        )
        print("complete searching index on validation")

        # User-side features. The scaler for "age" is fitted on the training
        # rows only and reused for validation (and later for inference).
        # assumes column 0 of the rating frame is customer_id and column 1 is
        # article_id - TODO confirm
        minmax_scaler = MinMaxScaler()
        user_train_feature = user.iloc[rating_train.iloc[:,0].values].to_pandas()
        user_train_feature[["age"]] = minmax_scaler.fit_transform(user_train_feature[["age"]])
        user_train_feature = (
            user_train_feature[["customer_id"]],
            user_train_feature[["club_member_status"]],
            user_train_feature[["fashion_news_frequency"]],
            user_train_feature[["age_cat"]],
            user_train_feature[["FN", "Active", "age"]]
        )
        scaler_list.append(minmax_scaler)

        user_val_feature = user.iloc[rating_val.iloc[:,0].values].to_pandas()
        user_val_feature[["age"]] = minmax_scaler.transform(user_val_feature[["age"]])
        user_val_feature = (
            user_val_feature[["customer_id"]],
            user_val_feature[["club_member_status"]],
            user_val_feature[["fashion_news_frequency"]],
            user_val_feature[["age_cat"]],
            user_val_feature[["FN", "Active", "age"]]
        )

        # Content-side features; the first component is passed to read_image
        # by create_dataset, so it is presumably an image path - verify.
        content_train_feature = content.iloc[rating_train.iloc[:,1].values].to_pandas()
        content_train_feature = (
            content_train_feature["article_id"],
            content_train_feature[["product_type_no"]],
            content_train_feature[["graphical_appearance_no"]],
            content_train_feature[["colour_group_code"]],
            content_train_feature[["index_name"]]
        )
        content_val_feature = content.iloc[rating_val.iloc[:,1].values].to_pandas()
        content_val_feature = (
            content_val_feature["article_id"],
            content_val_feature[["product_type_no"]],
            content_val_feature[["graphical_appearance_no"]],
            content_val_feature[["colour_group_code"]],
            content_val_feature[["index_name"]]
        )

        # No labels: every (user, content) row is a positive pair.
        train_ds = create_dataset((user_train_feature, content_train_feature), None, batch_size, True)
        val_ds = create_dataset((user_val_feature, content_val_feature), None, batch_size, False)
        del rating_train, rating_val, user_train_feature, user_val_feature, content_train_feature, content_val_feature
        gc.collect()
        
        # Early stopping and checkpointing follow val_total_loss;
        # ReduceLROnPlateau keeps its default monitored quantity.
        cb_earlyStopping = tf_callbacks.EarlyStopping(patience=patient_epochs, monitor='val_total_loss', mode='min')
        cb_reduceLR = tf_callbacks.ReduceLROnPlateau(patience=patient_lr, factor=0.5, min_lr=1e-5)
        cb_modelsave = tf_callbacks.ModelCheckpoint(filepath=checkpoint_filepath, monitor='val_total_loss', mode='min', save_weights_only=True, save_best_only=True)
        cb_epochsmetric = EpochsMetricCallback()

        model_user = create_model_user()
        model_content = create_model_content()
                
        # Top-k accuracies at target_top_k and 100, scored against raw_scann.
        top_k_seq = [target_top_k, 100]
        top_k_metrics = [tf.keras.metrics.TopKCategoricalAccuracy(k=i, name="top_"+str(i)+"_acc") for i in top_k_seq]
        factorized_topk = Modified_FactorizedTopK(metrics=top_k_metrics, k=max(top_k_seq), raw_scann=raw_scann)
        
        task = Modified_Retrieval(
            loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM),
            metrics=factorized_topk,
            # scores are divided by this value before the loss (softmax temperature)
            temperature=2,
            loss_topk_mean=4
        )

        model_retrieval = RetrievalModel(model_user, model_content, task)
        model_retrieval.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=eta)
#             optimizer=tfa.optimizers.AdamW(learning_rate=eta, weight_decay=weight_decay)
        )

        print("start training")
        model_history = model_retrieval.fit(train_ds, validation_data=val_ds, epochs=epochs, verbose=0,
                    callbacks=[cb_reduceLR, cb_earlyStopping, cb_modelsave, cb_epochsmetric, TqdmCallback(verbose=1)])
        # Restore the best weights saved by ModelCheckpoint during this fit.
        model_retrieval.load_weights(checkpoint_filepath)
        print("end training") 
        model_history_list.append(model_history)

        print("Fold " + str(fold) + " Time (minutes) : ", round((time() - tmp_time) / 60, 3))
        return model_retrieval

In [None]:
def get_scann_model(fold, model_retrieval, candidates_corpus):
    """Index all candidates for one fold with ScaNN and save the search model.

    The ScaNN layer wraps the fold's user tower as query model, is indexed
    with the content-tower embeddings of `candidates_corpus`, is called once
    on dummy inputs so that its input signature is defined, and is then
    written to `./models_scann/fold_<fold>/`.

    Returns:
        The directory the model was saved to.
    """
    print("start building search model")
    leaves_to_search = max(int(scann_leaves*0.2), 10)
    model_scann = tfrs.layers.factorized_top_k.ScaNN(
        model_retrieval.user_model,
        num_leaves=scann_leaves,
        num_leaves_to_search=leaves_to_search,
        num_reordering_candidates=target_top_k*5,
        k=scann_output_top_k
    )

    # NOTE(review): num_parallel_calls is documented as an int or
    # tf.data.AUTOTUNE; the boolean is kept exactly as it was.
    candidate_embeddings = candidates_corpus.map(
        lambda c1, c2, c3, c4, c5: model_retrieval.content_model((c1, c2, c3, c4, c5)),
        num_parallel_calls=True
    )
    model_scann.index_from_dataset(candidate_embeddings)

    # register the model shape
    dummy_query = (
        array([[0]]),
        array([["0"]]),
        array([["0"]]),
        array([[0.0]]),
        array([[0.0, 0.0, 0.0]]),
    )
    model_scann(dummy_query)

    model_scann_path = "./models_scann/fold_" + str(fold) + "/"
    save_options = tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
    tf.keras.models.save_model(model_scann, model_scann_path, options=save_options)
    print("end building search model and save to local")
    return model_scann_path

**Train timeseries : 2018-09 ~ 2019-09**

**Validation timeseries : 2019-10 ~ 2020-09**

In [None]:
# Containers filled while the folds are trained (do_fold_training appends to
# model_history_list and scaler_list).
model_list = []
model_history_list = []
scaler_list = []
fold_top_k_scores = []
model_name = "tfrs"
# Number of candidates each exported ScaNN model returns per query.
scann_output_top_k = 128
seed_everything()

# fold training
# for fold, (train_idx, val_idx) in enumerate(kfolds_spliter.split(range(rating.shape[0]), stratVec)):
for fold in range(n_folds):
# for fold, (train_idx, val_idx) in enumerate(kfolds_spliter.split(range(rating.shape[0]))):
    print("\n===== Fold ", fold, "=====\n")
    mem_fold_start = memory_usage()
    # real data frame
    # Stratified samples drawn with train_test_split: tr_size rows from the
    # rows selected by train_mask, half as many from the remaining rows. Only
    # the "test" part of each split is kept, and the seed changes per fold.
    _, train_idx = tts(rating[train_mask], test_size=tr_size, stratify=stratVec[train_mask], random_state=fold+100)
    _, val_idx = tts(rating[~train_mask], test_size=int(tr_size * 0.5), stratify=stratVec[~train_mask], random_state=fold+101)

    model_list.append(do_fold_training(fold, train_idx, val_idx))
#     model_list.append(do_fold_training(fold, rating.iloc[train_idx], rating.iloc[val_idx]))
    # NOTE(review): the trained model stays referenced from model_list, so
    # clear_session/gc cannot release it - confirm the memory budget.
    tf.keras.backend.clear_session()
    gc.collect()
    mem_fold_end = memory_usage()
    print("@Memory leaked :", round(mem_fold_end - mem_fold_start, 3))

In [None]:
# model_list = []
# model_history_list = []
# scaler_list = []
# fold_top_k_scores = []
# model_name = "tfrs"
# scann_output_top_k = 128
# seed_everything()

# # fold training
# for fold, (train_idx, val_idx) in enumerate(kfolds_spliter.split(range(rating.shape[0]), stratVec)):
# # for fold, (train_idx, val_idx) in enumerate(kfolds_spliter.split(range(rating.shape[0]))):
#     print("\n===== Fold ", fold, "=====\n")
#     mem_fold_start = memory_usage()
#     model_list.append(do_fold_training(fold, train_idx, val_idx))
#     tf.keras.backend.clear_session()
#     gc.collect()
#     mem_fold_end = memory_usage()
#     print("@Memory leaked :", round(mem_fold_end - mem_fold_start, 3))

In [None]:
# The rating table is no longer used below; drop it and collect garbage.
del rating; gc.collect()

In [None]:
# Saved-model directory of every fold's ScaNN search model.
model_scann_path = []

# Candidate corpus: every article as (decoded image, four categorical features).
# NOTE: this rebinds `content` from the article table to a tf.data.Dataset, so
# the cell cannot be re-run without rebuilding the table first.
# num_parallel_calls expects an integer or tf.data.AUTOTUNE; the boolean that
# used to be passed here did not request any real parallelism.
content = tf.data.Dataset.from_tensor_slices((
        content["article_id"].to_pandas(),
        content[["product_type_no"]].to_pandas(),
        content[["graphical_appearance_no"]].to_pandas(),
        content[["colour_group_code"]].to_pandas(),
        content[["index_name"]].to_pandas())
).map(lambda x1, x2, x3, x4, x5: (read_image(x1), x2, x3, x4, x5), num_parallel_calls=tf.data.AUTOTUNE).batch(batch_size).prefetch(2)

# Index the corpus with every fold's content tower and export the search model.
for fold in range(n_folds):
    model_scann_path.append(get_scann_model(fold, model_list[fold], content))

In [None]:
# for i in fold_top_k_scores:
#     print("fold top k score :", i)
# print("fold average score :", np.mean(fold_top_k_scores))

In [None]:
import matplotlib.pyplot as plt
def plot_history(histories, key='binary_crossentropy'):
  """Plot the validation curve of one metric for several training runs.

  Args:
    histories: iterable of `(name, history)` pairs, where `history` exposes
      `epoch` (list of epoch numbers) and `history` (dict of metric lists),
      as Keras `History` objects do.
    key: metric name without the `val_` prefix; `history.history['val_' + key]`
      is drawn as a dashed line for every run.

  The x axis spans from 0 to the largest epoch number found in any run.
  Nothing is done to the x axis when no run has recorded an epoch.
  """
  plt.figure(figsize=(16,10))

  last_epoch = None
  for name, history in histories:
    plt.plot(history.epoch, history.history['val_'+key],
             '--', label=name.title()+' Val')
    # Track the longest run: with early stopping the runs differ in length,
    # and the axis limit must not depend on which run comes last.
    if len(history.epoch) > 0:
      run_last = max(history.epoch)
      last_epoch = run_last if last_epoch is None else max(last_epoch, run_last)

  plt.xlabel('Epochs')
  plt.ylabel(key.replace('_',' ').title())
  plt.legend()

  if last_epoch is not None:
    plt.xlim([0, last_epoch])

# Pair every fold's training history with a "fold_<i>" label for plot_history.
plt_input = [("fold_" + str(i), j) for i, j in enumerate(model_history_list)]

In [None]:
# Validation loss per epoch (history key "val_loss") for every fold.
plot_history(plt_input, key="loss")

In [None]:
# Validation top-12 accuracy per epoch (history key "val_top_12_acc") for every fold.
plot_history(plt_input, key="top_12_acc")

In [None]:
# Validation top-100 accuracy per epoch (history key "val_top_100_acc") for every fold.
plot_history(plt_input, key="top_100_acc")

In [None]:
# The candidate dataset is not needed for inference; drop it and collect garbage.
del content; gc.collect()

# Inference

In [None]:
# Read the sample submission with its columns as strings.
rating_test = cudf.read_csv("../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv", dtype=["object"])
# Untouched copy that is filled with predictions and written out at the end.
rating_test_submission = rating_test.copy()
# Same id encoding as the lookup below expects: the last 16 hex characters of
# the customer id interpreted as an int64.
rating_test["customer_id"] = rating_test["customer_id"].str[-16:].str.hex_to_int().astype('int64')
# Replace each id by its position in user_ref (NaN when the id is not found).
rating_test["customer_id"], _ = tfrs_rating_search_index(user_ref, rating_test["customer_id"].to_array())
# One score accumulator per submission row, shared by all folds.
score_counter = [Counter() for _ in range(rating_test.shape[0])]

In [None]:
def do_inference(fold):
    """Score all test customers with one fold's ScaNN model.

    Loads the fold's saved search model, builds the user features of every
    test row (age scaled with the fold's scaler), queries the model and adds
    the returned scores, divided by `n_folds`, to the global `score_counter`.
    """
    print("=== start fold_" + str(fold) + " inference ===")
    fold_time = time()
    model_loaded = tf.keras.models.load_model(model_scann_path[fold])

    # User features of every test row, looked up by position.
    user_positions = rating_test.iloc[:,0].values
    rating_test_scaled = user.iloc[user_positions].to_pandas()
    fold_scaler = scaler_list[fold]
    rating_test_scaled[["age"]] = fold_scaler.transform(rating_test_scaled[["age"]])

    query_features = (
        rating_test_scaled[["customer_id"]],
        rating_test_scaled[["club_member_status"]].astype(dtype="object"),
        rating_test_scaled[["fashion_news_frequency"]].astype(dtype="object"),
        rating_test_scaled[["age_cat"]].astype(dtype="float32"),
        rating_test_scaled[["FN", "Active", "age"]].astype(dtype="float32"),
    )
    scores_table, labels_table = model_loaded(query_features)

    # Each fold contributes 1 / n_folds of its score to the accumulated total.
    scores_table = scores_table.numpy() / n_folds
    labels_table = labels_table.numpy()

    print("start update score on samples")
    tmp_time = time()
    get_score_counter(score_counter, scores_table, labels_table, 1024)
    print("end update score on samples :", round(time() - tmp_time))
    print("=== end fold_" + str(fold) + " inference time :", round(time() - fold_time, 3), "===")
def get_score_counter(counter_obj, scores_table, labels_table, batch_size=1024):
    """Add one fold's retrieval scores to the per-row counters.

    For row `idx`, `labels_table[idx]` holds positions into the global
    `content_ref` and `scores_table[idx]` the matching scores. The scores are
    collected in a dict keyed by `content_ref[label]` and merged into
    `counter_obj[idx]` with `Counter.update`, i.e. added to what is already
    stored. The dict is flushed each time more than `batch_size` entries have
    been collected, and once more at the end of the row.
    """
    row_pairs = zip(scores_table, labels_table)
    for row_idx, (row_scores, row_labels) in enumerate(tqdm(row_pairs)):
        row_counter = counter_obj[row_idx]
        pending = {}
        n_pending = 0
        for item_id, item_score in zip(content_ref[row_labels], row_scores):
            pending[item_id] = item_score
            if n_pending < batch_size:
                n_pending += 1
                continue
            # Buffer is full: merge it into the counter and start over.
            row_counter.update(pending)
            pending = {}
            n_pending = 0
        row_counter.update(pending)

In [None]:
# Run inference fold by fold; every fold adds its scores to score_counter.
# The Keras session is cleared and garbage collected after each fold.
for fold in range(n_folds):
    memory_usage()
    do_inference(fold)
    tf.keras.backend.clear_session()
    gc.collect()
    memory_usage()

In [None]:
# For every submission row keep the target_top_k entries with the highest
# accumulated score and join their keys with spaces.
# NOTE(review): " ".join requires the counter keys (values of content_ref) to
# be strings already in the format the submission expects - confirm.
tmp_labels = []
for i in tqdm(range(rating_test_submission.shape[0])):
    tmp_labels.append(" ".join([j[0] for j in score_counter[i].most_common(target_top_k)]))
# Column 1 of the submission frame receives the predictions.
rating_test_submission.iloc[:,1] = tmp_labels
rating_test_submission.to_csv("submission.csv", index=False)