# MAKE DATA

In [1]:
import os
import random
import pickle
import json
from datetime import datetime
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from huggingface_hub import HfApi, snapshot_download, hf_hub_download
from PIL import Image
from tqdm.notebook import tqdm

In [None]:
# # Download the H&M data from Kaggle
# !kaggle competitions download -c h-and-m-personalized-fashion-recommendations

In [None]:
# Extract the downloaded competition archive into ./raw.
# NOTE(review): later cells read articles.csv, transactions_train.csv and images/ from
# data_dir ("./data/HM"), not from ./raw — confirm the files are moved or linked there.
!unzip ./h-and-m-personalized-fashion-recommendations.zip -d ./raw

In [2]:
def seed_everything(seed: int = 42):
    """Seed the Python, NumPy and PyTorch (CPU and current CUDA device) generators.

    The seed is also stored in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Single source of truth for the seed: pass the variable instead of repeating the literal,
# so changing `seed` actually changes what gets seeded.
seed = 42
seed_everything(seed)

In [3]:
def dump_pickle(data, path):
    """Pickle ``data`` and write it to ``path`` (opened in binary write mode)."""
    with open(path, "wb") as fh:
        pickle.dump(data, fh)


def load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


def dump_json(data, path):
    """Serialize ``data`` as JSON into the text file at ``path`` (overwriting it)."""
    with open(path, "w") as fh:
        json.dump(data, fh)


def load_json(path):
    """Parse the JSON text file at ``path`` and return the decoded object."""
    with open(path, "r") as fh:
        return json.load(fh)


def save_pt(data, path):
    """Write ``data`` with ``torch.save`` into a binary file opened at ``path``."""
    with open(path, "wb") as fh:
        torch.save(data, fh)

In [None]:
def mk_dir(file_path):
    """Create the directory ``file_path`` (and missing parents) if it does not exist.

    Calling it on an existing directory is a no-op.

    Raises:
        FileExistsError: if ``file_path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(file_path, exist_ok=True)


def get_timestamp(date_format: str = "%d%H%M%S") -> str:
    """Return ``datetime.now()`` formatted with ``date_format``.

    The default format is day, hour, minute and second with no separators.
    """
    return datetime.now().strftime(date_format)


# NOTE(review): n_core is not referenced by any visible cell (user_core / item_core are
# used for the core filtering instead) — confirm it is still needed.
n_core = 15
# Directory the H&M files are read from and every artifact of this notebook is written to.
# Plain string: the previous f-string had no placeholders.
data_dir = "./data/HM"
mk_dir(data_dir)

In [None]:
# Load the article (item) table and the transaction log from data_dir.
item_data = pd.read_csv(f"{data_dir}/articles.csv")
interaction_data = pd.read_csv(f"{data_dir}/transactions_train.csv")

In [None]:
# True when the transactions are already ordered by (t_dat, customer_id), i.e. sorting
# and renumbering leaves the frame unchanged. The flag is only stored for inspection;
# no visible cell branches on it.
is_equal = interaction_data.equals(
    interaction_data.sort_values(by=["t_dat", "customer_id"], axis=0).reset_index(
        drop=True
    )
)

In [None]:
# Drop every row that is identical to the row directly above it.
# Each row is compared with its predecessor in the unmodified frame, so a run of
# repeated rows collapses to its first occurrence. The comparison is vectorised:
# a Python-level loop over `iloc` is far too slow for a table of this size.
_current = interaction_data.iloc[1:].reset_index(drop=True)
_previous = interaction_data.iloc[:-1].reset_index(drop=True)
# Two cells match when they are equal or both missing (NaN == NaN is False under
# plain `==`; Series.equals treats paired NaNs as equal, and so does this mask).
_same_as_previous = (
    (_current.eq(_previous) | (_current.isna() & _previous.isna()))
    .all(axis=1)
    .to_numpy(dtype=bool)
)
# Index labels of the rows to remove (row 0 has no predecessor and is never dropped).
drop_idx = interaction_data.index[1:][_same_as_previous].tolist()

interaction_data = interaction_data.drop(index=drop_idx).reset_index(drop=True)

In [None]:
# Keep only articles that have no missing value in any column, then renumber the index.
n_item_data = item_data.dropna(axis=0, how="any").reset_index(drop=True)

In [None]:
def img_by_id(df, article_id: int, no_list: list, echo: int = 1, img_show: bool = True):
    """Open (and optionally display) the image file of one article.

    Args:
        df: Item table with an ``article_id`` column; only read when ``echo`` is truthy.
        article_id: Article id; the file name is the id prefixed with ``"0"`` and the
            sub-directory is the first three characters of that prefixed id.
        no_list: Ids to skip; the function returns immediately when ``article_id`` is in it.
        echo: When truthy, display the rows of ``df`` matching ``article_id``.
        img_show: When True, show the image via ``Image.show()``.

    Raises:
        FileNotFoundError: propagated from ``Image.open`` when the image file is missing.
    """
    if article_id in no_list:
        return
    if echo:
        display(df[df.article_id == article_id])

    img_id = "0" + str(article_id)
    img_path = f"{data_dir}/images/" + img_id[0:3] + "/" + img_id + ".jpg"

    # The context manager closes the underlying file handle, which matters when this
    # function is called once per article just to probe that the file can be opened.
    with Image.open(img_path) as img:
        if img_show:
            img.show()


def find_no_img_item(df):
    """Return the index labels of the rows of ``df`` whose image cannot be opened.

    The article id of each row is taken from the first column of ``df``.
    """
    no_img = []

    # Iterate the first column directly: this avoids building a Series per row and the
    # deprecated positional `row[0]` lookup on a label-indexed Series.
    for row_label, article_id in tqdm(df.iloc[:, 0].items(), total=len(df)):
        try:
            # NOTE(review): `no_img` holds row labels while img_by_id compares it against
            # article ids, so this skip list is unlikely to ever match; kept unchanged.
            img_by_id(df, article_id, no_list=no_img, echo=0, img_show=False)
        except OSError:
            # Missing file (FileNotFoundError) or unreadable image. Catching OSError
            # instead of everything lets KeyboardInterrupt and programming errors surface.
            no_img.append(row_label)

    return no_img

In [None]:
# Index labels of the rows in n_item_data for which opening the image failed.
no_img_idx = find_no_img_item(n_item_data)

In [None]:
# Report how many items have no usable image.
print("# of non-img item : ", len(no_img_idx))

In [None]:
# Remove the rows without a usable image and renumber the index.
# `index=` already names the axis, so no separate `axis` argument is passed.
n_item_data = n_item_data.drop(index=no_img_idx).reset_index(drop=True)
print("shape of n_item_data : ", n_item_data.shape)

In [None]:
# Keep only the transactions whose article survived the item filtering above.
n_interaction_data = interaction_data.loc[
    interaction_data["article_id"].isin(n_item_data["article_id"])
].reset_index(drop=True)
print("shape of interaction data : ", interaction_data.shape)
print("shape of n_interaction_data : ", n_interaction_data.shape)

In [None]:
def core_checker(df, group, target, threshold):
    """Keep the rows of ``df`` whose ``group`` value has enough distinct ``target`` values.

    A ``group`` value is kept when it is associated with at least ``threshold``
    distinct values of the ``target`` column.
    """
    distinct_per_group = df.groupby(group)[target].nunique()
    keep_keys = distinct_per_group.loc[distinct_per_group >= threshold].index
    keep_mask = df[group].isin(keep_keys)
    return df.loc[keep_mask]


def data_cutter(origin_data, u_core, i_core):
    """Apply user/item core filtering repeatedly until no more rows are removed.

    Each pass keeps customers with at least ``u_core`` distinct articles and then
    articles with at least ``i_core`` distinct customers.

    Args:
        origin_data: Interaction frame with ``customer_id`` and ``article_id`` columns.
        u_core: Minimum number of distinct articles per customer.
        i_core: Minimum number of distinct customers per article.

    Returns:
        The filtered frame (possibly empty). Summary statistics are printed.
    """
    print("### before ###")
    print("shape of n_interaction_data : ", origin_data.shape)

    while True:
        new_data = core_checker(origin_data, "customer_id", "article_id", u_core)
        new_data = core_checker(new_data, "article_id", "customer_id", i_core)

        # core_checker only ever removes rows, so an unchanged row count means nothing
        # was filtered in this pass; this is much cheaper than a full frame comparison.
        if len(new_data) == len(origin_data):
            print("finish")
            break

        origin_data = new_data

    n_users = new_data.customer_id.nunique()
    n_items = new_data.article_id.nunique()
    # Guard against ZeroDivisionError when the thresholds filtered out every interaction.
    if n_users and n_items:
        density = new_data.shape[0] / (n_users * n_items) * 100
    else:
        density = 0.0

    print("### after all item sampled ###")
    print(f"### user_core : {u_core}, item_core : {i_core} ###")
    print("shape of n_interaction_data : ", new_data.shape)
    print("num of user : ", n_users)
    print("num of item : ", n_items)
    print("data density : ", density, "%")

    return new_data

In [None]:
# Core thresholds: minimum distinct articles per customer / distinct customers per article.
# They are passed as variables so the values written to the metadata files always
# match the ones actually used for filtering.
user_core = 15
item_core = 10
core_inter_data = data_cutter(n_interaction_data, user_core, item_core)

In [None]:
# Alias for the core-filtered interactions (no copy is made).
new_interaction_data = core_inter_data

In [None]:
# Summary statistics of the core-filtered interactions, written to metadata.json.
# NOTE(review): "shape of unique_data" stores the same value as "shape of interaction data";
# no de-duplication has been applied at this point — confirm which shape was intended.
metadata = {
    "shape of interaction data": new_interaction_data.shape,
    "user_core": user_core,
    "item_core": item_core,
    "shape of unique_data": new_interaction_data.shape,
    "num of user": new_interaction_data.customer_id.nunique(),
    "num of item": new_interaction_data.article_id.nunique(),
    # Percentage of observed (customer, article) rows over all possible pairs.
    "data density": f"{new_interaction_data.shape[0]/(new_interaction_data.customer_id.nunique()*new_interaction_data.article_id.nunique())*100}%",
}

dump_json(metadata, f"{data_dir}/metadata.json")

In [None]:
# Keep only the items that still appear in the core-filtered interactions.
n_item_data = n_item_data.loc[
    n_item_data["article_id"].isin(new_interaction_data["article_id"])
].reset_index(drop=True)

In [None]:
from fashion_clip.fashion_clip import FashionCLIP

# Model used by the cells below to encode item images and item texts.
fclip = FashionCLIP("fashion-clip")

In [None]:
# One image path per article, in n_item_data row order:
# <data_dir>/images/<"0" + first two id digits>/<"0" + id>.jpg
images = [
    f"{data_dir}/images/0{str(article_id)[0:2]}/0{article_id}.jpg"
    for article_id in n_item_data["article_id"].tolist()
]

In [None]:
# Encode the item images with FashionCLIP, 700 images per batch.
# Presumably yields one embedding per path, in input order — it is zipped with the
# article ids in the next cell; verify against the fashion_clip API.
image_fclip = fclip.encode_images(images, batch_size=700)

In [None]:
# Mapping {article_id: image embedding tensor}; ids and embeddings are paired by position.
id_img_emb_map = dict(
    zip(n_item_data["article_id"].tolist(), map(torch.tensor, image_fclip))
)

In [None]:
# Normalise the text columns: missing values and values equal to "Unknown" both become " ".
# The columns are independent, so each one is filled and replaced in a single pass.
for _col in (
    "prod_name",
    "detail_desc",
    "colour_group_name",
    "graphical_appearance_name",
):
    n_item_data[_col] = n_item_data[_col].fillna(" ").replace("Unknown", " ")

In [None]:
# Text fed to the text encoder: description, colour group and graphical appearance,
# each truncated to 100 characters and joined by single spaces.
# The subscripts use single quotes throughout: reusing the enclosing double quote inside
# an f-string is a SyntaxError before Python 3.12.
n_item_data["desc"] = n_item_data.apply(
    lambda row: (
        f"{row['detail_desc'][:100]} "
        f"{row['colour_group_name'][:100]} "
        f"{row['graphical_appearance_name'][:100]}"
    ),
    axis=1,
)

In [None]:
# Item descriptions as a plain list, in n_item_data row order.
texts = n_item_data["desc"].tolist()

In [None]:
# Encode the item descriptions with FashionCLIP, 64 texts per batch.
# Presumably yields one embedding per text, in input order — verify against the
# fashion_clip API.
text_fclip = fclip.encode_text(texts, batch_size=64)

In [None]:
# Mapping {article_id: text embedding tensor}; ids and embeddings are paired by position.
id_text_emb_map = dict(
    zip(n_item_data["article_id"].tolist(), map(torch.tensor, text_fclip))
)

In [None]:
# Contiguous integer index per raw id, numbered in the order returned by .unique().
# {customer_id: idx}
user2idx = {
    customer_id: idx
    for idx, customer_id in enumerate(new_interaction_data["customer_id"].unique())
}
# {article_id: idx}
item2idx = {
    article_id: idx
    for idx, article_id in enumerate(n_item_data["article_id"].unique())
}

print("# of user", len(user2idx))
print("# of item", len(item2idx))

# Persist both lookup tables next to the other artifacts.
torch.save(item2idx, f"{data_dir}/item2idx.pt")
torch.save(user2idx, f"{data_dir}/user2idx.pt")

In [None]:
# Re-key both embedding tables from raw article id to item index.
# Only the article_id column is needed, so it is iterated directly.
idx_img_emb_map = {
    item2idx[article_id]: id_img_emb_map[article_id]
    for article_id in tqdm(n_item_data["article_id"], total=len(n_item_data))
}
idx_text_emb_map = {
    item2idx[article_id]: id_text_emb_map[article_id]
    for article_id in tqdm(n_item_data["article_id"], total=len(n_item_data))
}

In [None]:
# Persist the {item index: embedding} tables for images and texts.
torch.save(idx_img_emb_map, f"{data_dir}/idx_img_emb_map.pt")
torch.save(idx_text_emb_map, f"{data_dir}/idx_text_emb_map.pt")

In [None]:
# Mapping {item index: item description text}.
# The description is stored in the "desc" column; no visible cell creates a "text"
# column, so looking up "text" would raise KeyError.
idx_meta_map = {
    item2idx[article_id]: desc
    for article_id, desc in zip(n_item_data["article_id"], n_item_data["desc"])
}

In [None]:
# Only the id columns are needed from here on. `.copy()` makes these independent frames,
# so the column assignments in the next cell do not write into a slice of another
# frame (which triggers SettingWithCopyWarning).
n_item_data = n_item_data[["article_id"]].copy()
new_interaction_data = new_interaction_data[["customer_id", "article_id"]].copy()

In [None]:
# Replace raw customer / article ids with their integer indices from user2idx / item2idx.
new_interaction_data["customer_id"] = new_interaction_data["customer_id"].map(user2idx)
new_interaction_data["article_id"] = new_interaction_data["article_id"].map(item2idx)
n_item_data["article_id"] = n_item_data["article_id"].map(item2idx)

#### train/valid/test split

In [None]:
# Collapse repeated (article, customer) pairs, keeping the last occurrence of each pair.
unique_data = new_interaction_data.drop_duplicates(
    subset=["article_id", "customer_id"],
    keep="last",
)

In [None]:
# Summary statistics of the de-duplicated interactions, written to uniqued_metadata.json.
# The core thresholds come from user_core / item_core rather than repeated literals, so
# the recorded values cannot drift from the ones used for filtering.
metadata = {
    "shape of interaction data": unique_data.shape,
    "user_core": user_core,
    "item_core": item_core,
    "num of user": unique_data.customer_id.nunique(),
    "num of item": unique_data.article_id.nunique(),
    # Percentage of observed (customer, article) pairs over all possible pairs.
    "data density": f"{unique_data.shape[0]/(unique_data.customer_id.nunique()*unique_data.article_id.nunique())*100}%",
}

dump_json(metadata, f"{data_dir}/uniqued_metadata.json")

In [None]:
# Enable progress_apply on pandas objects, then gather each customer's article indices
# into a list; dict(...) turns the grouped Series into {customer index: [article indices]}.
tqdm.pandas()
test_data = dict(unique_data.groupby("customer_id")["article_id"].progress_apply(list))

In [None]:
# Keep only the per-customer item lists, in the dict's iteration order, and display them.
test_data = list(test_data.values())
test_data

In [None]:
# Persist the list of per-customer item lists.
torch.save(test_data, f"{data_dir}/uniqued_test_data.pt")