In [None]:
import ast
import json
import os
import pickle
import random
import threading
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm

# Register `progress_apply` on pandas objects; later cells rely on it.
tqdm.pandas()

In [None]:
def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook can touch.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch, and
    exports `PYTHONHASHSEED` for any child process started afterwards.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # torch is imported by this notebook, so its RNG must be seeded as well.
    torch.manual_seed(seed)


seed = 42
# Pass the variable (not a second literal) so there is one source of truth.
seed_everything(seed)

In [None]:
def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(review): unpickling can execute arbitrary code, so only call this
    on files from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


def dump_json(data, path):
    """Write `data` as JSON to the text file at `path`, replacing any content."""
    with open(path, "w") as handle:
        json.dump(data, fp=handle)

In [None]:
def mk_dir(file_path):
    """Create the directory `file_path` (and missing parents) if needed.

    Calling it on an existing directory is a no-op. `exist_ok=True` replaces
    the previous exists-then-create check, which could race with another
    creator. A path that exists as a regular file raises `FileExistsError`.
    """
    os.makedirs(file_path, exist_ok=True)


def get_timestamp(date_format: str = "%d%H%M%S") -> str:
    """Format the current local time with `date_format` (default: day, hour, minute, second)."""
    return datetime.now().strftime(date_format)


In [None]:
# k-core threshold used in the names of the output files.
n_core = 5
# Root directory for every artifact this notebook writes (plain string: no placeholders to format).
data_dir = "./data/home"
mk_dir(data_dir)

#### Prepare: load raw metadata and reviews, download item images

In [None]:
import pandas as pd
import gzip


def parse(path):
    """Yield one parsed record per line of the gzip-compressed file at `path`.

    Each line must hold a single Python/JSON-style literal (dict, list,
    string, number, ...).

    Raises:
        ValueError / SyntaxError: if a line is not a plain literal.
    """
    # Text mode + context manager: the handle is closed when the generator finishes.
    with gzip.open(path, "rt", encoding="utf-8") as lines:
        for line in lines:
            # literal_eval instead of eval: the file is downloaded data, so
            # it must never be able to execute code.
            yield ast.literal_eval(line)


def getDF(path):
    """Load every record produced by `parse(path)` into a DataFrame.

    Rows are indexed 0..n-1 in file order; a progress bar tracks the read.
    """
    records = {}
    for position, record in enumerate(tqdm(parse(path))):
        records[position] = record
    return pd.DataFrame.from_dict(records, orient="index")


# download raw data from https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/links.html
# Both archives are read from the notebook's working directory.
# meta_df: item metadata; inter_df: 5-core reviews (user-item interactions).
meta_df = getDF("./meta_Home_and_Kitchen.json.gz") 
inter_df = getDF("./reviews_Home_and_Kitchen_5.json.gz")

In [None]:
# Inspect the columns of the review (interaction) table.
inter_df.columns

In [None]:
# Inspect the columns of the item metadata table.
meta_df.columns

In [None]:
# Keep only metadata rows for items that appear in at least one review.
meta_df = meta_df[meta_df.asin.isin(inter_df.asin.unique())].reset_index(drop=True)

In [None]:
# (number of distinct users, number of distinct items) in the reviews.
# A cell renders only its last expression, so both counts go in one tuple.
inter_df.reviewerID.nunique(), inter_df.asin.nunique()

In [None]:
# (rows, columns) of the filtered metadata.
meta_df.shape

In [None]:
# NOTE(review): `temp` is not referenced by any later visible cell — candidate for removal.
temp = meta_df[["imUrl", "asin"]]

In [None]:
import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from PIL import Image
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


def create_session():
    """Return a `requests.Session` that retries failed requests.

    Up to 5 retries with a 0.3 backoff factor are applied to responses with
    status 500, 502, 503 or 504, for both http:// and https:// URLs.
    """
    retry_policy = Retry(
        total=5, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504]
    )
    http = requests.Session()
    # One adapter instance is shared by both URL schemes.
    shared_adapter = HTTPAdapter(max_retries=retry_policy)
    for scheme in ("http://", "https://"):
        http.mount(scheme, shared_adapter)
    return http


# One HTTP session (with the retry policy) shared by every download call.
session = create_session()
# Browser-like User-Agent header sent with each image request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
# ASINs whose image could not be fetched or decoded; filled by download_and_save_image.
no_img_list = []


# Serializes updates to `no_img_list` and its on-disk checkpoint: the function
# below runs in many worker threads, and unsynchronized writes to the same
# file could interleave.
_no_img_lock = threading.Lock()


def _record_missing_image(asin):
    """Add `asin` to `no_img_list` (once) and checkpoint the list to disk."""
    with _no_img_lock:
        # A retry pass can fail on the same item again; keep the list duplicate-free.
        if asin not in no_img_list:
            no_img_list.append(asin)
        torch.save(no_img_list, f"{data_dir}/no_img.pt")


def download_and_save_image(row, img_path):
    """Download one item image and store it as `<img_path>/<asin>.jpg`.

    Args:
        row: Object exposing `imUrl` (image URL) and `asin` (item id).
        img_path: Existing directory that receives the JPEG.

    Failures never propagate: the error is printed and the ASIN is recorded
    in `no_img_list`, which is saved to `<data_dir>/no_img.pt`.
    """
    try:
        images = row.imUrl
        parent_asin = row.asin

        response = session.get(images, headers=headers, timeout=10)
        response.raise_for_status()

        # Context manager releases the decoder's resources once the file is written.
        with Image.open(BytesIO(response.content)) as raw_img:
            raw_img.convert("RGB").save(f"{img_path}/{parent_asin}.jpg", "JPEG")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {row.asin}: {e}, save")
        _record_missing_image(row.asin)
    except Exception as e:
        print(f"Error processing {row.asin}: {e}, save")
        _record_missing_image(row.asin)


# Directory that receives one JPEG per item.
img_path = f"{data_dir}/images"
mk_dir(img_path)

# Fan the downloads out over 100 worker threads; the loop only drains the
# lazy result iterator so the progress bar advances.
image_rows = meta_df[["imUrl", "asin"]].itertuples(index=False)
with ThreadPoolExecutor(max_workers=100) as executor:
    downloads = executor.map(
        lambda row: download_and_save_image(row, img_path), image_rows
    )
    for _ in tqdm(downloads, total=len(meta_df)):
        pass

In [None]:
# Retry only the items that failed in the first pass.
# Snapshot the failures and clear the shared list first: afterwards
# `no_img_list` holds only the items that failed again. Without this, items
# that succeed on retry would still be dropped by the later image filter.
retry_asins = set(no_img_list)
no_img_list.clear()
no_img_data = meta_df[meta_df["asin"].isin(retry_asins)]
with ThreadPoolExecutor() as executor:
    for _ in tqdm(
        executor.map(
            lambda row: download_and_save_image(row, img_path),
            no_img_data[["imUrl", "asin"]].itertuples(index=False),
        ),
        total=len(no_img_data),
    ):
        pass

# Refresh the checkpoint even when every retry succeeded (the download
# function only writes it on failure).
torch.save(no_img_list, f"{data_dir}/no_img.pt")

In [None]:
# Drop every item recorded as having no usable image.
has_image = ~meta_df["asin"].isin(no_img_list)
meta_df = meta_df[has_image].reset_index(drop=True)
meta_df.shape

In [None]:
# Interactions restricted to items still present in `meta_df` (items without an image were removed above).
ava_data = inter_df[inter_df["asin"].isin(meta_df["asin"])].reset_index(drop=True)

#### k-core filtering of the interaction data

In [None]:
# Aliases used by the k-core cells below (no copy is made).
meta_data = meta_df
inter_data = ava_data

In [None]:
# Sanity check of the inputs to k-core filtering.
print(meta_data.columns)
print(meta_data.shape)
print(inter_data.shape)

##### k-core helper functions

In [None]:
def core_checker(df, group, target, threshold):
    """Keep rows whose `group` value has at least `threshold` distinct `target` values.

    Args:
        df: Interaction table.
        group: Column whose values are being qualified (e.g. the user id).
        target: Column whose distinct values are counted per group.
        threshold: Minimum number of distinct `target` values required.

    Returns:
        The subset of `df` (original index and row order) that qualifies.
    """
    distinct_per_group = df.groupby(group)[target].nunique()
    qualifying = distinct_per_group.index[distinct_per_group >= threshold]
    keep_mask = df[group].isin(qualifying)
    return df[keep_mask]


def data_cutter(origin_data, user_core, item_core):
    """Iteratively apply user/item core filtering until nothing more is removed.

    Each round keeps users with at least `user_core` distinct items, then
    items with at least `item_core` distinct users. Removing items can push
    users below the threshold, so the rounds repeat until the table is stable.

    Args:
        origin_data: Interactions with `reviewerID` and `asin` columns.
        user_core: Minimum distinct items per user.
        item_core: Minimum distinct users per item.

    Returns:
        The filtered interactions (possibly empty). Summary statistics are
        printed along the way.
    """
    print("### before ###")
    print("shape of n_interaction_data : ", origin_data.shape)

    while True:
        new_data = core_checker(origin_data, "reviewerID", "asin", user_core)
        new_data = core_checker(new_data, "asin", "reviewerID", item_core)

        # Filtering only removes rows, so an unchanged row count means an
        # unchanged table — far cheaper than a full DataFrame.equals().
        if len(new_data) == len(origin_data):
            print("finish")
            break

        origin_data = new_data

    n_users = new_data.reviewerID.nunique()
    n_items = new_data.asin.nunique()
    # Guard the empty result: 0 users/items would otherwise divide by zero.
    if n_users and n_items:
        density = new_data.shape[0] / (n_users * n_items) * 100
    else:
        density = 0.0

    print("### after all item sampled ###")
    print(f"### user_core : {user_core}, item_core : {item_core} ###")
    print("shape of n_interaction_data : ", new_data.shape)
    print("num of user : ", n_users)
    print("num of item : ", n_items)
    print("data density : ", density, "%")

    return new_data

In [None]:
# Use the configured threshold so the filter always matches the `{n_core}_core_*` file names.
core_inter_data = data_cutter(inter_data, n_core, n_core)

### before ###
shape of n_interaction_data :  (550457, 9)
finish
### after all item sampled ###
### user_core : 5, item_core : 5 ###
shape of n_interaction_data :  (548769, 9)
num of user :  66167
num of item :  28125
data density :  0.02948869778993557 %


In [None]:
# `data_dir` is already a complete path: build the file name exactly as the
# later `pd.read_csv` does (an extra "./" would point elsewhere if data_dir were absolute).
core_inter_data.to_csv(f"{data_dir}/{n_core}_core_inter_data.csv", index=False)

In [None]:
# Keep only metadata rows for items that survived k-core filtering.
core_meta_data = meta_data[meta_data["asin"].isin(core_inter_data["asin"])].reset_index(
    drop=True
)

In [None]:
# `data_dir` is already a complete path: build the file name exactly as the
# later `pd.read_csv` does (an extra "./" would point elsewhere if data_dir were absolute).
core_meta_data.to_csv(f"{data_dir}/{n_core}_core_meta_data.csv", index=False)

In [None]:
# Report the sizes of the k-core item and interaction tables.
print("shape of n_item_data : ", core_meta_data.shape)
print("shape of new_interaction_data : ", core_inter_data.shape)

In [None]:
# Summary statistics of the k-core dataset, stored next to the data.
# Distinct counts are computed once and reused for the density.
n_core_users = core_inter_data.reviewerID.nunique()
n_core_items = core_inter_data.asin.nunique()

metadata = {
    "shape of interaction data": core_inter_data.shape,
    # Record the configured threshold rather than a second hard-coded literal.
    "user_core": n_core,
    "item_core": n_core,
    "shape of meta_data": core_meta_data.shape,
    "num of user": n_core_users,
    "num of item": n_core_items,
    "data density": f"{core_inter_data.shape[0]/(n_core_users*n_core_items)*100}%",
}

dump_json(metadata, f"{data_dir}/metadata.json")

#### MMdata: multimodal (image + text) item embeddings

In [None]:
# Reload the k-core interaction and item tables from their CSV files.
new_interaction_data = pd.read_csv(f"{data_dir}/{n_core}_core_inter_data.csv")
n_item_data = pd.read_csv(f"{data_dir}/{n_core}_core_meta_data.csv")

print("shape of n_item_data : ", n_item_data.shape)
print("shape of new_interaction_data : ", new_interaction_data.shape)

In [None]:
from fashion_clip.fashion_clip import FashionCLIP

# Load the "fashion-clip" model; `fclip` encodes the item images and texts below.
fclip = FashionCLIP("fashion-clip")

In [None]:
def _image_file(asin):
    """Path of the JPEG saved for `asin` under the image directory."""
    return f"{data_dir}/images/{asin}.jpg"


# One image path per item row, in row order.
images = n_item_data["asin"].progress_apply(_image_file).to_list()

In [None]:
# Encode every item image, 700 images per batch.
image_fclip = fclip.encode_images(images, batch_size=700)

In [None]:
# {asin: image embedding tensor}; embeddings are paired with items by row order.
item_asins = n_item_data["asin"].tolist()
id_img_emb_map = {
    asin: torch.tensor(vector) for asin, vector in zip(item_asins, image_fclip)
}

In [None]:
# Peek at the raw category value of the first item as loaded from CSV.
n_item_data["categories"][0]

In [None]:
def _first_category_path(raw):
    """Parse a CSV-serialized categories value and return its first entry.

    Strings are parsed with `ast.literal_eval` (never `eval`: the text
    originates from downloaded data). A parsed list yields its first element,
    or an empty list when it has none; any other value is returned unchanged.
    """
    parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
    if isinstance(parsed, list):
        return parsed[0] if parsed else []
    return parsed


n_item_data["categories_f"] = n_item_data["categories"].progress_apply(
    _first_category_path
)

In [None]:
# Flatten every item's `categories_f` value into a single list of labels.
cat_list = [y for x in n_item_data["categories_f"] for y in x]

In [None]:
# Show a small sample only; rendering the full list floods the notebook output.
cat_list[:20]

In [None]:
from collections import Counter

# Most frequent label across all items (the root category removed in the next cell).
Counter(cat_list).most_common(1)

In [None]:
# Strip the dataset-wide root label from each item's category list.
# The lists are edited in place (first occurrence only), so the column needs no reassignment.
for item_categories in tqdm(n_item_data["categories_f"], total=len(n_item_data)):
    if "Home & Kitchen" in item_categories:
        item_categories.remove("Home & Kitchen")

In [None]:
# for idx, row in tqdm(n_item_data.iterrows(), total=len(n_item_data)):
#     if "Clothing, Shoes & Jewelry" in n_item_data.at[idx, "categories_f"]:
#         n_item_data.at[idx, "categories_f"].remove("Clothing, Shoes & Jewelry")

In [None]:
def _or_unknown(item_categories):
    """Return the categories unchanged, or ["Unknown"] when there are none."""
    if len(item_categories):
        return item_categories
    return ["Unknown"]


n_item_data["categories_f"] = n_item_data["categories_f"].apply(_or_unknown)

In [None]:
# Replace missing text fields with a single space so they can be concatenated below.
for text_column in ("description", "title", "brand"):
    n_item_data[text_column] = n_item_data[text_column].fillna(" ")

In [None]:
# One text per item: "<title> <brand> <category labels...> <description>",
# with newlines flattened to spaces.
sentences = []
for _, row in n_item_data.iterrows():
    parts = [row["title"], row["brand"]]
    # literal_eval instead of eval: the string comes from downloaded data
    # and must not be able to execute code.
    cates = ast.literal_eval(row["categories"])
    # Only the first category path is used; skip it when the list is empty.
    if isinstance(cates, list) and cates:
        parts.extend(cates[0])
    sen = " ".join(parts) + " " + row["description"]
    sentences.append(sen.replace("\n", " "))

In [None]:
# Attach the composed text to its item row.
n_item_data["sentences"] = sentences

In [None]:
# Encode every item text, 128 texts per batch.
# NOTE(review): a pandas Series is passed here while the image call passes a plain list — confirm encode_text accepts a Series.
t_fclip = fclip.encode_text(n_item_data["sentences"], batch_size=128)

In [None]:
# {asin: text embedding tensor}; embeddings are paired with items by row order.
text_item_asins = n_item_data["asin"].tolist()
id_text_emb_map_t = {
    asin: torch.tensor(vector) for asin, vector in zip(text_item_asins, t_fclip)
}

In [None]:
# Contiguous integer ids, assigned in order of first appearance.
unique_users = new_interaction_data["reviewerID"].unique()
unique_items = n_item_data["asin"].unique()

user2idx = {reviewer: position for position, reviewer in enumerate(unique_users)}
item2idx = {asin: position for position, asin in enumerate(unique_items)}

print("# of user", len(user2idx))
print("# of item", len(item2idx))

# Persist both lookups.
torch.save(item2idx, f"{data_dir}/item2idx.pt")
torch.save(user2idx, f"{data_dir}/user2idx.pt")

In [None]:
# Re-key both embedding lookups from asin to the integer item index.
idx_img_emb_map = {}
for asin in tqdm(n_item_data["asin"], total=len(n_item_data)):
    idx_img_emb_map[item2idx[asin]] = id_img_emb_map[asin]

idx_text_emb_map = {}
for asin in tqdm(n_item_data["asin"], total=len(n_item_data)):
    idx_text_emb_map[item2idx[asin]] = id_text_emb_map_t[asin]

In [None]:
# Persist the index-keyed image and text embedding lookups.
torch.save(idx_img_emb_map, f"{data_dir}/idx_img_emb_map.pt")
torch.save(idx_text_emb_map, f"{data_dir}/idx_text_emb_map.pt")

In [None]:
# Order interactions by user, then by review time within each user.
sorted_iteraction_data = new_interaction_data.sort_values(
    ["reviewerID", "unixReviewTime"]
).reset_index(drop=True)
sorted_iteraction_data

In [None]:
# Replace raw ids with integer indices; an id missing from a lookup would become NaN.
sorted_iteraction_data["reviewerID"] = sorted_iteraction_data["reviewerID"].map(
    user2idx
)
sorted_iteraction_data["asin"] = sorted_iteraction_data["asin"].map(item2idx)
sorted_iteraction_data.head()

In [None]:
# Keep only the (user index, item index) columns.
sorted_iteraction_data = sorted_iteraction_data[["reviewerID", "asin"]]

In [None]:
# Collapse repeated (item, user) pairs, keeping the most recent occurrence.
unique_data = sorted_iteraction_data.drop_duplicates(
    ["asin", "reviewerID"], keep="last"
)
# Smallest number of interactions any user has left after de-duplication.
# The original grouped by an empty column name, which raises KeyError;
# grouping by user is presumably what was intended.
unique_data.groupby("reviewerID")["asin"].count().min()

In [None]:
# Summary statistics of the de-duplicated interactions.
# Distinct counts are computed once and reused for the density.
n_unique_users = unique_data.reviewerID.nunique()
n_unique_items = unique_data.asin.nunique()

metadata = {
    "shape of interaction data": unique_data.shape,
    # Record the configured threshold rather than a second hard-coded literal.
    "user_core": n_core,
    "item_core": n_core,
    "num of user": n_unique_users,
    "num of item": n_unique_items,
    "data density": f"{unique_data.shape[0]/(n_unique_users*n_unique_items)*100}%",
}

dump_json(metadata, f"{data_dir}/uniqued_metadata.json")

In [None]:
# {user index: list of item indices}, built per user from `unique_data`.
test_data = dict(unique_data.groupby("reviewerID")["asin"].progress_apply(list))

In [None]:
# Drop the keys: one item sequence per user, in the dict's iteration order.
test_data = list(test_data.values())
# Preview only: rendering every sequence would flood the notebook output.
len(test_data), test_data[:5]

In [None]:
# Persist the per-user item sequences.
torch.save(test_data, f"{data_dir}/uniqued_test_data.pt")