In [1]:
import ast
import gzip
import json

import pandas as pd

In [2]:
def parse(path):
    """Lazily yield one record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a Python literal (e.g. a dict written with
    single quotes), which is why ``json.loads`` is not used here.

    Args:
        path: Path to the ``.gz`` file to read.

    Yields:
        The Python object described by each line.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # Text mode with UTF-8 matches how `eval` decoded the raw bytes before;
    # the `with` block closes the handle even if the consumer stops early.
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            # literal_eval only accepts literals, so data in the file can no
            # longer execute arbitrary code the way `eval` allowed.
            yield ast.literal_eval(line)


def getDF(path):
    """Read every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and those keys
    become the index of the returned frame.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient="index")


# Load the full product metadata file into a DataFrame (one row per parsed record).
df = getDF("../data/meta_Beauty.json.gz")

In [3]:
file_name = "../data/reviews_Beauty_5.json"

unique_items = set()
unique_users = set()

# The reviews file holds one JSON object per line; stream it so the whole file
# never has to sit in memory. The encoding is explicit so the result does not
# depend on the platform's default locale.
with open(file_name, "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # a blank or trailing line would make json.loads raise
        review = json.loads(line)
        unique_items.add(review["asin"])
        unique_users.add(review["reviewerID"])

print(f"Number of unique items: {len(unique_items)}")
print(f"Number of unique users: {len(unique_users)}")

Number of unique items: 12101
Number of unique users: 22363


In [5]:
# Keep only products that appear in the reviews file. `.copy()` makes the result
# an independent frame, so the column assignments in later cells write to it
# directly instead of to a slice of the original (SettingWithCopyWarning).
df = df[df['asin'].isin(unique_items)].copy()
len(df)

12101

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Checkpoint used for text embeddings.
model_name = "google-t5/t5-small"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the seq2seq model, then move the model to `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

In [14]:
def preprocess(row: pd.Series) -> str:
    """Build the text that is embedded for one product row.

    Args:
        row: A row with ``description``, ``title`` and ``categories`` entries.
            ``categories`` is expected to be a list of category paths (each a
            list of strings); only the first path is used.

    Returns:
        ``"Description: ... . Title: ... . Categories: a, b, c"``. Missing
        fields, including a missing or empty ``categories``, are rendered as
        ``"empty"``.
    """
    missing = "empty"  # unknown?
    row = row.fillna(missing)
    # remove column description / title / cat?

    categories = row["categories"]
    if isinstance(categories, (list, tuple)) and len(categories) > 0:
        categories_text = ", ".join(categories[0])
    else:
        # A NaN `categories` was replaced by the string "empty" above; indexing
        # that string used to produce "e", and an empty list raised IndexError.
        categories_text = missing

    return f"Description: {row['description']}. Title: {row['title']}. Categories: {categories_text}"


# Build the text to embed, one string per product row.
df["combined_text"] = df.apply(preprocess, axis=1)

In [16]:
def encode_text(text):
    """Embed a text with the T5 encoder by mean-pooling its token states.

    Args:
        text: The string to embed. It is truncated to the tokenizer's maximum
            length.

    Returns:
        A CPU tensor detached from autograd; for a single string this is a
        1-D vector of the encoder's hidden size.
    """
    enc = tokenizer(text, return_tensors="pt", truncation=True).to(device)

    # Pure feature extraction: disable autograd here so the function is cheap
    # even when the caller forgets to wrap it in `torch.no_grad()`.
    with torch.no_grad():
        output = model.encoder(
            input_ids=enc["input_ids"],
            attention_mask=enc["attention_mask"],
            return_dict=True,
        )

    # Mean over the token axis (T5 has no [CLS] token to use instead), then drop
    # only the leading batch axis; a bare `squeeze()` would also collapse any
    # other size-1 dimension.
    embeddings = output.last_hidden_state.mean(dim=1).squeeze(0)

    return embeddings.detach().cpu()

In [17]:
from tqdm import tqdm

# Register `progress_apply` on pandas objects so the loop below shows a progress bar.
tqdm.pandas()

# Embeddings are only used as features, so autograd bookkeeping is switched off
# while every combined text is encoded (one `encode_text` call per row).
with torch.no_grad():
    df["embeddings"] = df["combined_text"].progress_apply(encode_text)

100%|██████████| 12101/12101 [01:00<00:00, 199.89it/s]


In [18]:
# Preview the first rows, including the new `combined_text` and `embeddings` columns.
df.head()

Unnamed: 0,asin,description,title,imUrl,salesRank,categories,price,related,brand,combined_text,embeddings
115,7806397051,An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[[Beauty, Makeup, Face, Concealers & Neutraliz...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA,Description: An extensive range of 15 multiple...,"[tensor(-0.0002), tensor(0.0026), tensor(0.008..."
179,9759091062,Xtreme Brite Brightening gel is a highly conc...,Xtreme Brite Brightening Gel 1oz.,http://ecx.images-amazon.com/images/I/41QWW9v1...,{'Beauty': 52254},"[[Beauty, Hair Care, Styling Products, Creams,...",19.99,"{'also_bought': ['B0054GLD1U', 'B003BRZCUC', '...",Xtreme Brite,Description: Xtreme Brite Brightening gel is ...,"[tensor(0.0054), tensor(0.0238), tensor(-0.015..."
192,9788072216,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,Prada Candy By Prada Eau De Parfum Spray 1.7 O...,http://ecx.images-amazon.com/images/I/51iT2k6L...,{'Beauty': 78916},"[[Beauty, Fragrance, Women's, Eau de Parfum]]",65.86,"{'also_bought': ['B006C5OHSI', 'B006P14842', '...",Prada,Description: Prada Candy By Prada Eau De Parfu...,"[tensor(-0.0383), tensor(0.0212), tensor(-0.01..."
555,9790790961,Versace Bright Crystal Perfume for Women 3 oz ...,Versace Bright Crystal Eau de Toilette Spray f...,http://ecx.images-amazon.com/images/I/418LYGLE...,{'Beauty': 764},"[[Beauty, Fragrance, Women's, Eau de Toilette]]",52.33,"{'also_bought': ['B007P7OPQQ', 'B0017JT658', '...",Versace,Description: Versace Bright Crystal Perfume fo...,"[tensor(0.0284), tensor(0.0173), tensor(0.0334..."
587,9790794231,STELLA For Women By STELLA MCCARTNEY 1.7 oz ED...,Stella McCartney Stella,http://ecx.images-amazon.com/images/I/31L2n60J...,{'Beauty': 142503},"[[Beauty, Fragrance, Women's, Eau de Parfum]]",,"{'also_bought': ['B0019M21OQ', 'B000E7YM8K', '...",,Description: STELLA For Women By STELLA MCCART...,"[tensor(0.0138), tensor(0.0021), tensor(0.0366..."


In [19]:
# Stack the per-row embedding tensors into one 2-D tensor (one row per product).
embs = torch.stack(df["embeddings"].tolist())

In [23]:
# Sanity check: expect (number of products, embedding size).
embs.shape

torch.Size([12101, 512])

In [24]:
import random

from rqvae import RQVAE


# `embs` is rebound from a tensor to a dict at the end of this cell. Recover the
# raw tensor first so that re-running the cell does not fail on `dict.shape` /
# `dict.to`; later cells keep reading `embs["embedding"]` as before.
item_embeddings = embs["embedding"] if isinstance(embs, dict) else embs

# NOTE(review): presumably four quantisation levels with 256 codes each; the
# model is used untrained here — confirm against the `rqvae` module.
rqvae = RQVAE(
    input_dim=item_embeddings.shape[1],
    hidden_dim=128,
    beta=0.25,
    codebook_sizes=[256] * 4,
    should_init_codebooks=False,
    should_reinit_unused_clusters=False,
).to(device)


# The model's forward pass takes a dict with the tensor under "embedding".
embs = {"embedding": item_embeddings.to(device)}

rqvae.forward(embs)

{'loss': tensor(0.0125, device='cuda:0', grad_fn=<MeanBackward0>),
 'recon_loss': tensor(0.0051, device='cuda:0'),
 'rqvae_loss': tensor(0.0074, device='cuda:0'),
 'unique/0': 31,
 'unique/1': 79,
 'unique/2': 46,
 'unique/3': 102}

In [29]:
def get_cb_tuples(embeddings):
    """Assign each embedding the index of its nearest entry in every codebook.

    Args:
        embeddings: Tensor accepted by ``rqvae.encoder`` (one row per item).

    Returns:
        A list with one tuple of Python ints per item; position ``k`` of a
        tuple is the nearest-entry index in the ``k``-th codebook. A list is
        returned rather than a one-shot ``zip`` iterator so the result can be
        iterated more than once.
    """
    # NOTE(review): every codebook is matched against the same encoder output.
    # Residual quantisation usually subtracts the chosen code before moving to
    # the next level — confirm against RQVAE.forward whether that is intended.
    with torch.no_grad():  # index lookup only, no gradients needed
        # The encoder output does not depend on the codebook, so compute it once.
        latents = rqvae.encoder(embeddings)
        ind_lists = [
            torch.cdist(latents, cb).argmin(dim=-1).tolist()
            for cb in rqvae.codebooks
        ]

    return list(zip(*ind_lists))


def search_similar_items(items_with_tuples, clust2search, limit=5):
    """Return a random sample of items whose code tuple starts with a prefix.

    Args:
        items_with_tuples: Sequence of ``(item, clust_tuple)`` pairs. It is not
            modified.
        clust2search: Prefix of cluster indices to match (tuple or list).
        limit: Maximum number of matches to return (default 5).

    Returns:
        A list of at most ``limit`` matching ``(item, clust_tuple)`` pairs in
        random order; empty when nothing matches.
    """
    prefix = tuple(clust2search)  # lets a list prefix match tuple codes

    # Shuffle a copy: shuffling the argument in place reordered the caller's list.
    candidates = list(items_with_tuples)
    random.shuffle(candidates)

    similars = []
    for item, clust_tuple in candidates:
        if len(similars) >= limit:
            break
        if tuple(clust_tuple[: len(prefix)]) == prefix:
            similars.append((item, clust_tuple))
    return similars

In [30]:
# Compute one tuple of codebook indices per product embedding.
cb_tuples = get_cb_tuples(embs["embedding"])
# Pair each title with its index tuple; this relies on `df` rows and the rows of
# `embs["embedding"]` being in the same order.
items_with_tuples = list(zip(df["title"], cb_tuples))

In [None]:
# Probe first-level codes 100..119 and print the sampled members of each
# non-empty cluster.
for first_code in range(100, 120):
    matches = search_similar_items(items_with_tuples, (first_code,))
    if not matches:
        continue
    print(first_code)
    for item, clust_tuple in matches:
        print(f"{item=} {clust_tuple=}")
        
# TODO fix collisions (remainder = last embedding, auto-increment 4th id)

104
item='Vasanti Cosmetics Brighten Up! Enzymatic Face Rejuvenator with Microderm Exfoliating Crystals - Treats Dull, Uneven Skintone' clust_tuple=(104, 249, 227, 132)
item='TEI SPA Oxyderm High Frequency Ozone Facial Tool' clust_tuple=(104, 249, 227, 136)
item='Jan Marini Benzoyl Peroxide 2.5%-8 oz' clust_tuple=(104, 142, 227, 132)
111
item='Babyliss Pro BABNT5548 2000 Watt Ionic Nano Titanium with Integrated Ion Generator Hair Dryer' clust_tuple=(111, 132, 227, 136)
112
item='10 Pcs Wearable Nail Soaker Acrylic Polish Remover Tool' clust_tuple=(112, 249, 227, 215)
item='IBD Just Gel JUPITER BLUE Soak Off Blue Green Nail Polish UV Manicure .5oz Salon' clust_tuple=(112, 249, 227, 136)
item='Ladies Beauty Box 6 Wheels Combo Set Nail Art Nailart Manicure Rhinestones Glitter Tips Deco + 2x Dotting Pen + Glue' clust_tuple=(112, 58, 227, 215)
item='Nail Soakers - 10pcs' clust_tuple=(112, 249, 227, 136)
item='IBD Just Gel GERBER DAISY Soak Off Pink Nail Polish UV Manicure Pedi .5 oz Salon' 

In [None]:
# 1 2 3 0
# 1 2 3 1
# 4 5 6 0/2
# 4 5 6 1/3

# Research last index aggregation

# 1) last index = KMeans(last residuals, n=|last codebook|) - collision
# 2) auto increment last index (check paper)
# 3) decoder
# 4) [(1 2 3), (1 2 3)] single item -> ok
# 4.1) several -> get embeddings -> score. softmax(collisions), torch.log_softmax(logits) -> score -> argmax

In [None]:
# pos emb for item & codebook (000 111 222) - item
# codebook (012 012 012)
# splitting item ?

In [50]:
# Persist the DataFrame, including the per-row embedding tensors, for later use.
# NOTE(review): torch.save pickles the whole object, so loading it back likely
# needs `torch.load(..., weights_only=False)` on recent PyTorch and should only
# be done with trusted files — confirm against the installed version.
torch.save(df, "../data/df_with_embs.pt")

In [47]:
!ls -lh ../data

total 361M
-rw-r--r-- 1 peter peter  53M дек 15 16:23 df_with_embs.pt
-rw-r--r-- 1 peter peter 154K дек 12 23:55 item_mapping.pkl
-rw-r--r-- 1 peter peter  95M дек 11 22:19 meta_Beauty.json.gz
-rw-r--r-- 1 peter peter  79M дек 12 23:37 ratings_Beauty.csv
-rw-r--r-- 1 peter peter 135M дек 15 15:33 reviews_Beauty_5.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
