In [1]:
import pandas as pd
import json
import gzip

In [2]:
def parse(path):
    """Lazily yield one record per line of a gzip-compressed metadata file.

    Every non-blank line must hold a single Python literal. The file was
    previously read with ``eval``, so lines may use Python syntax (for
    example single-quoted strings) rather than strict JSON.

    Args:
        path: Location of the gzip file to read.

    Yields:
        The object described by each non-blank line, in file order.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    import ast  # local import: not part of the notebook's import cell

    # ``with`` closes the handle even when the consumer stops iterating early
    # or a line fails to parse; the handle used to be left open.
    # newline="\n" keeps the line splitting identical to binary-mode reading.
    with gzip.open(path, "rt", encoding="utf-8", newline="\n") as lines:
        for line in lines:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing lines
            # literal_eval accepts literals only, so a crafted line cannot run
            # arbitrary code the way eval() would.
            yield ast.literal_eval(line)


def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the stream, and that
    position becomes the row label of the resulting frame.

    Args:
        path: Forwarded unchanged to ``parse``.

    Returns:
        A ``pandas.DataFrame`` built with ``from_dict(..., orient="index")``.
    """
    rows_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(rows_by_position, orient="index")


# Load the Beauty product metadata into `df` (one row per parsed record).
df = getDF("../data/meta_Beauty.json.gz")

In [None]:
# Collect the distinct item ids (`asin`) and user ids (`reviewerID`) that occur
# in the reviews file, which holds one JSON object per line.
file_name = "../data/reviews_Beauty_5.json"

unique_items = set()
unique_users = set()

# An explicit encoding keeps decoding independent of the platform default.
with open(file_name, "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # a blank line would otherwise crash json.loads
        review = json.loads(line)
        unique_items.add(review["asin"])
        unique_users.add(review["reviewerID"])

print(f"Number of unique items: {len(unique_items)}")
print(f"Number of unique users: {len(unique_users)}")

In [None]:
# Keep only the metadata rows whose `asin` was seen in the reviews file,
# then display how many rows remain.
df = df[df["asin"].isin(unique_items)]
len(df)

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "google-t5/t5-small"

# Load the tokenizer and the seq2seq model for `model_name`; the model is
# moved to `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [14]:
def preprocess(row: pd.Series):
    """Build the text that is later embedded for one metadata row.

    Args:
        row: A row holding ``description``, ``title`` and ``categories``.
            ``categories`` is expected to be a list of category paths (each
            path a list of names); only the first path is used. A flat list
            of names or a plain string is accepted as well.

    Returns:
        ``"Description: <d>. Title: <t>. Categories: <c1>, <c2>, ..."`` where
        every missing field is rendered as ``"unknown"``.
    """
    # open question kept from the original: drop description / title / categories?

    def _is_missing(value):
        # pd.isna() returns an array for list-likes, so only test scalars.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    def _text(value):
        return "unknown" if _is_missing(value) else value

    categories = row["categories"]
    if _is_missing(categories) or len(categories) == 0:
        # Previously NaN was filled with "unknown" and then indexed with [0],
        # which silently produced "Categories: u"; an empty list raised
        # IndexError.
        category_text = "unknown"
    elif isinstance(categories, str):
        category_text = categories
    else:
        first_path = categories[0]
        if isinstance(first_path, str):
            # Flat list of names: join the names themselves, not the
            # characters of the first name.
            first_path = categories
        category_text = ", ".join(map(str, first_path))

    return (
        f"Description: {_text(row['description'])}. "
        f"Title: {_text(row['title'])}. "
        f"Categories: {category_text}"
    )


# Build the text fed to the encoder: one string per row, stored in a new column.
df["combined_text"] = df.apply(preprocess, axis=1)

In [16]:
def encode_text(text):
    """Embed one string with the model's encoder, mean-pooled over ``dim=1``.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Input handed to the tokenizer (truncation enabled).

    Returns:
        A detached CPU tensor: ``last_hidden_state`` averaged over ``dim=1``
        and squeezed.
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    encoder_out = model.encoder(
        return_dict=True,
        attention_mask=tokens["attention_mask"],
        input_ids=tokens["input_ids"],
    )

    # Average over all tokens, then drop singleton dimensions.
    # (Open question kept from the original: pool a CLS-style token instead?)
    hidden_states = encoder_out.last_hidden_state
    pooled = torch.mean(hidden_states, dim=1).squeeze()

    return pooled.cpu().detach()

In [None]:
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Encode every combined text; autograd is switched off because only the
# resulting embeddings are kept.
with torch.no_grad():
    df["embeddings"] = df["combined_text"].progress_apply(encode_text)

In [None]:
# Preview the first rows, now including the text and embedding columns.
df.head()

In [19]:
# Stack the per-row embedding tensors into one tensor, in the row order of `df`
# (torch.stack requires all of them to have the same shape).
embs = torch.stack(df["embeddings"].tolist())

In [None]:
# Display the stacked tensor's shape as a sanity check.
embs.shape

In [None]:
import random

from rqvae import RQVAE


# `embs` starts out as the stacked tensor and is rebound to a dict at the end
# of this cell. Unwrap it first so that re-running the cell does not fail on
# `embs.shape` once `embs` already is that dict.
item_embeddings = embs["embedding"] if isinstance(embs, dict) else embs

rqvae = RQVAE(
    input_dim=item_embeddings.shape[1],
    hidden_dim=128,
    beta=0.25,
    codebook_sizes=[256] * 4,  # a list of four 256s (presumably one size per codebook)
    should_init_codebooks=False,
    should_reinit_unused_clusters=False,
).to(device)


# The model is fed a dict carrying the tensor under the "embedding" key; later
# cells read `embs["embedding"]`, so the name keeps pointing at the dict.
embs = {"embedding": item_embeddings.to(device)}

rqvae.forward(embs)

In [29]:
def get_cb_tuples(embeddings):
    """Assign every embedding its nearest entry in each codebook of ``rqvae``.

    Args:
        embeddings: Batch accepted by ``rqvae.encoder`` (one row per item).

    Returns:
        A ``zip`` iterator yielding, per item, a tuple with one index per
        codebook in ``rqvae.codebooks``. It is single-pass: wrap it in
        ``list`` if it has to be consumed more than once.
    """
    # Index lookup needs no gradients; skipping autograd avoids keeping the
    # encoder's graph alive for the whole batch.
    with torch.no_grad():
        # The encoder output is identical for every codebook, so compute it
        # once instead of once per loop iteration.
        encoded = rqvae.encoder(embeddings)
        # NOTE(review): every codebook is matched against the same encoded
        # vector; a residual quantizer would usually subtract the chosen code
        # before moving to the next level — confirm against RQVAE.forward.
        ind_lists = [
            torch.cdist(encoded, cb).argmin(dim=-1).cpu().numpy()
            for cb in rqvae.codebooks
        ]

    return zip(*ind_lists)


def search_similar_items(items_with_tuples, clust2search, limit=5):
    """Pick up to ``limit`` random items whose code tuple starts with a prefix.

    Args:
        items_with_tuples: Iterable of ``(item, clust_tuple)`` pairs. It is
            left untouched; shuffling happens on a private copy.
        clust2search: Prefix of codebook indices to match, e.g. ``(3,)`` or
            ``(3, 17)``.
        limit: Maximum number of matches to return (defaults to 5).

    Returns:
        A list of at most ``limit`` matching ``(item, clust_tuple)`` pairs in
        random order; empty when nothing matches.
    """
    if limit <= 0:
        return []

    prefix = tuple(clust2search)

    # Shuffle a copy: random.shuffle on the argument itself reordered the
    # caller's list as a side effect of every search.
    candidates = list(items_with_tuples)
    random.shuffle(candidates)

    similars = []
    for item, clust_tuple in candidates:
        if tuple(clust_tuple[: len(prefix)]) == prefix:
            similars.append((item, clust_tuple))
            if len(similars) >= limit:
                break
    return similars

In [30]:
# get_cb_tuples returns a one-shot zip iterator; materialise it so `cb_tuples`
# is still usable after it has been paired with the titles below.
cb_tuples = list(get_cb_tuples(embs["embedding"]))
# Pair each item's title with its tuple of codebook indices (row order of `df`).
items_with_tuples = list(zip(df["title"], cb_tuples))

In [None]:
# Spot-check first-codebook indices 100..119: for every index that has members,
# print it followed by the randomly chosen matching items.
for cluster_id in range(100, 120):
    matches = search_similar_items(items_with_tuples, (cluster_id,))
    if matches:
        print(cluster_id)
        # `item` / `clust_tuple` keep their names: the f-string prints them.
        for item, clust_tuple in matches:
            print(f"{item=} {clust_tuple=}")

# TODO: fix collisions (remainder = last embedding, auto-increment 4th id)

In [None]:
# 1 2 3 0
# 1 2 3 1
# 4 5 6 0/2
# 4 5 6 1/3

# Research last index aggregation

# 1) last index = KMeans(last residuals, n=|last codebook|) - collision
# 2) auto increment last index (check paper)
# 3) decoder
# 4) [(1 2 3), (1 2 3)] single item -> ok
# 4.1) several -> get embeddings -> score. softmax(collisions), torch.log_softmax(logits) -> score -> argmax

In [None]:
# pos emb for item & codebook (000 111 222) - item
# codebook (012 012 012)
# splitting item ?

In [50]:
# Persist the whole DataFrame (text and embedding columns included) via torch.save.
# NOTE(review): this pickles a non-tensor object; reading it back presumably
# needs torch.load(..., weights_only=False) on recent PyTorch — confirm.
torch.save(df, "../data/df_with_embs.pt")

In [None]:
# List the data directory with human-readable file sizes.
!ls -lh ../data