In [None]:
import torch

from rqvae.rqvae_data import get_data

# Load the item table through the project helper. Later cells read its
# "embeddings", "asin" and "title" columns and use its index as item ids.
df = get_data()

In [2]:
# Collect the per-item embedding tensors (in `df` row order) and stack them
# into a single tensor with one row per item.
embedding_rows = df["embeddings"].to_list()
embs = torch.stack(embedding_rows)

In [3]:
import json
from utils import DEVICE
from models.base import BaseModel

# Read the TIGER training config; `with` guarantees the file handle is closed.
with open("../configs/train/tiger_train_config.json") as config_file:
    config = json.load(config_file)

batch_proc_config = config['dataloader']['train']['batch_processor']

# The batch-processor section points at the RQ-VAE training config and checkpoint.
with open(batch_proc_config['rqvae_train_config_path']) as rqvae_config_file:
    rqvae_train_config = json.load(rqvae_config_file)
rq_vae_config = rqvae_train_config['model']
# Codebooks come from the checkpoint below, so skip their initialisation.
rq_vae_config['should_init_codebooks'] = False

rqvae_model = BaseModel.create_from_config(rq_vae_config).to(DEVICE)

# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on
# a different device (e.g. GPU -> CPU-only machine) still loads.
rqvae_state = torch.load(
    batch_proc_config['rqvae_checkpoint_path'],
    map_location=DEVICE,
    weights_only=True,
)
rqvae_model.load_state_dict(rqvae_state)
rqvae_model.eval()

ids = df.index.tolist()

embs_dict = {"ids": torch.tensor(ids).to(DEVICE), "embeddings": embs.to(DEVICE)}

# Pure inference: no autograd graph is needed, which saves memory on the full
# item set. Calling the module (rather than .forward) also runs any hooks.
with torch.no_grad():
    semantic_ids = list(rqvae_model(embs_dict))

In [4]:
from rqvae.collisions import dedup

# One record per item: (asin, title, semantic id); missing titles become "unknown".
titles = df["title"].fillna("unknown")
# `dedup` presumably disambiguates items that share a semantic id - TODO confirm
# against rqvae.collisions.
items_with_tuples = dedup(list(zip(df["asin"], titles, semantic_ids)))

# Sanity check: the number of distinct final ids must equal the number of items.
distinct_semantic_ids = {item[-1] for item in items_with_tuples}
assert len(df) == len(distinct_semantic_ids)

In [None]:
# Display the records returned by `dedup` (built from asin, title, semantic id).
# NOTE(review): this renders the whole collection; consider showing only a
# small sample once the return type of `dedup` is confirmed to be sliceable.
items_with_tuples

In [5]:
from trie import Trie

# Build a trie that maps each item's semantic id to the item's index in `df`.
_trie = Trie()

# Loop variables in a notebook cell are kernel globals, so they must not be
# called `id` / `tuple`: that would shadow the builtins for every later cell.
for item_id, semantic_id in zip(df.index, semantic_ids):
    _trie.insert(semantic_id, item_id)  # TODO: handle collisions instead of overwriting

In [19]:
import pickle

# Serialise the populated trie to disk so it can be reloaded without rebuilding.
with open("../data/Beauty/trie.pkl", mode='wb') as trie_file:
    pickle.dump(_trie, trie_file)

In [None]:
from rqvae.rqvae_data import search_similar_items


# For each of the first five one-element prefixes, ask for 5 similar items and
# print them; prefixes with no results are skipped silently.
for prefix_code in range(5):
    neighbours = search_similar_items(items_with_tuples, (prefix_code,), 5)
    if len(neighbours) > 0:
        print(prefix_code)
        for _asin, item, clust_tuple in neighbours:
            # Optional filter kept from exploration: if 'shampoo' in item.lower():
            print(f"{item=} {clust_tuple=}")

In [None]:
from collections import Counter
import matplotlib.pyplot as plt


# Count how many items share each semantic id once its last component is
# dropped, then plot the distribution of those group sizes.
collision_sizes = list(Counter(item[-1][:-1] for item in items_with_tuples).values())

# Explicit axes plus a title and labels, so the figure is readable on its own.
fig, ax = plt.subplots()
ax.hist(collision_sizes)
ax.set(
    title="Semantic id collisions",
    xlabel="Items sharing one semantic id (last component dropped)",
    ylabel="Number of semantic ids",
)
plt.show()

In [18]:
# # raw full ids
# full_duplicates = Counter(item[-1][:-1] for item in items_with_tuples).items()
# duplicated = [(semantic_id, amount) for (semantic_id, amount) in full_duplicates if amount > 1]
# duplicated

In [None]:
# Collision counters - {how many items share the same full semantic id: number of such groups}
prefix_counts = Counter(item[-1][:-1] for item in items_with_tuples)
vals = prefix_counts.values()
Counter(vals)

In [None]:
# Dedup indices: how often each value occurs at position 4 of the final id.
dedup_indices = (item[-1][4] for item in items_with_tuples)
Counter(dedup_indices)

In [11]:
# from sklearn import preprocessing

# labels = df['asin']

# le = preprocessing.LabelEncoder()
# targets = le.fit_transform(labels)

# df['asin_numeric'] = targets

# torch.save(df, './all_data.pt')

In [None]:
import torch
from utils import create_masked_tensor


# Toy example for create_masked_tensor: four sequences whose lengths add up to
# the 11 rows of the flat embedding matrix below.
lengths = torch.tensor([3, 1, 2, 5])

embeddings = torch.rand(11, 2)
print(embeddings)

padded_embeddings, mask = create_masked_tensor(embeddings, lengths)

In [None]:
# Show the first value returned by create_masked_tensor for the toy input.
padded_embeddings

In [None]:
# Show the second value returned by create_masked_tensor for the toy input.
mask

In [3]:
import torch

# data_full.pt stores a pickled pandas DataFrame, not a tensor state dict.
# Since PyTorch 2.6 torch.load defaults to weights_only=True, which refuses to
# unpickle such objects, so the flag has to be disabled explicitly.
# SECURITY: weights_only=False runs the full pickle machinery - only load files
# you produced yourself or otherwise trust.
# NOTE: this rebinds `df`, replacing the frame loaded by get_data() earlier.
df = torch.load('../data/Beauty/data_full.pt', weights_only=False)

In [9]:
# Stack the embedding tensors in index order and display the result.
sorted_embeddings = df.sort_index()["embeddings"]
torch.stack(sorted_embeddings.tolist())

tensor([[ 0.0115, -0.0265, -0.0014,  ..., -0.0403, -0.0064, -0.0646],
        [ 0.0522,  0.0051, -0.0317,  ..., -0.0215, -0.0284,  0.0104],
        [ 0.0657, -0.0131, -0.0234,  ..., -0.0370, -0.0462, -0.0503],
        ...,
        [ 0.0432,  0.0406, -0.0517,  ..., -0.0368,  0.0365, -0.0342],
        [ 0.0523,  0.0635, -0.0556,  ..., -0.0422,  0.0264, -0.0267],
        [ 0.0603,  0.0612, -0.0532,  ..., -0.0728,  0.0288, -0.0492]])

In [10]:
# Display the frame ordered by its index (returns a sorted copy; `df` itself is unchanged).
df.sort_index()

Unnamed: 0_level_0,raw_user_id,raw_item_id,rating,timestamp,user_id,asin,description,title,imUrl,salesRank,categories,price,related,brand,combined_text,embeddings
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,A20NUABVL6KKTV,B002OVV7F0,4.0,1344556800,651772,B002OVV7F0,Ten shades of eye shadow that features bold co...,"NYX Cosmetics Eye Shadow Palette 10 Color, Jaz...",http://ecx.images-amazon.com/images/I/41PVzq7M...,{'Beauty': 138235},"[[Beauty, Makeup, Eyes, Eye Shadow]]",7.97,"{'also_bought': ['B00B1ZPFT4', 'B00A5YDBWK', '...",NYX,Description: Ten shades of eye shadow that fea...,"[tensor(0.0115), tensor(-0.0265), tensor(-0.00..."
2,A2CNWQEZHQ3K6R,B006GQPZ8E,2.0,1392940800,952110,B006GQPZ8E,Color burst lip butter combines beautiful colo...,"REVLON Colorburst Lip Butter, Peach Parfait, 0...",http://ecx.images-amazon.com/images/I/31r1scO3...,{'Beauty': 12827},"[[Beauty, Makeup, Lips, Lipstick]]",6.35,"{'also_bought': ['B006GQTZ8A', 'B006GQEI0A', '...",Revlon,Description: Color burst lip butter combines b...,"[tensor(0.0522), tensor(0.0051), tensor(-0.031..."
3,A1MVAPY2WT4D4M,B0002DNZAC,4.0,1404086400,83086,B0002DNZAC,Buy MAC Eyeshadows - MAC Eye Shadow Frost Sati...,MAC Eye Shadow Frost Satin Taupe,http://ecx.images-amazon.com/images/I/41PNABCE...,{'Beauty': 107237},"[[Beauty, Makeup, Eyes, Eye Shadow]]",21.19,"{'also_bought': ['B0018HPFNG', 'B00BEH4UU4', '...",M.A.C,Description: Buy MAC Eyeshadows - MAC Eye Shad...,"[tensor(0.0657), tensor(-0.0131), tensor(-0.02..."
4,AO2GZG0N16FCD,B0000UTUVU,5.0,1384905600,36346,B0000UTUVU,Here's something you simply can't live without...,"Mrs. Meyer's Clean Day Dish Soap, Lavender, 16...",http://ecx.images-amazon.com/images/I/31H9M36F...,,"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",8.19,"{'also_bought': ['B0000UTUV0', 'B004ZY1J6G', '...",Mrs. Meyer&#39;s Clean Day,Description: Here's something you simply can't...,"[tensor(0.0452), tensor(0.0012), tensor(0.0066..."
5,ASNGEUJ0LCACM,B000F8HWXU,5.0,1320796800,27187,B000F8HWXU,Queen Helene has been a leader in quality beau...,"Queen Helene Mint Julep Masque, 2 Ounce Travel...",http://ecx.images-amazon.com/images/I/41YKB8lA...,{'Beauty': 10453},"[[Beauty, Skin Care, Face, Treatments & Masks,...",3.03,"{'also_bought': ['B0072CTONS', 'B00CYI3RAG', '...",Queen Helene,Description: Queen Helene has been a leader in...,"[tensor(0.0141), tensor(-0.0156), tensor(-0.03..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12097,A26ZA5ZV0BPRXK,B00LCEROA2,5.0,1404259200,1186930,B00LCEROA2,,Dr Song Rosehip Oil 4oz (4 oz),http://ecx.images-amazon.com/images/I/412qdoPc...,{'Beauty': 7597},"[[Beauty, Skin Care, Face, Oils & Serums]]",19.99,"{'also_bought': ['B00LNVW1IE', 'B00JYKGFWY', '...",,Description: empty. Title: Dr Song Rosehip Oil...,"[tensor(0.0204), tensor(0.0244), tensor(0.0625..."
12098,A3DXSM2289U79E,B00IBMV2ME,5.0,1392768000,1188037,B00IBMV2ME,The Best BOTANICAL HYALURONIC ACID (5.0%) Gel ...,Best Botanical Hyaluronic Acid Anti Aging Faci...,http://ecx.images-amazon.com/images/I/4171BmUV...,{'Beauty': 116649},"[[Beauty, Skin Care, Face, Oils & Serums]]",24.50,"{'also_bought': ['B00IC8JBIE', 'B00IC9AG5A', '...",,Description: The Best BOTANICAL HYALURONIC ACI...,"[tensor(0.0454), tensor(0.0540), tensor(-0.023..."
12099,A3DXSM2289U79E,B00IC9AG5A,5.0,1392768000,1188037,B00IC9AG5A,Announcing a Dermatologist Grade Skin Treatmen...,Anti Aging All In One Facial Treatment (Replac...,http://ecx.images-amazon.com/images/I/314b-jZn...,{'Beauty': 84262},"[[Beauty, Skin Care, Eyes, Combinations]]",26.50,"{'also_bought': ['B00IC8JBIE', 'B00IC7L3JK', '...",,Description: Announcing a Dermatologist Grade ...,"[tensor(0.0432), tensor(0.0406), tensor(-0.051..."
12100,A2BWXFJAQNH8LC,B00IKKORVU,5.0,1393632000,1188048,B00IKKORVU,Announcing The Ultimate Vitamin C Anti Aging S...,Best Vitamin C Anti Aging 6 Item System &amp; ...,http://ecx.images-amazon.com/images/I/51yIcFHj...,{'Beauty': 87595},"[[Beauty, Skin Care, Sets & Kits]]",125.00,"{'also_viewed': ['B00IC8JBIE', 'B00GYJWL7G', '...",,Description: Announcing The Ultimate Vitamin C...,"[tensor(0.0523), tensor(0.0635), tensor(-0.055..."
