In [1]:
import numpy as np
import polars as pl
import pandas as pd

In [2]:
path_to_df = '../data/Beauty/ratings_Beauty.csv'
df = pl.read_csv(
    path_to_df, 
    has_header=False, 
    new_columns=['user_id', 'item_id', 'rating', 'timestamp'], 
    separator=',',
    schema_overrides={
        "user_id": pl.String,
        "item_id": pl.String,
        "rating": pl.String,
        'timestamp': pl.UInt64
    }
)

In [3]:
df.head()

user_id,item_id,rating,timestamp
str,str,str,u64
"""A39HTATAQ9V7YF""","""0205616461""","""5.0""",1369699200
"""A3JM6GV9MNOF9X""","""0558925278""","""3.0""",1355443200
"""A1Z513UWSAAO0F""","""0558925278""","""5.0""",1404691200
"""A1WMRR494NWEWV""","""0733001998""","""4.0""",1382572800
"""A3IAAVS479H7M7""","""0737104473""","""1.0""",1274227200


In [4]:
filtering_stage = 0
is_changed = True
threshold = 5
good_users = set()
good_items = set()

filtered_df = df.clone()

while is_changed:
    user_counts = filtered_df.group_by("user_id").agg(pl.len().alias("user_count"))
    item_counts = filtered_df.group_by("item_id").agg(pl.len().alias("item_count"))

    good_users = user_counts.filter(pl.col("user_count") >= threshold).select("user_id")
    good_items = item_counts.filter(pl.col("item_count") >= threshold).select("item_id")

    old_size = len(filtered_df)

    new_df = filtered_df.join(good_users, on="user_id", how="inner")
    new_df = new_df.join(good_items, on="item_id", how="inner")

    new_size = len(new_df)

    print(f'После {filtering_stage + 1}го этапа фильтрации.')
    print(f'Количество пользователей: {good_users.shape[0]}.') 
    print(f'Количество айтемов: {good_items.shape[0]}')
    print()
    
    filtered_df = new_df
    is_changed = old_size != new_size
    filtering_stage += 1

filtered_df = filtered_df.with_columns(new_user_id = pl.col("user_id").rank("dense") - 1)
filtered_df = filtered_df.with_columns(new_item_id = pl.col("item_id").rank("dense") - 1)

После 1го этапа фильтрации.
Количество пользователей: 52374.
Количество айтемов: 67345

После 2го этапа фильтрации.
Количество пользователей: 40226.
Количество айтемов: 19369

После 3го этапа фильтрации.
Количество пользователей: 27501.
Количество айтемов: 17041

После 4го этапа фильтрации.
Количество пользователей: 26116.
Количество айтемов: 13727

После 5го этапа фильтрации.
Количество пользователей: 23746.
Количество айтемов: 13318

После 6го этапа фильтрации.
Количество пользователей: 23436.
Количество айтемов: 12562

После 7го этапа фильтрации.
Количество пользователей: 22787.
Количество айтемов: 12458

После 8го этапа фильтрации.
Количество пользователей: 22705.
Количество айтемов: 12247

После 9го этапа фильтрации.
Количество пользователей: 22505.
Количество айтемов: 12224

После 10го этапа фильтрации.
Количество пользователей: 22480.
Количество айтемов: 12153

После 11го этапа фильтрации.
Количество пользователей: 22408.
Количество айтемов: 12140

После 12го этапа фильтрации.
К

In [5]:
item_ids_mapping = filtered_df.group_by('new_item_id').agg(pl.col('item_id')).select(
    pl.col('item_id').list.get(0).alias('old_item_id'), pl.col('new_item_id')
)

In [6]:
filtered_df = filtered_df.sort(["new_user_id", "timestamp"])

grouped_filtered_df = filtered_df.group_by("new_user_id", maintain_order=True).agg(
    pl.all().exclude("new_user_id").exclude('item_id').exclude('user_id')
)

In [7]:
item_ids_mapping.head()

old_item_id,new_item_id
str,u64
"""B0076L73BK""",9307
"""B004SPDEWE""",7604
"""B001KPEKMS""",4064
"""B005GMYPJ4""",8259
"""B00GYB107Q""",11799


In [8]:
grouped_filtered_df.head()

new_user_id,rating,timestamp,new_item_id
u64,list[str],list[u64],list[u64]
0,"[""2.0"", ""5.0"", … ""3.0""]","[1405296000, 1405296000, … 1405296000]","[9839, 11863, … 11155]"
1,"[""3.0"", ""4.0"", … ""2.0""]","[1357430400, 1384387200, … 1402790400]","[3309, 4572, … 58]"
2,"[""5.0"", ""5.0"", … ""5.0""]","[1385337600, 1385337600, … 1386892800]","[4386, 6362, … 10253]"
3,"[""5.0"", ""5.0"", … ""5.0""]","[1366416000, 1366416000, … 1368835200]","[8968, 10130, … 10320]"
4,"[""5.0"", ""3.0"", … ""3.0""]","[1351814400, 1364688000, … 1397692800]","[8935, 8071, … 8823]"


In [9]:
print('Users count:', filtered_df.select('user_id').unique().shape[0])
print('Items count:', filtered_df.select('item_id').unique().shape[0])
print('Actions count:', filtered_df.shape[0])
print('Avg user history len:', np.mean(list(map(lambda x: x[0], grouped_filtered_df.select(pl.col('new_item_id').list.len()).rows()))))

Users count: 22363
Items count: 12101
Actions count: 198502
Avg user history len: 8.876358270357287


## Content embedding creation

In [37]:
def getDF(path):
    i = 0
    df = {}
    with open(path, 'r') as f:
        for line in f.readlines():
            df[i] = eval(line)
            i += 1

    return pd.DataFrame.from_dict(df, orient="index")

df = getDF('../data/Beauty/metadata.json')
df.head()

Unnamed: 0,asin,salesRank,imUrl,categories,title,description,price,related,brand
0,1048791,{'Books': 6334800},http://ecx.images-amazon.com/images/I/51MKP0T4...,[[Books]],"The Crucible: Performed by Stuart Pankin, Jero...",,,,
1,143561,{'Movies & TV': 376041},http://g-ecx.images-amazon.com/images/G/01/x-s...,"[[Movies & TV, Movies]]","Everyday Italian (with Giada de Laurentiis), V...","3Pack DVD set - Italian Classics, Parties and ...",12.99,"{'also_viewed': ['B0036FO6SI', 'B000KL8ODE', '...",
2,37214,{'Clothing': 1233557},http://ecx.images-amazon.com/images/I/31mCncNu...,"[[Clothing, Shoes & Jewelry, Girls], [Clothing...",Purple Sequin Tiny Dancer Tutu Ballet Dance Fa...,,6.99,"{'also_viewed': ['B00JO8II76', 'B00DGN4R1Q', '...",Big Dreams
3,32069,,http://ecx.images-amazon.com/images/I/51EzU6qu...,"[[Sports & Outdoors, Other Sports, Dance, Clot...",Adult Ballet Tutu Cheetah Pink,,7.89,"{'also_bought': ['0000032050', 'B00D0DJAEG', '...",BubuBibi
4,31909,{'Toys & Games': 201847},http://ecx.images-amazon.com/images/I/41xBoP0F...,"[[Sports & Outdoors, Other Sports, Dance]]",Girls Ballet Tutu Neon Pink,High quality 3 layer ballet tutu. 12 inches in...,7.0,"{'also_bought': ['B002BZX8Z6', 'B00JHONN1S', '...",Unknown


In [38]:
def preprocess(row: pd.Series):
    row = row.fillna("None")
    return f"Title: {row['title']}. Description: {row['description']}. Categories: {', '.join(row['categories'][0])}"


def get_data(metadata_df, item_ids_mapping_df):
    filtered_df = metadata_df.join(
        item_ids_mapping_df, 
        left_on="asin", 
        right_on='old_item_id', 
        how="inner"
    ).select(pl.col('new_item_id'), pl.col('title'), pl.col('description'), pl.col('categories'))
    print(filtered_df.shape)
    print(filtered_df.head())

    filtered_df = filtered_df.to_pandas()
    filtered_df["combined_text"] = filtered_df.apply(preprocess, axis=1)

    import pickle
    file = open('data.pkl', 'wb')
    pickle.dump(filtered_df, file)
    file.close()


In [39]:
df.head()

Unnamed: 0,asin,salesRank,imUrl,categories,title,description,price,related,brand
0,1048791,{'Books': 6334800},http://ecx.images-amazon.com/images/I/51MKP0T4...,[[Books]],"The Crucible: Performed by Stuart Pankin, Jero...",,,,
1,143561,{'Movies & TV': 376041},http://g-ecx.images-amazon.com/images/G/01/x-s...,"[[Movies & TV, Movies]]","Everyday Italian (with Giada de Laurentiis), V...","3Pack DVD set - Italian Classics, Parties and ...",12.99,"{'also_viewed': ['B0036FO6SI', 'B000KL8ODE', '...",
2,37214,{'Clothing': 1233557},http://ecx.images-amazon.com/images/I/31mCncNu...,"[[Clothing, Shoes & Jewelry, Girls], [Clothing...",Purple Sequin Tiny Dancer Tutu Ballet Dance Fa...,,6.99,"{'also_viewed': ['B00JO8II76', 'B00DGN4R1Q', '...",Big Dreams
3,32069,,http://ecx.images-amazon.com/images/I/51EzU6qu...,"[[Sports & Outdoors, Other Sports, Dance, Clot...",Adult Ballet Tutu Cheetah Pink,,7.89,"{'also_bought': ['0000032050', 'B00D0DJAEG', '...",BubuBibi
4,31909,{'Toys & Games': 201847},http://ecx.images-amazon.com/images/I/41xBoP0F...,"[[Sports & Outdoors, Other Sports, Dance]]",Girls Ballet Tutu Neon Pink,High quality 3 layer ballet tutu. 12 inches in...,7.0,"{'also_bought': ['B002BZX8Z6', 'B00JHONN1S', '...",Unknown


In [40]:
item_ids_mapping.head()

old_item_id,new_item_id
str,u64
"""B000TF70J0""",2230
"""B005KL3B64""",8390
"""B001KPEKMS""",4064
"""B004SPDEWE""",7604
"""B001MHNQYW""",4195


In [41]:
get_data(pl.from_pandas(df), item_ids_mapping)

(12101, 4)
shape: (5, 4)
┌─────────────┬────────────────────────────┬───────────────────────────┬───────────────────────────┐
│ new_item_id ┆ title                      ┆ description               ┆ categories                │
│ ---         ┆ ---                        ┆ ---                       ┆ ---                       │
│ u64         ┆ str                        ┆ str                       ┆ list[list[str]]           │
╞═════════════╪════════════════════════════╪═══════════════════════════╪═══════════════════════════╡
│ 0           ┆ WAWO 15 Color Professionl  ┆ An extensive range of 15  ┆ [["Beauty", "Makeup", …   │
│             ┆ Make…                      ┆ multi…                    ┆ "Conce…                   │
│ 1           ┆ Xtreme Brite Brightening   ┆ Xtreme Brite  Brightening ┆ [["Beauty", "Hair Care",  │
│             ┆ Gel 1…                     ┆ gel …                     ┆ … "Cr…                    │
│ 2           ┆ Prada Candy By Prada Eau   ┆ Prada Candy By Prada 

In [13]:
from tqdm import tqdm as tqdm

from transformers import T5Model, T5Tokenizer
import torch
from torch.utils.data import DataLoader

import pickle

file = open('data.pkl', 'rb')
data = pickle.load(file)
file.close()

model_name = "google-t5/t5-base"

device = torch.device('cuda:1')

tokenizer = T5Tokenizer.from_pretrained(model_name)

model = T5Model.from_pretrained(model_name)
encoder = model.encoder.to(device)
encoder = encoder.eval()


class MyDataset:

    def __init__(self, data):
        self._data = list(zip(data.to_dict()['new_item_id'].values(), data.to_dict()['combined_text'].values()))

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        text = self._data[idx][1]
        inputs = tokenizer(text, return_tensors="pt", max_length=1280, truncation=True, padding="max_length")
        return {
            'item_id': self._data[idx][0],
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0]
        }
    

dataset = MyDataset(data)
len(dataset)


loader = DataLoader(dataset, batch_size=64, drop_last=False, shuffle=False, num_workers=10)
len(loader)


new_df = {
    'item_id': [],
    'embedding': []
}

for batch in tqdm(loader):

    with torch.inference_mode():
        outputs = encoder(
            input_ids=batch["input_ids"].to(device), 
            attention_mask=batch["attention_mask"].to(device)
        )
    
        embeddings = outputs.last_hidden_state  # (bs, sl, ed)
        embeddings[(~batch["attention_mask"].bool())] = 0. # (bs, sl, ed)

    new_df['item_id'] += batch['item_id'].tolist()
    new_df['embedding'] += embeddings.mean(dim=1).tolist()  # (bs, ed)


file = open('final_data.pkl', 'wb')
pickle.dump(new_df, file)
file.close()


100%|██████████| 190/190 [05:54<00:00,  1.86s/it]


## Leave-one-out split (last item for test, pre-last item for valid, the remaining part for train)

In [10]:
json_data = {}
with open('../data/Beauty/all_data.txt', 'w') as f:
    for user_id, _, _, item_ids in grouped_filtered_df.iter_rows():
        json_data[str(user_id)] = item_ids
        f.write(' '.join([str(user_id)] + [
            str(item_id) for item_id in item_ids
        ]))
        f.write('\n')

In [46]:
import json

with open('../data/Beauty/inter.json', 'w') as f:
    json.dump(json_data, f, indent=2)

## Timestamp-based split (80% for train, 10% for valid, and 10% for test)

In [None]:
valid_portion = 0.1
test_portion = 0.1

all_events_timestamp = []
for user_id, user_interractions in user_history.items():
    for user_interraction in user_interractions:
        interractions_ts = user_interraction['timestamp']
        all_events_timestamp.append(interractions_ts)

all_events_timestamp = sorted(all_events_timestamp)

fst_threshold = all_events_timestamp[int(len(all_events_timestamp) * (1.0 - test_portion - valid_portion))]
snd_threshold = all_events_timestamp[int(len(all_events_timestamp) * (1.0 - test_portion))]

print(f'First train timestamp:\t{all_events_timestamp[0]}')
print(f'First valid timestamp:\t{fst_threshold}')
print(f'First test timestamp:\t{snd_threshold}')

In [15]:
train_samples = []
valid_samples = []
test_samples = []

for user_id, user_interactions in user_history.items():
    train_history = []
    history = []
    
    for user_interaction in user_interactions:
        if user_interaction['timestamp'] < fst_threshold: # train event
            assert len(history) == 0 or user_interaction['timestamp'] >= history[-1]['timestamp']
            train_history.append(user_interaction)
        elif user_interaction['timestamp'] < snd_threshold: # valid event
            assert user_interaction['timestamp'] >= fst_threshold
            if len(history) >= 5:  # remove cold-start users
                valid_samples.append({
                    'user_id': user_id,
                    'history': [x for x in history],
                    'next_interaction': user_interaction
                })
        else:  # test event
            assert user_interaction['timestamp'] >= snd_threshold
            if len(history) >= 5:  # remove cold-start users
                test_samples.append({
                    'user_id': user_id,
                    'history': [x for x in history],
                    'next_interaction': user_interaction
                })
        history.append(user_interaction)
    
    if len(train_history) >= 5:  # remove cold-start users
        train_samples.append({
            'user_id': user_id,
            'history': train_history
        })

In [None]:
len(train_samples), len(valid_samples), len(test_samples)

In [17]:
# train
with open('../data/Beauty/train.txt', 'w') as f:
    for train_sample in train_samples:
        f.write(' '.join([str(train_sample['user_id'])] + [
            str(user_interaction['item_id']) for user_interaction in sorted(train_sample['history'], key=lambda x: x['timestamp'])
        ]))
        f.write('\n')

# valid
with open('../data/Beauty/valid.txt', 'w') as f:
    for valid_sample in valid_samples:
        f.write(' '.join([str(valid_sample['user_id'])] + [
            str(user_interaction['item_id']) for user_interaction in sorted(valid_sample['history'], key=lambda x: x['timestamp'])
        ] + [str(valid_sample['next_interaction']['item_id'])]))
        f.write('\n')

# test
with open('../data/Beauty/test.txt', 'w') as f:
    for test_sample in test_samples:
        f.write(' '.join([str(test_sample['user_id'])] + [
            str(user_interaction['item_id']) for user_interaction in sorted(test_sample['history'], key=lambda x: x['timestamp'])
        ] + [str(test_sample['next_interaction']['item_id'])]))
        f.write('\n')