In [None]:
import numpy as np
import polars as pl
import pandas as pd

In [None]:
# Raw ratings dump. The file is read without a header row, so the column names
# are supplied explicitly below.
path_to_df = '../data/Beauty/ratings_Beauty.csv'
df = pl.read_csv(
    path_to_df,
    separator=',',
    has_header=False,
    new_columns=['user_id', 'item_id', 'rating', 'timestamp'],
    # Explicit dtypes: ids and rating are kept as strings, timestamp as UInt64.
    schema_overrides={
        'user_id': pl.String,
        'item_id': pl.String,
        'rating': pl.String,
        'timestamp': pl.UInt64,
    },
)

In [None]:
# Preview the first rows of the ratings frame to sanity-check the parsed columns.
df.head()

In [None]:
# Iteratively drop users and items that have fewer than `threshold` interactions.
# Removing rare items can push a user below the threshold (and vice versa), so the
# filter is re-applied until the number of rows stops changing.
filtering_stage = 0
is_changed = True
threshold = 5

filtered_df = df.clone()

while is_changed:
    user_counts = filtered_df.group_by("user_id").agg(pl.len().alias("user_count"))
    item_counts = filtered_df.group_by("item_id").agg(pl.len().alias("item_count"))

    good_users = user_counts.filter(pl.col("user_count") >= threshold).select("user_id")
    good_items = item_counts.filter(pl.col("item_count") >= threshold).select("item_id")

    old_size = len(filtered_df)

    # Inner joins keep only rows whose user AND item both passed the threshold.
    new_df = (
        filtered_df
        .join(good_users, on="user_id", how="inner")
        .join(good_items, on="item_id", how="inner")
    )

    new_size = len(new_df)

    # Report what actually survived this stage. Counting `good_users`/`good_items`
    # would overstate the result: those are computed before the joins, and some of
    # them lose all their rows once the other side is filtered.
    users_left = new_df["user_id"].n_unique()
    items_left = new_df["item_id"].n_unique()

    print(f'После {filtering_stage + 1}го этапа фильтрации.')
    print(f'Количество пользователей: {users_left}.')
    print(f'Количество айтемов: {items_left}')
    print()

    filtered_df = new_df
    is_changed = old_size != new_size
    filtering_stage += 1

# Re-index users and items with dense 0-based integer ids (dense rank starts at 1).
filtered_df = filtered_df.with_columns(
    new_user_id=pl.col("user_id").rank("dense") - 1,
    new_item_id=pl.col("item_id").rank("dense") - 1,
)

In [None]:
# Lookup table from original item id (`old_item_id`) to dense integer id
# (`new_item_id`), one row per `new_item_id`. Taking `.first()` inside the
# aggregation avoids materialising the full list of (identical) item ids per group
# just to read its first element.
item_ids_mapping = filtered_df.group_by('new_item_id').agg(
    pl.col('item_id').first().alias('old_item_id')
).select('old_item_id', 'new_item_id')

In [None]:
# Order interactions by user and, within each user, chronologically.
filtered_df = filtered_df.sort(["new_user_id", "timestamp"])

# One row per user: every remaining column is collected into a list that keeps the
# sorted (chronological) order, since the group-by preserves input order.
grouped_filtered_df = filtered_df.group_by("new_user_id", maintain_order=True).agg(
    pl.all().exclude(["new_user_id", "item_id", "user_id"])
)

In [None]:
# Preview the old-id -> new-id item mapping.
item_ids_mapping.head()

In [None]:
# Preview the per-user frame (one row per user, list-valued columns).
grouped_filtered_df.head()

In [None]:
# Summary statistics of the filtered dataset.
users_total = filtered_df['user_id'].n_unique()
items_total = filtered_df['item_id'].n_unique()
# Number of interactions per user, as plain Python ints.
history_lengths = grouped_filtered_df['new_item_id'].list.len().to_list()

print('Users count:', users_total)
print('Items count:', items_total)
print('Actions count:', filtered_df.shape[0])
print('Avg user history len:', np.mean(history_lengths))

## Content embedding creation

In [None]:
def getDF(path):
    """Load a line-delimited file of Python-literal records into a pandas DataFrame.

    Each non-blank line of ``path`` is parsed as a Python literal (the records are
    passed to ``DataFrame.from_dict(..., orient="index")``, so each one is expected
    to be a dict). Rows are indexed 0, 1, 2, ... in file order.

    Lines are parsed with ``ast.literal_eval`` rather than ``eval``: it accepts the
    same dict/list/str/number literals but cannot execute code embedded in the file.

    Args:
        path: Path of the text file to read.

    Returns:
        pandas.DataFrame with one row per parsed line.

    Raises:
        ValueError, SyntaxError: if a line is not a valid Python literal.
    """
    import ast  # local import so the cell does not depend on a later import cell

    records = {}
    row_idx = 0
    with open(path, 'r') as f:
        # Stream the file line by line instead of loading it whole with readlines().
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            records[row_idx] = ast.literal_eval(line)
            row_idx += 1

    return pd.DataFrame.from_dict(records, orient="index")

# Load the item metadata.
# NOTE: this rebinds `df`. Before this cell `df` was the polars ratings frame read
# from ratings_Beauty.csv; from here on it is the pandas frame returned by getDF.
df = getDF('../data/Beauty/metadata.json')
df.head()

In [None]:
def preprocess(row: pd.Series):
    """Render one metadata row as a single text string.

    Missing fields are replaced by the literal string "None". The first entry of
    ``row['categories']`` is joined with ", " to form the category text.

    Args:
        row: Series with 'title', 'categories' and 'description' entries.

    Returns:
        str of the form "Title: ... Categories: ... Description: ...".
    """
    row = row.fillna("None")

    categories = row['categories']
    if isinstance(categories, str):
        # Missing categories were turned into the string "None" by fillna above.
        # Indexing [0] would take its first character and render it as "N".
        categories_text = categories
    elif len(categories) == 0:
        # An empty category list has no first entry to join.
        categories_text = "None"
    else:
        categories_text = ', '.join(categories[0])

    return f"Title: {row['title']}. Categories: {categories_text}. Description: {row['description']}."


def get_data(metadata_df, item_ids_mapping_df, output_path='data.pkl'):
    """Build the per-item text table and pickle it to ``output_path``.

    Joins the metadata with the item-id mapping (metadata ``asin`` against mapping
    ``old_item_id``), keeps ``new_item_id``, ``title``, ``description`` and
    ``categories``, converts the result to pandas, adds a ``combined_text`` column
    produced by ``preprocess`` and pickles the resulting DataFrame.

    Args:
        metadata_df: polars DataFrame with at least ``asin``, ``title``,
            ``description`` and ``categories`` columns.
        item_ids_mapping_df: polars DataFrame with ``old_item_id`` and
            ``new_item_id`` columns.
        output_path: Destination of the pickle file. Defaults to 'data.pkl',
            the path that was previously hard-coded.

    Returns:
        None. The result is written to ``output_path``.
    """
    import pickle

    items_df = metadata_df.join(
        item_ids_mapping_df,
        left_on="asin",
        right_on='old_item_id',
        how="inner"
    ).select(pl.col('new_item_id'), pl.col('title'), pl.col('description'), pl.col('categories'))
    print(items_df.shape)
    print(items_df.head())

    items_pd = items_df.to_pandas()
    items_pd["combined_text"] = items_pd.apply(preprocess, axis=1)

    # Context manager guarantees the handle is closed even if pickling fails.
    with open(output_path, 'wb') as file:
        pickle.dump(items_pd, file)


In [None]:
# Preview the metadata frame (`df` was rebound to the getDF result above).
df.head()

In [None]:
# Preview the item-id mapping that the metadata will be joined against.
item_ids_mapping.head()

In [None]:
# Convert the pandas metadata frame to polars and build the pickled per-item text table.
get_data(pl.from_pandas(df), item_ids_mapping)

In [None]:
from tqdm import tqdm as tqdm

from transformers import LlamaModel, LlamaTokenizer
import torch
from torch.utils.data import DataLoader

import pickle

# Load the per-item text table written by get_data.
# NOTE: pickle.load executes arbitrary code from the file; only load files that
# were produced locally by this notebook.
with open('data.pkl', 'rb') as file:
    data = pickle.load(file)

# NOTE(review): selects CUDA device index 1; adjust on machines with a different GPU layout.
device = torch.device('cuda:1')

model_name = "huggyllama/llama-7b"
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Padding is requested when tokenizing, so a pad token must exist; fall back to EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = LlamaModel.from_pretrained(model_name)
model = model.to(device)
model = model.eval()


class MyDataset:
    """Map-style dataset yielding tokenized item texts together with their item ids.

    Built from a pandas DataFrame with ``new_item_id`` and ``combined_text``
    columns. Tokenization uses the module-level ``tokenizer``.
    """

    def __init__(self, data):
        columns = data.to_dict()
        item_ids = columns['new_item_id'].values()
        texts = columns['combined_text'].values()
        # List of (item_id, text) pairs in frame order.
        self._data = list(zip(item_ids, texts))

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        item_id, text = self._data[idx]
        # Every sample is padded/truncated to exactly 512 tokens.
        encoded = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        # return_tensors="pt" adds a leading batch axis; [0] strips it.
        return {
            'item_id': item_id,
            'input_ids': encoded['input_ids'][0],
            'attention_mask': encoded['attention_mask'][0]
        }
    

# Wrap the item texts in a dataset and batch them in their original order.
dataset = MyDataset(data)
len(dataset)

loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=False,
    drop_last=False,
    num_workers=10,
)
len(loader)


# Parallel lists: new_df['embedding'][i] is the pooled embedding of new_df['item_id'][i].
# NOTE: `new_df` was a polars frame earlier in the notebook; it is rebound to a dict here.
new_df = {
    'item_id': [],
    'embedding': []
}

for batch in tqdm(loader):
    attention_mask = batch["attention_mask"].to(device)

    with torch.inference_mode():
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=attention_mask
        )
        embeddings = outputs.last_hidden_state  # (bs, sl, ed)

        # Masked mean pooling: average over real tokens only. Zeroing the padded
        # positions and then calling .mean(dim=1) would still divide by the full
        # padded length, scaling every embedding by (real tokens / sequence length).
        token_mask = attention_mask.unsqueeze(-1).to(embeddings.dtype)  # (bs, sl, 1)
        token_counts = token_mask.sum(dim=1).clamp(min=1)  # (bs, 1); clamp avoids 0-division
        pooled = (embeddings * token_mask).sum(dim=1) / token_counts  # (bs, ed)

    new_df['item_id'] += batch['item_id'].tolist()
    new_df['embedding'] += pooled.tolist()  # (bs, ed)


# Context manager guarantees the handle is closed even if pickling fails.
with open('final_data.pkl', 'wb') as file:
    pickle.dump(new_df, file)


## Leave-one-out split (last item for test, second-to-last item for validation, the rest for training)

In [None]:
# Dump every user's full item sequence: one line per user in the form
# "<user_id> <item_id> <item_id> ...", and collect the same data in `json_data`
# (keyed by the user id as a string).
json_data = {}
with open('../data/Beauty/all_data.txt', 'w') as f:
    # Select the two needed columns by name instead of unpacking rows positionally,
    # so the loop does not depend on the column count or order of the grouped frame.
    user_rows = grouped_filtered_df.select('new_user_id', 'new_item_id').iter_rows()
    for user_id, item_ids in user_rows:
        json_data[str(user_id)] = item_ids
        f.write(' '.join([str(user_id)] + [
            str(item_id) for item_id in item_ids
        ]))
        f.write('\n')

In [None]:
import json

# Persist the user -> item-sequence mapping as pretty-printed JSON.
with open('../data/Beauty/inter.json', 'w') as json_file:
    json.dump(json_data, json_file, indent=2)

## Timestamp-based split (80% for train, 10% for valid, and 10% for test)

In [None]:
valid_portion = 0.1
test_portion = 0.1

# `user_history` was referenced by the split cells but never defined in the
# notebook, so they failed with NameError on a fresh kernel. Rebuild it from the
# per-user frame: user id -> chronologically ordered list of interactions, each a
# dict with the 'item_id' and 'timestamp' keys that the cells below read.
# NOTE(review): uses the re-indexed ids (new_user_id / new_item_id), matching
# all_data.txt - confirm this is the intended id space.
user_history = {
    user_id: [
        {'item_id': item_id, 'timestamp': timestamp}
        for item_id, timestamp in zip(item_ids, timestamps)
    ]
    for user_id, timestamps, item_ids in grouped_filtered_df.select(
        'new_user_id', 'timestamp', 'new_item_id'
    ).iter_rows()
}

# All interaction timestamps across all users, in ascending order.
all_events_timestamp = sorted(
    user_interaction['timestamp']
    for user_interactions in user_history.values()
    for user_interaction in user_interactions
)

# Timestamps at the 80% and 90% positions of the global timeline: events before
# `fst_threshold` are train, events in [fst_threshold, snd_threshold) are valid,
# and the rest are test.
fst_threshold = all_events_timestamp[int(len(all_events_timestamp) * (1.0 - test_portion - valid_portion))]
snd_threshold = all_events_timestamp[int(len(all_events_timestamp) * (1.0 - test_portion))]

print(f'First train timestamp:\t{all_events_timestamp[0]}')
print(f'First valid timestamp:\t{fst_threshold}')
print(f'First test timestamp:\t{snd_threshold}')

In [None]:
# Split every user's history by the two global timestamp thresholds.
#   * train: one sample per user holding all of their pre-`fst_threshold` events
#   * valid/test: one sample per event in the respective window, carrying a copy of
#     everything the user did before that event
# Samples with fewer than 5 preceding events are skipped (cold-start users).
train_samples = []
valid_samples = []
test_samples = []

for user_id, user_interactions in user_history.items():
    seen = []        # every interaction of this user processed so far
    train_part = []  # only the interactions that fall into the train window

    for event in user_interactions:
        ts = event['timestamp']

        if ts < fst_threshold:  # train event
            # Events are expected in non-decreasing timestamp order.
            assert not seen or ts >= seen[-1]['timestamp']
            train_part.append(event)
        else:
            if ts < snd_threshold:  # valid event
                assert ts >= fst_threshold
                bucket = valid_samples
            else:  # test event
                assert ts >= snd_threshold
                bucket = test_samples

            if len(seen) >= 5:  # remove cold-start users
                bucket.append({
                    'user_id': user_id,
                    'history': list(seen),  # snapshot; `seen` keeps growing
                    'next_interaction': event
                })

        seen.append(event)

    if len(train_part) >= 5:  # remove cold-start users
        train_samples.append({
            'user_id': user_id,
            'history': train_part
        })

In [None]:
# Number of samples in each split, displayed as (train, valid, test).
len(train_samples), len(valid_samples), len(test_samples)

In [None]:
def _write_sequences(path, samples, with_target):
    """Write one line per sample: "<user_id> <item_id> ... [<target_item_id>]".

    History items are written in ascending timestamp order. When ``with_target`` is
    true, the item id of the sample's 'next_interaction' is appended as the last token.
    """
    with open(path, 'w') as f:
        for sample in samples:
            ordered_history = sorted(sample['history'], key=lambda x: x['timestamp'])
            tokens = [str(sample['user_id'])]
            tokens.extend(str(user_interaction['item_id']) for user_interaction in ordered_history)
            if with_target:
                tokens.append(str(sample['next_interaction']['item_id']))
            f.write(' '.join(tokens))
            f.write('\n')


# train: history only (train samples carry no 'next_interaction')
_write_sequences('../data/Beauty/train.txt', train_samples, with_target=False)

# valid: history followed by the held-out next item
_write_sequences('../data/Beauty/valid.txt', valid_samples, with_target=True)

# test: history followed by the held-out next item
_write_sequences('../data/Beauty/test.txt', test_samples, with_target=True)