In [None]:
import numpy as np
import polars as pl
import random

In [None]:
# Load the raw interaction log. The CSV has no header row, so the column
# names and their dtypes are supplied explicitly.
path_to_df = '../data/Clothing/data.csv'

# Insertion order of this mapping is the column order in the file.
column_dtypes = {
    'user_id': pl.String,
    'item_id': pl.String,
    'rating': pl.Float64,
    'timestamp': pl.UInt64,
}

df = pl.read_csv(
    path_to_df,
    has_header=False,
    separator=',',
    new_columns=list(column_dtypes),
    schema_overrides=column_dtypes,
)
print(df.shape)

In [None]:
# Preview the first rows of the raw interaction log.
df.head(5)

In [None]:
# Iterative k-core filtering: keep dropping users and items that have fewer
# than `threshold` interactions until a full pass removes no rows. A single
# pass is not enough, because removing a rare item can push a user below the
# threshold (and vice versa).
filtering_stage = 0
is_changed = True
threshold = 5

filtered_df = df.clone()

while is_changed:
    # Interaction counts are taken on the frame as it stands before this pass.
    user_counts = filtered_df.group_by('user_id').agg(
        pl.len().alias('user_count'),
    )
    item_counts = filtered_df.group_by('item_id').agg(
        pl.len().alias('item_count'),
    )

    good_users = user_counts.filter(pl.col('user_count') >= threshold).select(
        'user_id',
    )
    good_items = item_counts.filter(pl.col('item_count') >= threshold).select(
        'item_id',
    )

    old_size = len(filtered_df)

    # Both key frames hold unique ids (they come from a group_by), so the
    # inner joins only drop rows and never duplicate them.
    new_df = filtered_df.join(good_users, on='user_id', how='inner')
    new_df = new_df.join(good_items, on='item_id', how='inner')

    new_size = len(new_df)

    # Report what actually survived this pass. The sizes of good_users /
    # good_items are computed before the joins and can overstate the result:
    # a user may lose interactions (or disappear) once rare items are removed.
    users_left = new_df.get_column('user_id').n_unique()
    items_left = new_df.get_column('item_id').n_unique()

    print(f'После {filtering_stage + 1}го этапа фильтрации.')
    print(f'Количество пользователей: {users_left}.')
    print(f'Количество товаров: {items_left}')
    print()

    filtered_df = new_df
    is_changed = old_size != new_size
    filtering_stage += 1


# Replace the string ids with dense ranks, i.e. consecutive integers that
# follow the sort order of the original ids. The two expressions are
# independent, so they can be evaluated in a single with_columns call.
filtered_df = filtered_df.with_columns(
    user_id=pl.col('user_id').rank('dense'),
    item_id=pl.col('item_id').rank('dense'),
)

# Chronological order inside every user. maintain_order=True keeps
# interactions that share a (user_id, timestamp) key in their incoming order
# rather than in an unspecified one; that order decides which items end up in
# the history and which in the target further down.
filtered_df = filtered_df.sort(['user_id', 'timestamp'], maintain_order=True)

# One row per user; every other column becomes a list in the sorted order.
grouped_filtered_df = filtered_df.group_by('user_id', maintain_order=True).agg(
    pl.all().exclude('user_id'),
)

In [None]:
# Summary statistics of the filtered interaction log.
# Length of every user's item list, as plain Python ints.
history_lengths = grouped_filtered_df.get_column('item_id').list.len().to_list()

print('Users count:', filtered_df.get_column('user_id').n_unique())
print('Items count:', filtered_df.get_column('item_id').n_unique())
print('Actions count:', filtered_df.height)
print('Avg user history len:', np.mean(history_lengths))

In [None]:
# Collapse the interaction log to one row per user, keeping users in the order
# they appear in filtered_df; every non-key column is aggregated into a list.
grouped_filtered_df = (
    filtered_df
    .group_by('user_id', maintain_order=True)
    .agg(pl.exclude('user_id'))
)


In [None]:
# Preview the first per-user rows (list-valued columns).
grouped_filtered_df.head(5)

In [None]:
# User-level split: every user lands in exactly one of train / valid / test.
valid_portion = 0.1
test_portion = 0.1

all_user_ids = grouped_filtered_df['user_id'].to_list()

# Fixed seed so the shuffled order, and therefore the split, is reproducible.
random.seed(42)
random.shuffle(all_user_ids)

n_users = len(all_user_ids)
n_train = int(n_users * (1.0 - valid_portion - test_portion))
n_valid = int(n_users * valid_portion)

# Consecutive slices of the shuffled list; test takes whatever remains after
# train and valid, so rounding never drops a user.
valid_end = n_train + n_valid
train_user_ids = set(all_user_ids[:n_train])
valid_user_ids = set(all_user_ids[n_train:valid_end])
test_user_ids = set(all_user_ids[valid_end:])

print(f"Users count: {n_users}")
print(f"Train users count: {len(train_user_ids)}")
print(f"Valid users count: {len(valid_user_ids)}")
print(f"Test users count: {len(test_user_ids)}")

In [None]:
# Turn every user's item list into a sample for the split the user belongs to.
train_samples = []
valid_samples = []
test_samples = []

# Train histories keep only the most recent `max_len_train` items.
max_len_train = 20


def _split_history(item_history, history_portion=0.8, min_len=5):
    """Split one user's item list into a (history, target) pair.

    The first ``int(len(item_history) * history_portion)`` items form the
    history and the remaining items form the target.

    Raises:
        AssertionError: if the list is shorter than ``min_len`` or if either
            part of the split would be empty.
    """
    assert len(item_history) >= min_len

    split_idx = int(len(item_history) * history_portion)
    # Written as a chained comparison on purpose: `not a or b` parses as
    # `(not a) or b`, which would let an out-of-range split index through.
    assert 1 <= split_idx < len(item_history)

    return item_history[:split_idx], item_history[split_idx:]


# Each row is (user_id, item list, rating list, timestamp list); only the
# first two are needed here.
for user_id, item_history, _, _ in grouped_filtered_df.iter_rows():
    if user_id in train_user_ids:
        train_samples.append(
            {
                'user_id': user_id,
                'history': item_history[-max_len_train:],
            },
        )
        continue

    # Valid and test users are processed identically; only the destination
    # list differs.
    if user_id in valid_user_ids:
        eval_samples = valid_samples
    elif user_id in test_user_ids:
        eval_samples = test_samples
    else:
        # User is in none of the three splits: skip.
        continue

    history, target = _split_history(item_history)
    eval_samples.append(
        {
            'user_id': user_id,
            'history': history,
            'target': target,
        },
    )
    


In [None]:
# Number of samples per split, in (train, valid, test) order.
tuple(map(len, (train_samples, valid_samples, test_samples)))

In [None]:
# Write each split as a text file with one line per user:
#   "<user_id> <item> <item> ..."
# Train lines carry only the history; valid and test lines carry the history
# immediately followed by the target items, with no separator between them.
split_files = (
    ('../data/Clothing/train.txt', train_samples, ('history',)),
    ('../data/Clothing/valid.txt', valid_samples, ('history', 'target')),
    ('../data/Clothing/test.txt', test_samples, ('history', 'target')),
)

for out_path, samples, item_keys in split_files:
    with open(out_path, 'w') as f:
        for sample in samples:
            tokens = [str(sample['user_id'])]
            for key in item_keys:
                tokens.extend(str(item_id) for item_id in sample[key])
            f.write(' '.join(tokens))
            f.write('\n')