In [None]:
import numpy as np
import polars as pl
import pandas as pd

In [None]:
# Relative path to the raw ratings file that the loading cell reads into `df`.
path_to_df = '../data/ML10M/ratings.dat'

In [None]:
# The raw file has no header row and uses '::' between fields, so the column
# names are supplied explicitly and pandas' python engine handles the
# multi-character separator. The parsed frame is then handed over to polars.
ratings_columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pl.from_pandas(
    pd.read_csv(path_to_df, sep='::', engine='python', names=ratings_columns)
)

In [None]:
# Preview the first few rows of the freshly loaded ratings frame.
df.head()

In [None]:
# (rows, columns) of the loaded ratings frame.
df.shape

In [None]:
# Per-column null counts, as a quick sanity check on the parsed file.
df.null_count()

In [None]:
# Largest raw user id shown next to the shape of the distinct-id frame, to see
# whether the raw ids are contiguous.
raw_user_ids = df.select('user_id')
raw_user_ids.max(), raw_user_ids.unique().shape

In [None]:
# Replace user ids by their dense rank, then display min / max / distinct
# shape of the re-indexed column.
df = df.with_columns(pl.col("user_id").rank("dense").alias("user_id"))
ranked_user_ids = df.select('user_id')
ranked_user_ids.min(), ranked_user_ids.max(), ranked_user_ids.unique().shape

In [None]:
# Replace item ids by their dense rank, then display min / max / distinct
# shape of the re-indexed column.
df = df.with_columns(pl.col("item_id").rank("dense").alias("item_id"))
ranked_item_ids = df.select('item_id')
ranked_item_ids.min(), ranked_item_ids.max(), ranked_item_ids.unique().shape

In [None]:
# Preview the frame again now that user_id and item_id hold dense ranks.
df.head()

In [None]:
# Iterative threshold filtering: on every pass, keep only the users and items
# that have at least `threshold` interactions in the current snapshot, and stop
# once a pass leaves the number of rows unchanged.
threshold = 15
filtering_stage = 0
is_changed = True

filtered_df = df.clone()

while is_changed:
    filtering_stage += 1
    size_before = filtered_df.height

    # Both survivor sets are computed from the same (pre-join) snapshot.
    good_users = (
        filtered_df.group_by("user_id")
        .agg(pl.len().alias("user_count"))
        .filter(pl.col("user_count") >= threshold)
        .select("user_id")
    )
    good_items = (
        filtered_df.group_by("item_id")
        .agg(pl.len().alias("item_count"))
        .filter(pl.col("item_count") >= threshold)
        .select("item_id")
    )

    # Keep only the rows whose user and item both passed the threshold.
    filtered_df = filtered_df.join(good_users, on="user_id", how="inner").join(
        good_items, on="item_id", how="inner"
    )

    # Progress report (stage number, surviving users, surviving items).
    print(f'После {filtering_stage}го этапа фильтрации.')
    print(f'Количество пользователей: {good_users.shape[0]}.')
    print(f'Количество фильмов: {good_items.shape[0]}')
    print()

    is_changed = filtered_df.height != size_before


# Filtering leaves gaps in the id space, so both id columns are dense-ranked
# again; rows are then ordered by user and, within a user, by time.
filtered_df = filtered_df.with_columns(
    user_id=pl.col("user_id").rank("dense"),
    item_id=pl.col("item_id").rank("dense"),
).sort(["user_id", "timestamp"])

# One row per user; every other column becomes a list in the sorted row order.
grouped_filtered_df = filtered_df.group_by("user_id", maintain_order=True).agg(
    pl.all().exclude("user_id")
)

In [None]:
# Summary statistics of the filtered dataset.
print('Users count:', filtered_df.select('user_id').unique().height)
print('Items count:', filtered_df.select('item_id').unique().height)
print('Actions count:', filtered_df.height)

# Length of every user's item list, averaged over users.
history_lengths = [
    row[0]
    for row in grouped_filtered_df.select(pl.col('item_id').list.len()).rows()
]
print('Avg user history len:', np.mean(history_lengths))

## Leave-one-out split (last item for test, second-to-last item for valid, the remaining part for train)

In [None]:
# One line per user: the user id followed by that user's item ids, all
# space-separated, in the order stored in the grouped frame.
with open('../data/ML10M/all_data.txt', 'w') as f:
    for user_id, item_history, _ratings, _timestamps in grouped_filtered_df.iter_rows():
        tokens = [user_id, *item_history]
        f.write(' '.join(str(token) for token in tokens))
        f.write('\n')

## Timestamp-based split (80% for train, 10% for valid, and 10% for test)

In [None]:
# Fractions of all events (ordered by time) assigned to the validation and
# test splits; the remainder is the training split.
valid_portion = 0.1
test_portion = 0.1

# Sorted timestamps of every event. The column is read directly instead of
# unpacking every row in a Python loop: this does not depend on the number or
# order of columns in `filtered_df` and avoids materialising each row as a
# tuple just to read one field.
all_events_timestamp = filtered_df.get_column('timestamp').sort().to_list()

n_events = len(all_events_timestamp)

# Events with a timestamp below `fst_threshold` are train events, those below
# `snd_threshold` are valid events, and the rest are test events.
fst_threshold = all_events_timestamp[int(n_events * (1.0 - test_portion - valid_portion))]
snd_threshold = all_events_timestamp[int(n_events * (1.0 - test_portion))]

print(f'First train timestamp:\t{all_events_timestamp[0]}')
print(f'First valid timestamp:\t{fst_threshold}')
print(f'First test timestamp:\t{snd_threshold}')

In [None]:
# Build the three splits from each user's time-ordered history.
#  * train: one sample per user, holding every item seen before `fst_threshold`
#  * valid / test: one sample per event at or after `fst_threshold`, holding
#    everything the user did before that event plus the event itself as target
# Users / prefixes with fewer than 5 prior items are skipped (cold start).
train_samples, valid_samples, test_samples = [], [], []

for user_id, item_history, _ratings, timestamps in grouped_filtered_df.iter_rows():
    train_history = []
    history = []
    history_ts = []

    for item_id, ts in zip(item_history, timestamps):
        # Events of a user must arrive in non-decreasing time order.
        assert not history or ts >= history_ts[-1]

        if ts < fst_threshold:  # train event
            train_history.append(item_id)
        else:
            # valid event if before the second threshold, test event otherwise
            eval_samples = valid_samples if ts < snd_threshold else test_samples
            if len(history) >= 5:  # remove cold-start users
                eval_samples.append({
                    'user_id': user_id,
                    'history': list(history),  # snapshot; `history` keeps growing
                    'next_interaction': item_id
                })

        history.append(item_id)
        history_ts.append(ts)

    if len(train_history) >= 5:  # remove cold-start users
        train_samples.append({'user_id': user_id, 'history': train_history})

In [None]:
# Number of samples in the train / valid / test splits.
tuple(len(samples) for samples in (train_samples, valid_samples, test_samples))

In [None]:
def _sample_to_line(sample):
    """Render a sample as 'user_id item ... item [next_interaction]'.

    The trailing target is emitted only for samples that carry a
    'next_interaction' key (valid / test samples); train samples have none.
    """
    tokens = [sample['user_id'], *sample['history']]
    if 'next_interaction' in sample:
        tokens.append(sample['next_interaction'])
    return ' '.join(str(token) for token in tokens)


# Write each split to its own file, one sample per line: train, valid, test.
for out_path, samples in (
    ('../data/ML10M/train.txt', train_samples),
    ('../data/ML10M/valid.txt', valid_samples),
    ('../data/ML10M/test.txt', test_samples),
):
    with open(out_path, 'w') as f:
        for sample in samples:
            f.write(_sample_to_line(sample))
            f.write('\n')

In [None]:
import pickle

# Per-item interaction counts over the training histories, stored as a plain
# list where position i holds the number of occurrences of item id i.
#
# The table used to be a fixed 10000 slots, which raises IndexError as soon as
# any item id reaches 10000. It is now sized from the largest id actually
# present, with the former length kept as a lower bound so the pickled list is
# unchanged whenever the fixed size was already sufficient.
min_table_size = 10000
max_item_id = max(
    (item_id for train_sample in train_samples for item_id in train_sample['history']),
    default=-1,  # no training samples at all -> only the lower bound applies
)
data = [0] * max(min_table_size, max_item_id + 1)

for train_sample in train_samples:
    for item_id in train_sample['history']:
        data[item_id] += 1

with open('../data/ML10M/item_cnt.pkl', 'wb') as f:
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)