In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from utils import set_temporary_np_seed_as

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the source CSV, relative to the notebook's working directory.
data_path = "Data/lastfm_small.csv"

# Module-level frame that gen_sampled_dataset() reads from.
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
def _remap(df: pd.DataFrame, columns: list):
    """Replace the values of each listed column with shuffled ids 1..K, in place.

    For every column, the distinct values are shuffled and then numbered
    consecutively starting at 1; each cell is replaced by the number assigned
    to its original value.

    Args:
        df: Frame to modify. It is mutated in place; nothing is returned.
        columns: Names of the columns to remap, processed in the given order.
    """
    # NOTE(review): set_temporary_np_seed_as is a project helper not shown here;
    # presumably it seeds NumPy's global RNG for the duration of the block.
    # sklearn's shuffle() draws from that global RNG when random_state is None.
    with set_temporary_np_seed_as(2022):
        for column in columns:
            # Compute the distinct values once instead of once per use.
            unique_values = df[column].unique()
            new_ids = dict(zip(shuffle(unique_values), range(1, len(unique_values) + 1)))
            # Assign a new column rather than writing through df.loc[:, column]:
            # the .loc form sets values into the existing array and therefore
            # keeps the old dtype (e.g. object) for the new integer ids.
            df[column] = df[column].map(new_ids)

In [4]:
def gen_sampled_dataset(rate: float):
    """Build a sub-dataset containing a fraction of the distinct sessions in `df`.

    Args:
        rate: Fraction of distinct `session_id` values to keep; the number of
            sampled sessions is `int(n_distinct_sessions * rate)`.

    Returns:
        A new DataFrame with every row of the sampled sessions, a fresh
        0-based index, and `session_id` / `item_id` renumbered by `_remap`.
        The module-level `df` is not modified.
    """
    # Sample from the distinct ids. Drawing from the per-row id list (one entry
    # per interaction) would let the same session be picked several times, so
    # fewer than n_sampled_users sessions would be kept and sessions with more
    # rows would be favoured.
    user_ids = df.session_id.dropna().unique()
    n_users = len(user_ids)
    n_sampled_users = int(n_users * rate)

    # NOTE(review): set_temporary_np_seed_as is a project helper not shown here;
    # presumably it seeds NumPy's global RNG so the draw is reproducible.
    with set_temporary_np_seed_as(2000):
        selected_users = np.random.choice(user_ids, n_sampled_users, replace=False)

    # reset_index returns a new frame, so the in-place remap below does not
    # touch the module-level df.
    sampled_df = df[df.session_id.isin(selected_users)].reset_index(drop=True)

    _remap(sampled_df, ['session_id', 'item_id'])

    return sampled_df

In [5]:
# Sampling rates for which a reduced dataset is written.
rate_list = [0.2, 0.4, 0.6, 0.8]

for r in rate_list:
    # One output file per rate, e.g. Data/lastfm_small_0.2.csv
    name = f"lastfm_small_{r}.csv"
    sampled_df = gen_sampled_dataset(r)
    sampled_df.to_csv(f"Data/{name}", index=False)