In [2]:
from polara import get_movielens_data
from polara.preprocessing.dataframes import reindex, leave_one_out

# Load MovieLens ratings with the timestamp column, plus the genres table.
# NOTE(review): presumably fetches the dataset when no local file is given —
# confirm against polara's `get_movielens_data`.
mldata, genres = get_movielens_data(include_time=True, get_genres=True)

In [3]:
# Preview the first rows to check which columns are available.
mldata.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
def transform_indices(data, users, items):
    '''
    Replaces raw user and item identifiers with integer category codes.

    `users` and `items` are the names of the corresponding columns of `data`.
    Each column is encoded with `to_numeric_id` and the codes are written
    back into `data` itself, so pass a copy if the original must be kept.

    Returns the modified `data` and a dict with keys 'users' and 'items'
    holding the index of original identifiers for each column.
    '''
    index_maps = {}
    for entity, column in (('users', users), ('items', items)):
        codes, originals = to_numeric_id(data, column)
        index_maps[entity] = originals
        # overwrite the raw identifiers with their codes
        data.loc[:, column] = codes
    return data, index_maps

def to_numeric_id(data, field):
    '''
    Encodes the `field` column of `data` as categorical integer codes.

    Returns a tuple `(codes, categories)`: `codes` is a Series of integer
    codes for every row of `data`, and `categories` is the index of distinct
    original values renamed to `field`, so that code `k` stands for
    `categories[k]`. The input frame is left untouched.
    '''
    as_category = data[field].astype("category").cat
    return as_category.codes, as_category.categories.rename(field)


def matrix_from_data(data, data_description, dtype=None):
    '''
    Builds a sparse CSR matrix (users in rows, items in columns) from a DataFrame.

    `data_description` must provide the column names under 'users' and 'items'
    and the matrix dimensions under 'n_users' and 'n_items'. If it also has a
    non-None 'feedback' entry, that column supplies the stored values;
    otherwise every observed (user, item) pair is stored as 1.

    Expects user and item columns to already hold integer indices,
    e.g. as produced by `transform_indices`.
    '''
    rows = data[data_description['users']].values
    cols = data[data_description['items']].values

    feedback_field = data_description.get('feedback', None)
    if feedback_field is None:
        # no explicit feedback column: mark every observed pair with a one
        values = np.ones(len(rows))
    else:
        values = data[feedback_field].values

    dimensions = (data_description['n_users'], data_description['n_items'])
    return csr_matrix((values, (rows, cols)), shape=dimensions, dtype=dtype)

In [22]:
# Time-based split: interactions at or after the 0.8 timestamp quantile form the test part.
test_timepoint = mldata['timestamp'].quantile(q=0.8, interpolation='nearest')
test_data_ = mldata[mldata['timestamp'] >= test_timepoint]

# Training keeps only earlier interactions of users that never appear in the test part.
train_data_ = mldata[
    (mldata['timestamp'] < test_timepoint)
    & ~mldata['userid'].isin(test_data_['userid'].unique())
]

# Encode training user/item ids as integer codes; a copy is passed because
# transform_indices writes into the frame it receives.
training, data_index = transform_indices(train_data_.copy(), 'userid', 'movieid')

# Apply the training item index to the test part.
# NOTE(review): judging by the printed message, rows with items missing from
# the training index are dropped — confirm against polara's `reindex`.
test_data = reindex(test_data_, data_index['items'])

# training - pd.DataFrame with normalized data indices, used for training
# test_data - data for testing
print(len(training), len(test_data))


Filtered 177 invalid observations.
551996 199868


In [31]:
# Both splits use identical settings, so keep them in one place.
# NOTE(review): with sample_top=True and target='timestamp' this presumably
# holds out each user's latest interaction — confirm against polara's `leave_one_out`.
holdout_params = dict(target='timestamp', sample_top=True, random_state=0)

# Final test split: remaining history and held-out rows.
testset_, holdout_ = leave_one_out(test_data, **holdout_params)

# Validation split: repeat the procedure on the remaining test history.
testset_valid_, holdout_valid_ = leave_one_out(testset_, **holdout_params)

In [34]:
import pandas as pd
import numpy as np

userid = data_index['users'].name


def _to_test_user_index(data, field, users):
    '''
    Re-encodes the `field` column of `data` as positions in the `users` index.

    Users absent from `users` get position -1 from `Index.get_indexer`;
    such rows are dropped. The result is sorted by `field`.
    Returns a new DataFrame; `data` itself is not modified.
    '''
    return (
        data
        .assign(**{field: users.get_indexer(data[field])})
        .query(f'{field} >= 0')
        # sort by the same column that was re-encoded, not a hard-coded name
        .sort_values(field)
    )


# ensure test users are the same across testing data
test_users = pd.Index(
    np.intersect1d(
        testset_valid_[userid].unique(),
        holdout_valid_[userid].unique()
    )
)

# reindex warm-start users for convenience
testset_valid = _to_test_user_index(testset_valid_, userid, test_users)
holdout_valid = _to_test_user_index(holdout_valid_, userid, test_users)

In [35]:
# Original user ids present in both validation frames; a user's position
# in this index is the id used in the re-encoded frames.
test_users

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            5972, 5985, 5991, 5995, 5996, 5998, 6001, 6002, 6016, 6040],
           dtype='int64', length=1705)

In [36]:
# Validation history with the user column re-encoded as positions in `test_users`.
testset_valid

Unnamed: 0,userid,movieid,rating,timestamp
0,0,1063,5,978300760
29,0,684,3,978824268
30,0,2029,4,978824291
31,0,2879,4,978300019
33,0,562,4,978824268
...,...,...,...,...
999905,1704,857,5,997454160
999871,1704,1822,4,997453982
1000169,1704,2437,4,997454180
1000019,1704,2623,4,997454429


In [37]:
# Held-out validation interactions with the user column re-encoded as positions in `test_users`.
holdout_valid

Unnamed: 0,userid,movieid,rating,timestamp
32,0,1390,4,978824330
66,1,1496,3,978300174
187,2,100,4,978298486
235,3,2657,4,978294282
372,4,2800,2,978246576
...,...,...,...,...
992874,1700,1653,2,1001832291
993738,1701,126,3,1004811957
993900,1702,1684,4,1014524758
996774,1703,3406,3,995664198


In [38]:

# Column names and matrix dimensions taken from the training index.
data_description = {
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    'order': 'timestamp',
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
}
data_description

{'users': 'userid',
 'items': 'movieid',
 'order': 'timestamp',
 'n_users': 4257,
 'n_items': 3586}

In [40]:
from scipy.sparse import csr_matrix
# `data_description` has no 'feedback' key, so every observed interaction is stored as 1.0.
m = matrix_from_data(training, data_description)

In [41]:
# Show the matrix summary: shape, dtype and number of stored elements.
m

<4257x3586 sparse matrix of type '<class 'numpy.float64'>'
	with 551996 stored elements in Compressed Sparse Row format>