In [60]:
from polara import get_movielens_data
from polara.preprocessing.dataframes import reindex, leave_one_out

import os

# Location of the MovieLens archive. The default is the original,
# machine-specific path; set MOVIELENS_DATA_PATH to run the notebook elsewhere
# without editing this cell.
path = os.environ.get(
    'MOVIELENS_DATA_PATH',
    '/Users/a.bredikhin/Downloads/ml-10M100K/data.zip',
)
# include_time=True keeps the `timestamp` column that the time-based split relies on.
mldata = get_movielens_data(path, include_time=True)

In [61]:
# Preview the first rows of the loaded ratings to check the available columns.
mldata.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [62]:
def transform_indices(data, users, items):
    """
    Replace the user and item identifier columns with integer category codes.

    `data` is modified in place (pass a copy to keep the original intact) and
    is also returned for convenience.

    Parameters
    ----------
    data : DataFrame holding the interactions.
    users : name of the column with user identifiers.
    items : name of the column with item identifiers.

    Returns
    -------
    (data, data_index), where `data_index` maps 'users' and 'items' to the
    index of original identifiers produced by `to_numeric_id`.
    """
    data_index = {}
    for entity, field in (('users', users), ('items', items)):
        codes, index_map = to_numeric_id(data, field)
        # overwrite the original identifiers with their numeric codes
        data.loc[:, field] = codes
        data_index[entity] = index_map
    return data, data_index

def to_numeric_id(data, field):
    """
    Encode the values of column `field` as integer category codes.

    Returns a pair: the per-row codes (a Series aligned with `data`) and the
    index of distinct original values, named `field`, whose positions
    correspond to the codes. `data` itself is not modified.
    """
    categorical = data[field].astype("category").cat
    return categorical.codes, categorical.categories.rename(field)


def matrix_from_data(data, data_description, dtype=None):
    '''
    Converts pandas DataFrame into sparse CSR matrix.
    Assumes data in the DataFrame is already normalized via `transform_indices`.

    Parameters
    ----------
    data : DataFrame with one row per observed user-item interaction.
    data_description : dict with the keys
        'users', 'items' - names of the columns holding row / column indices;
        'n_users', 'n_items' - dimensions of the resulting matrix;
        'feedback' (optional) - name of the column with interaction values.
        When it is missing or None, every interaction gets the value 1.
    dtype : optional dtype passed to the matrix constructor.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_users, n_items). Repeated
    (user, item) pairs are summed by the constructor.
    '''
    # Imported locally so that the function does not depend on which other
    # notebook cells have been executed (`csr_matrix` is not imported by the
    # cells that precede this definition).
    import numpy as np
    from scipy.sparse import csr_matrix

    # get indices of observed data
    user_idx = data[data_description['users']].values
    item_idx = data[data_description['items']].values
    feedback_data = data_description.get('feedback', None)
    if feedback_data is not None:
        feedback = data[feedback_data].values
    else:
        # implicit feedback: every observed interaction counts as 1
        feedback = np.ones(len(user_idx))
    # construct rating matrix
    shape = (data_description['n_users'], data_description['n_items'])
    return csr_matrix((feedback, (user_idx, item_idx)), shape=shape, dtype=dtype)

In [63]:
# test_timepoint = mldata['timestamp'].quantile(
#     q=0.9, interpolation='nearest'
# )

# test_data_ = mldata.query('timestamp >= @test_timepoint')


# train_data_ = mldata.query(
#     'userid not in @test_data_.userid.unique() and timestamp < @test_timepoint'
# )

# training, data_index = transform_indices(train_data_.copy(), 'userid', 'movieid')

# test_data = reindex(test_data_, data_index['items'])

# print(len(training), len(test_data))
# training - pd.DataFrame with normalized indices, used for model training
# test_data - the corresponding data used for testing


In [64]:
def get_train_test_hold(mldata, test_quantile=0.95):
    """
    Split interactions by time into a training part and a test part.

    The timestamp located at `test_quantile` is the split point: interactions
    at or after it form the test part. The training part contains the earlier
    interactions of users that have no interactions in the test part, so the
    two parts share no users.

    Parameters
    ----------
    mldata : DataFrame with `userid`, `movieid` and `timestamp` columns.
    test_quantile : quantile of `timestamp` used as the split point
        (default 0.95, i.e. the latest ~5% of interactions go to the test part).

    Returns
    -------
    training : training interactions with `userid` / `movieid` replaced by
        numeric codes (see `transform_indices`).
    test_data : test interactions with `movieid` re-indexed against the
        training item index; `userid` is left as in `mldata`.
    data_index : dict with the 'users' and 'items' indices of the training part.

    Also prints the sizes of both parts and their numbers of distinct users.
    """
    test_timepoint = mldata['timestamp'].quantile(
        q=test_quantile, interpolation='nearest')

    test_data_ = mldata.query('timestamp >= @test_timepoint')

    # drop test users from training entirely, not only their late interactions
    train_data_ = mldata.query(
        'userid not in @test_data_.userid.unique() and timestamp < @test_timepoint'
    )

    # work on a copy: transform_indices overwrites the id columns in place
    training, data_index = transform_indices(train_data_.copy(), 'userid', 'movieid')

    # NOTE(review): presumably `reindex` also drops test rows whose items are
    # absent from the training item index - confirm against polara.
    test_data = reindex(test_data_, data_index['items'])

    print(len(training), len(test_data))
    print(len(training['userid'].unique()))
    print(len(test_data['userid'].unique()))
    return training, test_data, data_index

In [65]:
# Time-based split of the full dataset; the call also prints the part sizes and user counts.
training, test_data, data_index = get_train_test_hold(mldata)

Filtered 23202 invalid observations.
8286928 476802
64680
5132


In [66]:
# Split the test interactions into an observed part and a holdout part.
# NOTE(review): presumably target='timestamp' with sample_top=True holds out the
# latest interaction of every test user - confirm against polara's leave_one_out.
testset_, holdout_ = leave_one_out(
    test_data, target='timestamp', sample_top=True, random_state=0
)

In [67]:
# Preview the observed part of the test interactions.
testset_.head()

Unnamed: 0,userid,movieid,rating,timestamp
5088,48,264,5.0,1215134949
5089,48,515,3.5,1215134907
5090,48,1888,3.5,1215135112
5091,48,2363,5.0,1215134977
5092,48,2611,3.5,1215135188


In [68]:
# Preview the held-out interactions.
holdout_.head()

Unnamed: 0,userid,movieid,rating,timestamp
7443062,53198,850,4.0,1228228008
5144575,36784,5026,3.0,1228929915
480732,3667,9724,4.5,1218018730
5505701,39301,3075,4.5,1225575463
9681940,69383,4210,3.0,1229403194


In [69]:
import pandas as pd
import numpy as np

userid = data_index['users'].name
# Only users present in both the observed part and the holdout can be evaluated.
test_users = pd.Index(
    np.intersect1d(testset_[userid].unique(), holdout_[userid].unique())
)
# Replace user ids with their position in `test_users`; ids missing from it
# map to -1 and are filtered out by the query.
testset = (
    testset_
    .assign(**{userid: test_users.get_indexer(testset_[userid])})
    .query(f'{userid} >= 0')
    .sort_values('userid')
)
holdout = (
    holdout_
    .assign(**{userid: test_users.get_indexer(holdout_[userid])})
    .query(f'{userid} >= 0')
    .sort_values('userid')
)

In [70]:
# Column names and matrix dimensions of the training data, collected in one place.
data_description = {
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    'order': 'timestamp',
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
}
data_description

{'users': 'userid',
 'items': 'movieid',
 'order': 'timestamp',
 'n_users': 64680,
 'n_items': 9857}

In [71]:
# Inspect the training interactions (user and item ids are numeric codes here).
training

Unnamed: 0,userid,movieid,rating,timestamp
0,0,120,5.0,838985046
1,0,183,5.0,838983525
2,0,228,5.0,838983392
3,0,289,5.0,838983421
4,0,313,5.0,838983392
...,...,...,...,...
10000049,64679,2023,1.0,912580553
10000050,64679,2042,2.0,912649143
10000051,64679,2210,5.0,912577968
10000052,64679,2254,2.0,912578016


In [72]:
# Inspect the test interactions returned by the split (before leave-one-out).
test_data

Unnamed: 0,userid,movieid,rating,timestamp
5088,48,264,5.0,1215134949
5089,48,515,3.5,1215134907
5090,48,1888,3.5,1215135112
5091,48,2363,5.0,1215134977
5092,48,2611,3.5,1215135188
...,...,...,...,...
9999653,71562,9392,4.0,1215124142
9999654,71562,9625,4.0,1216572048
9999655,71562,9696,4.0,1215125837
9999656,71562,9727,4.5,1215124018


In [73]:
# Inspect the holdout after the test users were re-indexed.
holdout

Unnamed: 0,userid,movieid,rating,timestamp
5097,0,4630,3.5,1215135740
7651,1,2319,0.5,1213948239
8922,2,527,4.0,1217063650
10035,3,9342,4.0,1215149522
13448,4,262,3.0,1226343925
...,...,...,...,...
9989504,4991,3272,4.0,1215625082
9992075,4992,6815,3.5,1227019469
9994972,4993,4228,4.0,1215103104
9995551,4994,7110,4.0,1223625402


In [74]:
# Persist the 2-level split. The observed part of the test users (`testset`)
# is saved rather than the raw `test_data`: `test_data` still contains the
# held-out interactions and the original user ids, so it would overlap with
# `holdout` and would not share its user numbering. This matches the 1-level
# export, which also saves the re-indexed observed part.
training.to_csv('../data/training2level.csv', index=False)
testset.to_csv('../data/test2level.csv', index=False)
holdout.to_csv('../data/holdout2level.csv', index=False)

In [75]:
# 1 level: apply the same time-based split to the 2-level training data
# to obtain an inner training / test pair.
training1level, test1level, data_index_1level = get_train_test_hold(training)

Filtered 16400 invalid observations.
7368284 397947
60517
4100


In [76]:
# Split the 1-level test interactions into an observed part and a holdout part.
# NOTE(review): presumably target='timestamp' with sample_top=True holds out the
# latest interaction of every test user - confirm against polara's leave_one_out.
testset_1level, holdout_1level = leave_one_out(
    test1level, target='timestamp', sample_top=True, random_state=0
)

In [77]:
import pandas as pd
import numpy as np

userid = data_index_1level['users'].name
# Only users present in both the observed part and the holdout can be evaluated.
test_users = pd.Index(
    np.intersect1d(testset_1level[userid].unique(), holdout_1level[userid].unique())
)
# Replace user ids with their position in `test_users`; ids missing from it
# map to -1 and are filtered out by the query.
testset1l = (
    testset_1level
    .assign(**{userid: test_users.get_indexer(testset_1level[userid])})
    .query(f'{userid} >= 0')
    .sort_values('userid')
)
holdout1l = (
    holdout_1level
    .assign(**{userid: test_users.get_indexer(holdout_1level[userid])})
    .query(f'{userid} >= 0')
    .sort_values('userid')
)

In [81]:
# Number of distinct users in the 1-level holdout.
holdout1l['userid'].nunique()

3973

In [80]:
# Number of distinct users in the 1-level training data.
training1level['userid'].nunique()

60517

In [82]:
# Persist the 1-level split: training data, the observed part of the test
# users and their holdout, all written without the DataFrame index.
training1level.to_csv('../data/training1level.csv', index=False)
testset1l.to_csv('../data/test1level.csv', index=False)
holdout1l.to_csv('../data/holdout1level.csv', index=False)