In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz, save_npz, coo_matrix

In [2]:
# Load the user and game tables that define the id space of the score matrix.
users = pd.read_csv('./data/users.csv')
games = pd.read_csv('./data/games.csv')

# Distinct ids, in order of first appearance in each table.
unique_user_ids = users['user_id'].unique()
unique_game_ids = games['app_id'].unique()

# Forward lookups: id -> position in the corresponding unique-id array.
user_index = dict(zip(unique_user_ids, range(len(unique_user_ids))))
app_index = dict(zip(unique_game_ids, range(len(unique_game_ids))))

# Reverse lookups: position -> id.
reverse_user_index = dict(enumerate(unique_user_ids))
reverse_app_index = dict(enumerate(unique_game_ids))

In [3]:
# load_npz returns the matrix in whatever sparse format it was saved in, and
# the .row / .col attributes used below exist only on COO matrices, so convert
# explicitly (a no-copy operation when the matrix is already COO).
rating_matrix_sparse = load_npz('./data/score_matrix.npz').tocoo()
row_indices = rating_matrix_sparse.row
col_indices = rating_matrix_sparse.col
data = rating_matrix_sparse.data

# One (row, col, value) triple per stored entry. Stacking promotes the integer
# index columns to a common dtype with `data`.
entries = np.column_stack((row_indices, col_indices, data))

# Per-row total of the strictly positive stored values (non-positive values
# contribute 0 to the sum).
# NOTE(review): despite the name, this is a weighted sum, not a count of
# positive entries; the two coincide only if every positive value equals 1.
# If a count is intended, use `weights=positive_interactions` -- TODO confirm.
positive_interactions = data > 0
user_row_counts_positive = np.bincount(
    row_indices,
    weights=data * positive_interactions,
    minlength=rating_matrix_sparse.shape[0],
)

In [4]:
# Tunable parameters of the held-out user sample.
MIN_POSITIVE_TOTAL = 15   # threshold applied to user_row_counts_positive
NUM_TEST_USERS = 10000    # number of users moved to the test set
TEST_USER_SEED = 42       # fixes which users are sampled

# Rows whose positive total reaches the threshold, and all of their entries.
users_15 = np.where(user_row_counts_positive >= MIN_POSITIVE_TOTAL)[0]
filtered_entries_15 = entries[np.isin(entries[:, 0], users_15)]
unique_users_15 = np.unique(filtered_entries_15[:, 0])

# Sample the test users with a dedicated, seeded generator so that the split
# is identical on every Restart & Run All and the global NumPy RNG state is
# left untouched. `choice` raises ValueError if fewer than NUM_TEST_USERS
# users are eligible.
test_user_rng = np.random.RandomState(TEST_USER_SEED)
selected_users_15 = test_user_rng.choice(unique_users_15, size=NUM_TEST_USERS, replace=False)

# Every stored entry belonging to a sampled user.
test_entries_15 = filtered_entries_15[np.isin(filtered_entries_15[:, 0], selected_users_15)]

In [5]:
# Held-out entries: every stored interaction of the sampled test users.
# (The previous np.vstack((test_entries_15)) call was a slow no-op: the inner
# parentheses do not create a tuple, so each row was unpacked and re-stacked.)
test_entries = test_entries_15.copy()

# test_entries_15 holds *all* entries of the users in selected_users_15, so
# the training set is exactly the entries of every other user. A vectorised
# membership test on the row column replaces the per-entry Python tuple
# lookup, which was prohibitively slow and memory-hungry on large matrices.
train_mask = ~np.isin(entries[:, 0], selected_users_15)
train_entries = entries[train_mask]

# The two parts must partition the stored entries.
assert len(train_entries) + len(test_entries) == len(entries)

num_users, num_games = rating_matrix_sparse.shape

# The index columns of `entries` share the dtype of the value column, so cast
# them back to integers explicitly before building the COO matrices.
train_matrix = coo_matrix(
    (
        train_entries[:, 2],
        (train_entries[:, 0].astype(np.int64), train_entries[:, 1].astype(np.int64)),
    ),
    shape=(num_users, num_games),
)
test_matrix = coo_matrix(
    (
        test_entries[:, 2],
        (test_entries[:, 0].astype(np.int64), test_entries[:, 1].astype(np.int64)),
    ),
    shape=(num_users, num_games),
)

save_npz('./data/train_matrix.npz', train_matrix)
save_npz('./data/test_matrix.npz', test_matrix)

In [6]:
# Reload the persisted split. load_npz returns the stored sparse format and
# .row / .col / .data are only available on COO, so normalise both matrices
# (train_matrix.row and train_matrix.col are used further down as well).
train_matrix = load_npz('./data/train_matrix.npz').tocoo()
test_matrix = load_npz('./data/test_matrix.npz').tocoo()

test_row = test_matrix.row
test_col = test_matrix.col
test_data = test_matrix.data

# Group the held-out (column, value) pairs by user row, keeping users in order
# of first appearance. Loop variables are deliberately NOT called `data` or
# `entries`: those names hold the full score-matrix arrays in earlier cells
# and must not be clobbered, otherwise re-running those cells breaks.
test_entries_by_user = {}
for user_row, game_col, score in zip(test_row, test_col, test_data):
    test_entries_by_user.setdefault(user_row, []).append((game_col, score))

# Portion of each test user's entries that is added to the training data.
train_and_test_rows = []
train_and_test_cols = []
train_and_test_data = []

# Remaining portion of each test user's entries, kept for evaluation.
rest_test_rows = []
rest_test_cols = []
rest_test_data = []

np.random.seed(42)

for user_row, user_pairs in test_entries_by_user.items():
    # 2-D array with one (column, value) row per entry; shuffle permutes rows.
    shuffled_pairs = np.array(user_pairs)
    np.random.shuffle(shuffled_pairs)

    # First 60% (rounded down) is revealed to training, the rest is held out.
    split_idx = int(len(shuffled_pairs) * 0.6)
    revealed_pairs = shuffled_pairs[:split_idx]
    held_out_pairs = shuffled_pairs[split_idx:]

    train_and_test_rows.extend([user_row] * len(revealed_pairs))
    train_and_test_cols.extend(revealed_pairs[:, 0])
    train_and_test_data.extend(revealed_pairs[:, 1])

    rest_test_rows.extend([user_row] * len(held_out_pairs))
    rest_test_cols.extend(held_out_pairs[:, 0])
    rest_test_data.extend(held_out_pairs[:, 1])

# Column indices passed through the mixed (column, value) array above and so
# carry the value dtype; cast all index lists back to integers explicitly.
train_and_test_matrix = coo_matrix((
    np.hstack([train_matrix.data, train_and_test_data]),
    (
        np.hstack([train_matrix.row, np.asarray(train_and_test_rows, dtype=np.int64)]),
        np.hstack([train_matrix.col, np.asarray(train_and_test_cols, dtype=np.int64)])
    )
), shape=train_matrix.shape)

rest_test_matrix = coo_matrix((
    rest_test_data,
    (
        np.asarray(rest_test_rows, dtype=np.int64),
        np.asarray(rest_test_cols, dtype=np.int64)
    )
), shape=test_matrix.shape)

# Every stored entry must end up in exactly one of the two output matrices.
assert train_and_test_matrix.nnz + rest_test_matrix.nnz == train_matrix.nnz + test_matrix.nnz

save_npz('./data/train_and_test.npz', train_and_test_matrix)
save_npz('./data/rest_test.npz', rest_test_matrix)