## Data preprocessing

We load the data and create train/validation/test splits following strong generalization: 

- We split all users into training/validation/test sets. 

- We train models using the entire click history of the training users. 

- To evaluate, we take part of the click history from held-out (validation and test) users to learn the necessary user-level representations for the model and then compute metrics by looking at how well the model ranks the rest of the unseen click history from the held-out users.

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import os
import sys

In [2]:
# Load the raw ratings table. The location is a hard-coded local path,
# so it has to be adjusted when running on another machine.
ratings_path = r'C:\Users\FOMO\Desktop\Proj\Dataset\Douban\ratings.csv'
df = pd.read_csv(ratings_path)

In [3]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,user_id,book_id,rating
0,228153,2348372,4
1,228153,3216007,5
2,228153,1261560,5
3,228153,3138847,5
4,228153,1044177,5


In [4]:
# Keep only interactions whose rating is strictly greater than the threshold
# (with threshold = 3 this retains ratings of 4 and above).
threshold = 3  # rating threshold
raw_data = df.loc[df['rating'] > threshold]

In [5]:
# Filtering thresholds used as the default arguments of filter_triplets:
#   - users need at least `min_items_per_user` interactions to be kept;
#   - a value of 0 for `min_users_per_item` disables the item-level filter.
min_items_per_user, min_users_per_item = 5, 0

### Data splitting procedure

- Select 10K users as held-out test users, 10K users as validation users, and the rest of the users for training
- Use all the items from the training users as item set
- For each validation and test user, subsample 80% of their interactions as fold-in data and keep the remaining 20% for prediction

In [6]:
def get_count(tp, id):
    """Return the number of rows of `tp` for each distinct value of column `id`.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interaction table.
    id : str
        Name of the column to group by (e.g. 'user_id' or 'book_id').

    Returns
    -------
    pandas.Series
        Group sizes, indexed by the distinct values of `tp[id]`.
    """
    return tp.groupby(id).size()

In [7]:
def filter_triplets(tp, min_uc=min_items_per_user, min_sc=min_users_per_item):
    """Drop rarely-seen items and low-activity users from an interaction table.

    The item filter runs first and the user filter second. Because users are
    removed afterwards, the returned item counts are not re-checked against
    `min_sc` and may fall below it.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interaction table with 'user_id' and 'book_id' columns.
    min_uc : int
        Minimum number of rows a user needs to be kept; skipped when <= 0.
    min_sc : int
        Minimum number of rows an item needs to be kept; skipped when <= 0.

    Returns
    -------
    (pandas.DataFrame, pandas.Series, pandas.Series)
        The filtered table, the per-user row counts and the per-item row
        counts, both computed on the filtered table.
    """
    # Item-level filter (only applied when a positive minimum is requested).
    if min_sc > 0:
        item_sizes = get_count(tp, 'book_id')
        kept_items = item_sizes.index[item_sizes >= min_sc]
        tp = tp[tp['book_id'].isin(kept_items)]

    # User-level filter (only applied when a positive minimum is requested).
    if min_uc > 0:
        user_sizes = get_count(tp, 'user_id')
        kept_users = user_sizes.index[user_sizes >= min_uc]
        tp = tp[tp['user_id'].isin(kept_users)]

    # Recompute both counts on whatever survived the filters.
    usercount = get_count(tp, 'user_id')
    itemcount = get_count(tp, 'book_id')
    return tp, usercount, itemcount

Only keep users with at least 5 interactions (`min_items_per_user = 5`). No item-level filter is applied, because `min_users_per_item = 0`.

In [8]:
# Apply the filters with their default thresholds. Note that this rebinds
# `raw_data` to the filtered table; `user_activity` and `item_popularity`
# hold the per-user and per-item row counts of that filtered table.
raw_data, user_activity, item_popularity = filter_triplets(raw_data)

In [9]:
# Share of all possible (user, book) pairs that actually occur in the
# filtered data, reported below as a percentage.
n_events = raw_data.shape[0]
n_users_kept = user_activity.shape[0]
n_books_kept = item_popularity.shape[0]
sparsity = 1. * n_events / (n_users_kept * n_books_kept)

print("After filtering, there are %d reading events from %d users and %d books (sparsity: %.3f%%)" %
      (n_events, n_users_kept, n_books_kept, sparsity * 100))

After filtering, there are 2261703 reading events from 152842 users and 78238 books (sparsity: 0.019%)


In [10]:
# Shuffle the user IDs with a fixed seed so that the train/validation/test
# user split taken from this ordering is reproducible.
np.random.seed(98765)
idx_perm = np.random.permutation(len(user_activity.index))
unique_uid = user_activity.index[idx_perm]

In [11]:
# Number of users placed in each of the validation and test sets.
n_heldout_users = 10_000

In [12]:
# Create train/validation/test users by cutting the shuffled ID list into
# three consecutive slices: the last `n_heldout_users` IDs are the test users,
# the `n_heldout_users` before them are the validation users, and everything
# in front is used for training.
n_users = unique_uid.size
vd_start = n_users - n_heldout_users * 2
te_start = n_users - n_heldout_users

tr_users = unique_uid[:vd_start]
vd_users = unique_uid[vd_start:te_start]
te_users = unique_uid[te_start:]

In [13]:
# Sizes of the training, validation and test user sets, in that order.
tuple(len(users) for users in (tr_users, vd_users, te_users))

(132842, 10000, 10000)

In [14]:
# All interactions that belong to training users.
is_train_user = raw_data['user_id'].isin(tr_users)
train_plays = raw_data.loc[is_train_user]

In [15]:
# The item vocabulary is the set of books seen in the training interactions,
# kept in order of first appearance.
unique_sid = train_plays['book_id'].unique()

In [16]:
# Map raw IDs to contiguous integer indices: the index of an ID is its
# position in `unique_sid` (books) or `unique_uid` (users).
show2id = {sid: i for i, sid in enumerate(unique_sid)}
profile2id = {pid: i for i, pid in enumerate(unique_uid)}

In [17]:
# Directory that receives the ID lists and the train/validation/test CSVs.
# NOTE: hard-coded absolute local path; adjust when running on another machine.
output_dir = r'C:\Users\FOMO\Desktop\Proj\Dataset\Douban\processed_data_2'

In [18]:
# Make sure the output directory exists before anything is written to it.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Write one raw ID per line, in the same order as `unique_sid` / `unique_uid`.
for filename, ids in (('unique_sid.txt', unique_sid), ('unique_uid.txt', unique_uid)):
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
        f.writelines('%s\n' % raw_id for raw_id in ids)

In [19]:
def split_train_test_proportion(data, test_prop=0.2, min_items=5, seed=98765):
    """Split each user's interactions into a fold-in part and a held-out part.

    For every user with at least `min_items` rows, `int(test_prop * n_rows)`
    rows are drawn at random (without replacement) as held-out rows and the
    remainder becomes fold-in data. Users with fewer rows contribute all of
    their rows to the fold-in part and nothing to the held-out part.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table with a 'user_id' column.
    test_prop : float, default 0.2
        Fraction of a qualifying user's rows that goes to the held-out part.
    min_items : int, default 5
        Minimum number of rows a user needs before any of them are held out.
    seed : int or None, default 98765
        Value passed to `np.random.seed` before sampling. The global NumPy
        random state is reset on every call, so repeated calls with the same
        input and seed produce the same split.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The fold-in rows and the held-out rows. If either part receives no
        rows, an empty frame with the same columns as `data` is returned
        instead of a column-less frame, so downstream column lookups such as
        `result['user_id']` keep working.
    """
    tr_list, te_list = [], []

    # Re-seed the global generator so the split is reproducible per call.
    np.random.seed(seed)

    for i, (_, group) in enumerate(data.groupby('user_id')):
        n_items_u = len(group)

        if n_items_u >= min_items:
            # Boolean mask over this user's rows: True marks a held-out row.
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            # Too few rows to hold any out: keep everything as fold-in data.
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # Zero-row slice that still carries the input's columns and dtypes.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [20]:
# Interactions of validation users, restricted to books that also appear in
# the training data (anything else has no item index).
is_vd_user = raw_data['user_id'].isin(vd_users)
is_known_book = raw_data['book_id'].isin(unique_sid)
vad_plays = raw_data.loc[is_vd_user & is_known_book]

In [21]:
# Split each validation user's interactions into fold-in and held-out parts.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays, test_prop=0.2)

0 users sampled


1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [22]:
# Interactions of test users, restricted to books that also appear in the
# training data (anything else has no item index).
is_te_user = raw_data['user_id'].isin(te_users)
is_known_book = raw_data['book_id'].isin(unique_sid)
test_plays = raw_data.loc[is_te_user & is_known_book]

In [23]:
# Split each test user's interactions into fold-in and held-out parts.
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays, test_prop=0.2)

0 users sampled


1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


### Save the data into (user_index, item_index) format

In [24]:
def numerize(tp):
    """Translate raw user and book IDs into their integer indices.

    Looks every 'user_id' up in `profile2id` and every 'book_id' up in
    `show2id`; an ID missing from either mapping raises `KeyError`.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interaction table with 'user_id' and 'book_id' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns 'uid' and 'sid', one row per row of `tp`,
        in the same order.
    """
    user_idx = [profile2id[raw_uid] for raw_uid in tp['user_id']]
    item_idx = [show2id[raw_sid] for raw_sid in tp['book_id']]
    return pd.DataFrame(data={'uid': user_idx, 'sid': item_idx}, columns=['uid', 'sid'])

In [25]:
# Index-encode the training interactions and save them.
train_data = numerize(train_plays)
train_csv_path = os.path.join(output_dir, 'train.csv')
train_data.to_csv(train_csv_path, index=False)

In [None]:
# Index-encode the validation fold-in interactions and save them.
vad_data_tr = numerize(vad_plays_tr)
validation_tr_csv_path = os.path.join(output_dir, 'validation_tr.csv')
vad_data_tr.to_csv(validation_tr_csv_path, index=False)

In [None]:
# Index-encode the validation held-out interactions and save them.
vad_data_te = numerize(vad_plays_te)
validation_te_csv_path = os.path.join(output_dir, 'validation_te.csv')
vad_data_te.to_csv(validation_te_csv_path, index=False)

In [None]:
# Index-encode the test fold-in interactions and save them.
test_data_tr = numerize(test_plays_tr)
test_tr_csv_path = os.path.join(output_dir, 'test_tr.csv')
test_data_tr.to_csv(test_tr_csv_path, index=False)

In [None]:
# Index-encode the test held-out interactions and save them.
test_data_te = numerize(test_plays_te)
test_te_csv_path = os.path.join(output_dir, 'test_te.csv')
test_data_te.to_csv(test_te_csv_path, index=False)