# Group dataset training and evaluation

In [2]:
from recommenders.mfi import MatrixFactorizationImplicit

In [2]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import scipy.sparse as sps
import matplotlib.pyplot as plt
%matplotlib inline

## Read interactions data set

In [3]:
from utils import read_json_fast

# Load the raw user-book interactions (ratings), keep only the three columns
# needed below, and rename the id columns to `old_user` / `old_item`, the key
# names used when joining with the group split files.
interactions_file = './data/INTERACTIONS.json.gz'
df_interactions = (
    read_json_fast(interactions_file)
    .loc[:, ['user_id', 'book_id', 'rating']]
    .rename(columns={'user_id': 'old_user', 'book_id': 'old_item'})
)

Processing INTERACTIONS.json.gz:


0lines [00:00, ?lines/s]

In [4]:
# Peek at the first three rows to check the selected and renamed columns.
df_interactions.head(3)

Unnamed: 0,old_user,old_item,rating
0,8842281e1d1347389f2ab93d60773d4d,836610,0
1,8842281e1d1347389f2ab93d60773d4d,7648967,0
2,8842281e1d1347389f2ab93d60773d4d,15704307,0


In [5]:
# Show the first raw user identifier to see what the original ids look like.
first_row = df_interactions.iloc[0]
print(first_row['old_user'])

8842281e1d1347389f2ab93d60773d4d


In [6]:
# Summarise row count, column dtypes and memory footprint of the interactions.
df_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7347630 entries, 0 to 7347629
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   old_user  object
 1   old_item  object
 2   rating    int64 
dtypes: int64(1), object(2)
memory usage: 168.2+ MB


## Read group train-test split

In [7]:
# Read fold 0 of the group train/test split. `old_item` is forced to `object`
# so it is not parsed as an integer and keeps the same dtype as the
# `old_item` column of `df_interactions`.
split_dtypes = {'old_item': object}
df_grp_train0 = pd.read_csv('./group/train0.csv', dtype=split_dtypes)
df_grp_test0 = pd.read_csv('./group/test0.csv', dtype=split_dtypes)

In [8]:
# Keep only the new integer ids and the original ids needed for the join.
split_columns = ['user_id', 'item_id', 'old_user', 'old_item']
df_grp_train0 = df_grp_train0.loc[:, split_columns]
df_grp_test0 = df_grp_test0.loc[:, split_columns]

In [9]:
# `DataFrame.info()` writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() only added a stray "None" line.
df_grp_train0.info()
# Row/column counts of both halves of the split.
print(df_grp_train0.shape)
print(df_grp_test0.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5084691 entries, 0 to 5084690
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   int64 
 1   item_id   int64 
 2   old_user  object
 3   old_item  object
dtypes: int64(2), object(2)
memory usage: 155.2+ MB
None
(5084691, 4)
(1271173, 4)


## Merge interactions with train-test datasets

In [10]:
# Attach the rating to every (old_user, old_item) pair of the group split.
# An inner join silently drops pairs that have no matching interaction and
# duplicates pairs that match several interaction rows, so after each merge we
# check that the number of rows is unchanged.
df_train = df_grp_train0.merge(df_interactions, how='inner', on=['old_user', 'old_item'])
print(df_train.shape)
assert len(df_train) == len(df_grp_train0), 'train merge changed the number of rows'
df_test = df_grp_test0.merge(df_interactions, how='inner', on=['old_user', 'old_item'])
print(df_test.shape)
assert len(df_test) == len(df_grp_test0), 'test merge changed the number of rows'

(5084691, 5)
(1271173, 5)


In [11]:
# From here on only the integer ids and the rating are needed.
rating_columns = ['user_id', 'item_id', 'rating']
df_train = df_train.loc[:, rating_columns]
df_test = df_test.loc[:, rating_columns]

In [12]:
# Stack train and test into one frame with a fresh 0..n-1 index.
split_frames = (df_train, df_test)
df_all = pd.concat(split_frames, ignore_index=True)
print(df_all.shape)

(6355864, 3)


In [13]:
# Order both splits by user first, then by item.
sort_keys = ['user_id', 'item_id']
df_train = df_train.sort_values(sort_keys)
df_test = df_test.sort_values(sort_keys)

## Create sparse matrices

In [14]:
# Both matrices share one shape derived from the largest user / item id seen
# in train and test together (ids are used directly as row / column indices).
shape = (df_all.user_id.max() + 1, df_all.item_id.max() + 1)


def _ratings_to_coo(ratings_df):
    """Build a users x items int32 COO matrix holding the ratings of `ratings_df`."""
    coords = (ratings_df.user_id.values, ratings_df.item_id.values)
    return sps.coo_matrix((ratings_df.rating.values, coords), shape=shape, dtype=np.int32)


train = _ratings_to_coo(df_train)
test = _ratings_to_coo(df_test)
# Persist the train and test matrices on disk so the training and evaluation
# notebook can load them.
sps.save_npz('./group/train0.npz', train)
sps.save_npz('./group/test0.npz', test)

In [17]:
# Sanity check: `.values` on the rating column yields a plain NumPy array.
type(df_train.rating.values)

numpy.ndarray

## Train

In [18]:
# Instantiate the implicit-feedback matrix factorization recommender.
# NOTE(review): K=100 is presumably the number of latent factors and
# regularization=0.01 the penalty weight — confirm against
# MatrixFactorizationImplicit; neither value is justified in this notebook.
rec = MatrixFactorizationImplicit(K=100, regularization=0.01)

In [None]:
# Fit the model on the sparse train matrix, passing the test matrix as well;
# the call returns two values, stored as the train and test loss.
# NOTE(review): presumably these are per-iteration loss histories — confirm
# against MatrixFactorizationImplicit.train.
train_loss, test_loss = rec.train(train, test)

  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
# Disabled on purpose: uncomment to write each model in `recommenders` to
# ./data/model{i}.pkl (the files the load loop reads back).
# for i, rec in enumerate(recommenders, start=1):
#     rec.save(f'./data/model{i}.pkl')

In [8]:
# Restore every model from its pickle file (model1.pkl, model2.pkl, ...).
# NOTE(review): `recommenders` is not defined in any visible cell of this
# notebook, so this fails on a fresh kernel — confirm where it is built.
# NOTE(review): the loop variable rebinds the global `rec` (the model trained
# above), and the return value of load() is discarded, so this relies on
# load() restoring state in place — confirm.
for i, rec in enumerate(recommenders, start=1):
    rec.load(f'./data/model{i}.pkl')

In [9]:
# One (initially empty) list per metric; each later receives one inner list
# of per-user scores per fold.
rec_at_5, rec_at_10, ndcg_at_5, ndcg_at_10 = [], [], [], []

In [10]:
import utils
from recommenders.mfi import utils as mfi_utils

In [11]:
# Evaluate each fold's model on its held-out test matrix, collecting
# recall@5/10 and NDCG@5/10 for every user (one inner list per fold).
#
# The accumulators are (re)initialised here so that re-running this cell
# starts from scratch instead of appending a second copy of every fold.
# The loop variables are named `fold_*` so they do not overwrite the global
# `rec` model and `test` matrix.
# NOTE(review): `recommenders` and `test_sets` are not defined in any visible
# cell of this notebook — confirm where they come from.
rec_at_5, rec_at_10, ndcg_at_5, ndcg_at_10 = [], [], [], []

for fold_rec, fold_test in zip(recommenders, test_sets):
    # CSR layout so that a single user's row can be sliced out below.
    fold_conf = mfi_utils.build_conf_mat(fold_test).tocsr()
    fold_rec_5, fold_rec_10, fold_ndcg_5, fold_ndcg_10 = [], [], [], []
    for user in tqdm(range(fold_conf.shape[0])):
        # Ask for the top 10 once; the @5 metrics reuse its first five entries.
        topk = fold_rec.recommend(k=10, user=user)
        actual = fold_conf[user]
        fold_rec_5.append(utils.recall_at_k(topk[:5], actual))
        fold_rec_10.append(utils.recall_at_k(topk, actual))
        fold_ndcg_5.append(utils.ndcg_at_k(topk[:5], actual))
        fold_ndcg_10.append(utils.ndcg_at_k(topk, actual))
    rec_at_5.append(fold_rec_5)
    rec_at_10.append(fold_rec_10)
    ndcg_at_5.append(fold_ndcg_5)
    ndcg_at_10.append(fold_ndcg_10)

  0%|          | 0/148438 [00:00<?, ?it/s]

  0%|          | 0/148438 [00:00<?, ?it/s]

  0%|          | 0/148438 [00:00<?, ?it/s]

  0%|          | 0/148438 [00:00<?, ?it/s]

  0%|          | 0/148438 [00:00<?, ?it/s]

In [19]:
import pickle

# Persist the raw per-user scores of all four metrics in one pickle file, as a
# list in the order [recall@5, recall@10, ndcg@5, ndcg@10].
all_metrics = [rec_at_5, rec_at_10, ndcg_at_5, ndcg_at_10]
with open('./data/metrics.pkl', 'wb') as metrics_file:
    pickle.dump(all_metrics, metrics_file)

In [15]:
# Mean recall@5 per fold, followed by the mean and standard deviation
# (NumPy default, ddof=0) of those per-fold means.
print('AVG. RECALL @ 5')
rec_5_avgs = [np.mean(fold_scores) for fold_scores in rec_at_5]
for fold_no, fold_avg in enumerate(rec_5_avgs, start=1):
    print(f'\tFOLD {fold_no}: {fold_avg}')
overall_mean = np.mean(rec_5_avgs)
overall_std = np.std(rec_5_avgs)
print(f'AVG. OVER FOLDS: {overall_mean} \u00B1 {overall_std}')

AVG. RECALL @ 5
	FOLD 1: 0.17638614366079036
	FOLD 2: 0.17616829631984882
	FOLD 3: 0.1871915122314225
	FOLD 4: 0.17272039592558183
	FOLD 5: 0.18201576306917258
AVG. OVER FOLDS: 0.1788964222413632 ± 0.005107372627724864


In [16]:
# Mean recall@10 per fold, followed by the mean and standard deviation
# (NumPy default, ddof=0) of those per-fold means.
print('AVG. RECALL @ 10')
rec_10_avgs = [np.mean(fold_scores) for fold_scores in rec_at_10]
for fold_no, fold_avg in enumerate(rec_10_avgs, start=1):
    print(f'\tFOLD {fold_no}: {fold_avg}')
overall_mean = np.mean(rec_10_avgs)
overall_std = np.std(rec_10_avgs)
print(f'AVG. OVER FOLDS: {overall_mean} \u00B1 {overall_std}')

AVG. RECALL @ 10
	FOLD 1: 0.24681410601864756
	FOLD 2: 0.24431465847574663
	FOLD 3: 0.2577641470850341
	FOLD 4: 0.2402187986697141
	FOLD 5: 0.25002586470338695
AVG. OVER FOLDS: 0.24782751499050587 ± 0.0059131907043481185


In [17]:
# Mean NDCG@5 per fold, followed by the mean and standard deviation
# (NumPy default, ddof=0) of those per-fold means.
print('AVG. NDCG @ 5')
ndcg_5_avgs = [np.mean(fold_scores) for fold_scores in ndcg_at_5]
for fold_no, fold_avg in enumerate(ndcg_5_avgs, start=1):
    print(f'\tFOLD {fold_no}: {fold_avg}')
overall_mean = np.mean(ndcg_5_avgs)
overall_std = np.std(ndcg_5_avgs)
print(f'AVG. OVER FOLDS: {overall_mean} \u00B1 {overall_std}')

AVG. NDCG @ 5
	FOLD 1: 0.3315721916927829
	FOLD 2: 0.3361803089961092
	FOLD 3: 0.3476064521306807
	FOLD 4: 0.33267275557732723
	FOLD 5: 0.33961901669804556
AVG. OVER FOLDS: 0.3375301450189891 ± 0.005775964806239298


In [18]:
# Mean NDCG@10 per fold, followed by the mean and standard deviation
# (NumPy default, ddof=0) of those per-fold means.
print('AVG. NDCG @ 10')
ndcg_10_avgs = [np.mean(fold_scores) for fold_scores in ndcg_at_10]
for fold_no, fold_avg in enumerate(ndcg_10_avgs, start=1):
    print(f'\tFOLD {fold_no}: {fold_avg}')
overall_mean = np.mean(ndcg_10_avgs)
overall_std = np.std(ndcg_10_avgs)
print(f'AVG. OVER FOLDS: {overall_mean} \u00B1 {overall_std}')

AVG. NDCG @ 10
	FOLD 1: 0.36782502961300007
	FOLD 2: 0.37296343555370093
	FOLD 3: 0.38314175391352934
	FOLD 4: 0.3677694763559074
	FOLD 5: 0.3756627568133843
AVG. OVER FOLDS: 0.3734724904499044 ± 0.005709102262528303
