In [None]:
# !pip install recommenders
# !pip install tf_slim

In [None]:
import sys
import os
import shutil
# import papermill as pm
# import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.utils.constants import SEED as DEFAULT_SEED


# Report the interpreter and key library versions for reproducibility.
version_report = (
    ("System version: {}", sys.version),
    ("Pandas version: {}", pd.__version__),
    ("Tensorflow version: {}", tf.__version__),
)
for template, value in version_report:
    print(template.format(value))

System version: 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
Pandas version: 1.3.5
Tensorflow version: 2.8.2


In [None]:
# Number of top-ranked items to recommend per user; also the cutoff k
# passed to the ranking metrics and the hit-ratio loop further down.
TOP_K = 10

# NOTE(review): the two commented lines below look like a leftover from a
# MovieLens example; nothing in this notebook reads MOVIELENS_DATA_SIZE.
# # Select MovieLens data size: 100k, 1m, 10m, or 20m
# MOVIELENS_DATA_SIZE = '100k'

# Model parameters (shared by every NCF model built in this notebook)
EPOCHS = 10 # was 100 -- presumably lowered for faster runs; confirm
BATCH_SIZE = 256

SEED = DEFAULT_SEED  # Set None for non-deterministic results

In [None]:
# Directory holding the input CSV and the train/test files written later
# (presumably a Colab Google Drive mount -- confirm). The trailing '/' is
# required: other cells build file names with plain `path + name`.
path = '/content/drive/MyDrive/쿠아이/쿠아이 컨퍼런스/2022 하계/'

In [None]:
# Load the review/interaction table from the data directory.
interactions_csv = path + 'Grocery_and_Gourmet_Food_over_30_5_interaction.csv'
df = pd.read_csv(interactions_csv)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,asin,overall,verified,reviewTime,reviewerID,reviewerName,reviewText,summary,unixReviewTime,vote,style,image,count
0,0,B00BUKL666,4.0,True,"06 28, 2018",APC57NVOKOO8J,Diana,I will order again if price is reasonable.,Four Stars,1530144000,,,,94.0
1,1,B00BUKL666,5.0,True,"05 5, 2018",A1J2G68M0POA10,DrJohn,I enjoy many of the KIND bars.\nTried the Blue...,Super Good,1525478400,,,,30.0
2,2,B00BUKL666,5.0,True,"04 18, 2018",AVO94A66EQGFH,snorkie,Love these bars!,Best price on the market and store,1524009600,,,,53.0
3,3,B00BUKL666,4.0,True,"04 5, 2018",A3V0NOZORAMPUW,Chris,good,Four Stars,1522886400,,,,39.0
4,4,B00BUKL666,5.0,True,"03 29, 2018",A35BVXK8OTLZCH,JR,Kind bars are my fave low sugar bar.,Fave snack bar,1522281600,,,,60.0


In [None]:
# Keep only the four columns the recommender needs (user, item, rating,
# timestamp). The explicit .copy() makes the result an independent frame, so
# the in-place rename and column assignments in later cells no longer raise
# pandas' SettingWithCopyWarning.
df = df[['reviewerID', 'asin', 'overall', 'unixReviewTime']].copy()

In [None]:
# Rename the columns to userID / itemID / rating / timestamp, the names the
# rest of this notebook uses. Rebinding `df` to the returned frame (instead
# of inplace=True on a sliced frame) avoids the SettingWithCopyWarning.
df = df.rename(
    columns={
        'reviewerID': 'userID',
        'asin': 'itemID',
        'overall': 'rating',
        'unixReviewTime': 'timestamp',
    }
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Distinct user IDs in order of first appearance; the position of an ID in
# this list becomes its integer code in the next cell.
temp_list = df['userID'].unique().tolist()

In [None]:
# Re-encode user IDs as dense integers: each ID is replaced by its position
# in `temp_list`. A dict lookup gives the same codes as temp_list.index(x)
# but avoids rescanning the whole list for every row (quadratic on large data).
user_index = {raw_id: code for code, raw_id in enumerate(temp_list)}
df['userID'] = df.userID.map(user_index)

In [None]:
# Distinct item IDs in order of first appearance; reuses `temp_list`, so the
# user list built earlier is overwritten here.
temp_list = df['itemID'].unique().tolist()

In [None]:
# Re-encode item IDs as dense integers: each ID is replaced by its position
# in `temp_list`. A dict lookup gives the same codes as temp_list.index(x)
# but avoids rescanning the whole list for every row (quadratic on large data).
item_index = {raw_id: code for code, raw_id in enumerate(temp_list)}
df['itemID'] = df.itemID.map(item_index)

In [None]:
# Chronological split: 75% of the interactions go to train, the rest to test.
train, test = python_chrono_split(df, ratio=0.75)

In [None]:
# Keep only test rows whose user AND item both occur in the training set.
known_users = train["userID"].unique()
known_items = train["itemID"].unique()
in_train = test["userID"].isin(known_users) & test["itemID"].isin(known_items)
test = test[in_train]

In [None]:
# One row per user: the last test row of each user group, with userID
# restored as a regular column.
test_by_user = test.groupby("userID")
leave_one_out_test = test_by_user.last().reset_index()

In [None]:
# File locations for the three splits (the NCF dataset is built from files).
train_file, test_file, leave_one_out_test_file = (
    path + file_name
    for file_name in ("ncf_train.csv", "ncf_test.csv", "leave_one_out_test.csv")
)

# Write each split without the pandas index column.
split_outputs = (
    (train, train_file),
    (test, test_file),
    (leave_one_out_test, leave_one_out_test_file),
)
for frame, target_file in split_outputs:
    frame.to_csv(target_file, index=False)

In [None]:
# Build the NCF dataset from the train file and the leave-one-out test file.
dataset_options = dict(
    train_file=train_file,
    test_file=leave_one_out_test_file,
    seed=SEED,
    overwrite_test_file_full=True,
)
data = NCFDataset(**dataset_options)

100%|██████████| 2730/2730 [00:45<00:00, 59.86it/s]


In [None]:
# NeuMF model sized from the dataset; training settings come from the
# configuration cell.
neumf_params = {
    "n_users": data.n_users,
    "n_items": data.n_items,
    "model_type": "NeuMF",
    "n_factors": 4,
    "layer_sizes": [16, 8, 4],
    "n_epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": 1e-3,
    "verbose": 10,
    "seed": SEED,
}
model = NCF(**neumf_params)



In [None]:
# Train the model and measure wall-clock time.
with Timer() as train_time:
    model.fit(data)

training_message = "Took {} seconds for training.".format(train_time.interval)
print(training_message)

Took 284.266713721 seconds for training.


In [None]:
# Score every (user, item) pair of the test set in a single batched call.
# The earlier row-by-row version used DataFrame.iterrows(), which upcast the
# integer IDs to float (visible as 0.0 / 933.0 in the output) and invoked the
# model once per row; predict(..., is_list=True) is the same call the later
# cells use for bulk scoring.
test_users = test["userID"].tolist()
test_items = test["itemID"].tolist()

predictions = pd.DataFrame(
    {
        "userID": test_users,
        "itemID": test_items,
        "prediction": model.predict(test_users, test_items, is_list=True),
    }
)
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,0.0,933.0,0.276515
1,0.0,972.0,0.24459
2,0.0,3597.0,0.050561
3,0.0,4995.0,0.004221
4,0.0,5196.0,0.006697


In [None]:
with Timer() as test_time:

    # Candidate pool: every item that occurs in the training data.
    item = list(train.itemID.unique())
    n_candidates = len(item)

    # Score the full candidate pool for each training user, one user at a time.
    users, items, preds = [], [], []
    for user_id in train.userID.unique():
        user = [user_id] * n_candidates
        users += user
        items += item
        preds += list(model.predict(user, item, is_list=True))

    all_predictions = pd.DataFrame(
        data={"userID": users, "itemID": items, "prediction": preds}
    )

    # Outer-join with train: pairs that have no rating were not seen in
    # training, and only those are kept as recommendation candidates.
    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    unseen = merged.rating.isnull()
    all_predictions = merged[unseen].drop(columns='rating')

print("Took {} seconds for prediction.".format(test_time.interval))

Took 55.44416801900002 seconds for prediction.


In [None]:
# Top-K ranking metrics of the unseen-item predictions against the test set.
ranking_args = dict(col_prediction='prediction', k=TOP_K)

eval_map = map_at_k(test, all_predictions, **ranking_args)
eval_ndcg = ndcg_at_k(test, all_predictions, **ranking_args)
eval_precision = precision_at_k(test, all_predictions, **ranking_args)
eval_recall = recall_at_k(test, all_predictions, **ranking_args)

metric_lines = (
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
)
for metric_line in metric_lines:
    print(metric_line)

MAP:	0.012496
NDCG:	0.023233
Precision@K:	0.016337
Recall@K:	0.020406


In [None]:
# Leave-one-out evaluation: hit ratio (HR@k) and NDCG@k over the batches
# produced by data.test_loader().
# NOTE(review): the ranking below treats the first entry of each batch as the
# held-out positive item -- confirm against NCFDataset.test_loader.
k = TOP_K

ndcgs = []
hit_ratio = []

for b in data.test_loader():
    user_input, item_input, labels = b
    output = model.predict(user_input, item_input, is_list=True)

    # Flatten to 1-D; reshape(-1) also copes with a single-element batch,
    # where np.squeeze would return a 0-d array that cannot be indexed.
    output = np.asarray(output).reshape(-1)

    # Rank of the first entry = number of candidates scored at least as high
    # as it (itself included), so the best possible rank is 1.
    rank = int(np.sum(output >= output[0]))
    if rank <= k:
        # NDCG discount uses log base 2, so a hit at rank 1 scores exactly 1.
        # The natural log used previously gave 1/ln(2) ~= 1.44 at rank 1,
        # letting the averaged "NDCG" exceed its maximum of 1.
        ndcgs.append(1 / np.log2(rank + 1))
        hit_ratio.append(1)
    else:
        ndcgs.append(0)
        hit_ratio.append(0)

eval_ndcg = np.mean(ndcgs)
eval_hr = np.mean(hit_ratio)

print("HR:\t%f" % eval_hr)
print("NDCG:\t%f" % eval_ndcg)

HR:	0.277289
NDCG:	0.208330


In [None]:
# GMF model; it is trained and saved in the next cell so that its weights
# can later be loaded into a NeuMF model.
gmf_params = dict(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="GMF",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED,
)
model = NCF(**gmf_params)



In [None]:
# Train the GMF model and measure wall-clock time.
with Timer() as train_time:
    model.fit(data)

training_message = "Took {} seconds for training.".format(train_time.interval)
print(training_message)

# Save the trained GMF model so a later cell can load it as pretrained weights.
model.save(dir_name=".pretrain/GMF")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [28.41s]: train_loss = 0.355957 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [26.67s]: train_loss = 0.310420 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [26.57s]: train_loss = 0.256258 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [27.86s]: train_loss = 0.228610 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [28.14s]: train_loss = 0.218047 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 60 [26.68s]: train_loss = 0.212929 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 70 [26.73s]: train_loss = 0.209210 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 80 [27.89s]: train_loss = 0.204480 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 90 [27.37s]: train_loss = 0.202833 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 100 [27.90s]: train_loss = 0.200878 


Took 2753.068003332999 seconds for training.


In [None]:
# MLP model; it is trained and saved in the next cell so that its weights
# can later be loaded into a NeuMF model.
mlp_params = dict(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="MLP",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED,
)
model = NCF(**mlp_params)



In [None]:
# Train the MLP model and measure wall-clock time.
with Timer() as train_time:
    model.fit(data)

training_message = "Took {} seconds for training.".format(train_time.interval)
print(training_message)

# Save the trained MLP model so a later cell can load it as pretrained weights.
model.save(dir_name=".pretrain/MLP")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [30.76s]: train_loss = 0.668815 


Took 315.30126556899995 seconds for training.


In [None]:
# NeuMF model using the same n_factors and layer_sizes as the GMF and MLP
# models saved under .pretrain/.
pretrained_neumf_params = {
    "n_users": data.n_users,
    "n_items": data.n_items,
    "model_type": "NeuMF",
    "n_factors": 4,
    "layer_sizes": [16, 8, 4],
    "n_epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": 1e-3,
    "verbose": 10,
    "seed": SEED,
}
model = NCF(**pretrained_neumf_params)

# Initialise from the saved GMF and MLP models.
# NOTE(review): alpha presumably balances the GMF and MLP contributions --
# confirm against the NCF.load documentation.
model.load(gmf_dir=".pretrain/GMF", mlp_dir=".pretrain/MLP", alpha=0.5)



In [None]:
# Continue training the NeuMF model after loading the saved GMF/MLP weights,
# and measure wall-clock time.
with Timer() as train_time:
    model.fit(data)

training_message = "Took {} seconds for training.".format(train_time.interval)
print(training_message)

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [27.83s]: train_loss = 0.322974 


Took 322.6565955430001 seconds for training.


In [None]:
with Timer() as test_time:

    # Candidate pool: every item that occurs in the training data.
    item = list(train.itemID.unique())
    n_candidates = len(item)

    # Score the full candidate pool for each training user, one user at a time.
    users, items, preds = [], [], []
    for user_id in train.userID.unique():
        user = [user_id] * n_candidates
        users += user
        items += item
        preds += list(model.predict(user, item, is_list=True))

    all_predictions = pd.DataFrame(
        data={"userID": users, "itemID": items, "prediction": preds}
    )

    # Outer-join with train: pairs that have no rating were not seen in
    # training, and only those are kept as recommendation candidates.
    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    unseen = merged.rating.isnull()
    all_predictions = merged[unseen].drop(columns='rating')

print("Took {} seconds for prediction.".format(test_time.interval))

Took 18.270368184999825 seconds for prediction.


In [None]:
# Top-K ranking metrics for the pretrained-initialised NeuMF model; stored
# under *2 names so the first run's results stay available for comparison.
ranking_args = dict(col_prediction='prediction', k=TOP_K)

eval_map2 = map_at_k(test, all_predictions, **ranking_args)
eval_ndcg2 = ndcg_at_k(test, all_predictions, **ranking_args)
eval_precision2 = precision_at_k(test, all_predictions, **ranking_args)
eval_recall2 = recall_at_k(test, all_predictions, **ranking_args)

metric_lines = (
    "MAP:\t%f" % eval_map2,
    "NDCG:\t%f" % eval_ndcg2,
    "Precision@K:\t%f" % eval_precision2,
    "Recall@K:\t%f" % eval_recall2,
)
for metric_line in metric_lines:
    print(metric_line)

MAP:	0.123830
NDCG:	0.135159
Precision@K:	0.032229
Recall@K:	0.153862
