# RippleNet

In [1]:
import sys
sys.path.append("../../")
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import argparse 
from reco_utils.evaluation.python_evaluation import auc
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split

from reco_utils.recommender.ripplenet.preprocess import (read_item_index_to_entity_id_file, 
                                         convert_rating, 
                                         convert_kg)

from reco_utils.recommender.ripplenet.data_loader import (
                                         dataset_split,
                                         load_kg, 
                                         get_ripple_set)

from reco_utils.recommender.ripplenet.train import (fit, predict)

from reco_utils.recommender.ripplenet.model import RippleNet

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Pandas version: 0.25.1


In [2]:
# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'
# Ripple parameters
n_epoch = 10 #the number of epochs
batch_size = 1024 #batch size
dim = 16 #dimension of entity and relation embeddings
n_hop = 2 #maximum hops
kge_weight = 0.01 #weight of the KGE term
l2_weight = 1e-7 #weight of the l2 regularization term
lr = 0.02 #learning rate
n_memory = 32 #size of ripple set for each hop
item_update_mode = 'plus_transform' #how to update item at the end of each hop
using_all_hops = True #whether using outputs of all hops or just the last hop when making prediction

## Read original data and transform entity ids to numerical

In [3]:
kg_original = pd.read_csv("https://recodatasets.blob.core.windows.net/wikidata/movielens_{}_wikidata.csv".format(MOVIELENS_DATA_SIZE))
ratings_original = movielens.load_pandas_df(MOVIELENS_DATA_SIZE,
                              ('UserId', 'ItemId', 'Rating', 'Timestamp'),
                             title_col='Title',
                             genres_col='Genres',
                             year_col='Year')
rating_threshold = 4

100%|██████████| 4.81k/4.81k [00:01<00:00, 3.20kKB/s]


In [4]:
def transform_id(df, entities_id, col_transform, col_name = "unified_id"):
    df = df.merge(entities_id, left_on = col_transform, right_on = "entity")
    df = df.rename(columns = {"unified_id": col_name})
    return df.drop(columns = [col_transform, "entity"])

In [5]:
var_id = "movielens_id"
entities_id = pd.DataFrame({"entity":list(set(kg_original.original_entity)) + list(set(kg_original.linked_entities))}).reset_index()
entities_id = entities_id.rename(columns = {"index": "unified_id"})

item_to_entity = kg_original[[var_id, "original_entity"]].drop_duplicates().reset_index().drop(columns = "index")
item_to_entity = transform_id(item_to_entity, entities_id, "original_entity")

In [6]:
kg = kg_original[["original_entity", "linked_entities"]].drop_duplicates()
kg = transform_id(kg, entities_id, "original_entity", "original_entity_id")
kg = transform_id(kg, entities_id, "linked_entities", "linked_entities_id")
kg["relation"] = 1
kg_wikidata = kg[["original_entity_id","relation", "linked_entities_id"]]

In [7]:
vars_movielens = ["UserId", "ItemId", "Rating", "Timestamp"]
ratings = ratings_original[vars_movielens].sort_values(vars_movielens[1])

## Preprocess module from RippleNet

In [8]:
item_index_old2new, entity_id2index = read_item_index_to_entity_id_file(item_to_entity)

In [9]:
ratings_final = convert_rating(ratings, item_index_old2new = item_index_old2new, threshold = rating_threshold)

converting rating file ...
number of users: 942
number of items: 1677


In [10]:
kg_final = convert_kg(kg_wikidata, entity_id2index = entity_id2index)

converting kg file ...
number of entities (containing items): 22994
number of relations: 1


## Split Data

In [11]:
train_data, test_eval_data = python_stratified_split(ratings_final, ratio=0.6, col_user='user_index', col_item='item', seed=12)
test_data, eval_data = python_stratified_split(ratings_final, ratio=0.5, col_user='user_index', col_item='item', seed=12)

In [12]:
user_history_dict = train_data.loc[train_data.rating == 1].groupby('user_index')['item'].apply(list).to_dict()

In [13]:
n_entity, n_relation, kg = load_kg(kg_final)
ripple_set = get_ripple_set(kg, user_history_dict, n_hop=n_hop, n_memory=n_memory)

reading KG file ...
constructing knowledge graph ...
constructing ripple set ...


In [None]:
feed_dict = dict()
for user in data[start:end, 0]:
    feed_dict[model.memories_h[i]] = ripple_set[user][i][0]

## Build model

In [None]:
show_loss = False

In [None]:
ripple = RippleNet(dim=dim,n_hop=n_hop,
                   kge_weight=kge_weight, l2_weight=l2_weight, lr=lr,
                   n_memory=n_memory,
                   item_update_mode=item_update_mode, using_all_hops=using_all_hops,
                   n_entity=n_entity,n_relation=n_relation)

with tf.Session() as sess:
    model = fit(sess=sess, 
                n_epoch=n_epoch, batch_size=batch_size,n_hop=n_hop,
                model=ripple, train_data=train_data.to_numpy(), 
                ripple_set=ripple_set, show_loss=show_loss)
    labels, scores = predict(sess=sess, 
                             batch_size=batch_size, n_hop=n_hop, 
                             model=model, data=test_data.to_numpy(),
                             ripple_set=ripple_set)

predictions = [1 if i >= 0.5 else 0 for i in scores]

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true=labels, y_score=scores)