Importing libraries

In [1]:
import scipy.sparse as sp
import numpy as np
import os
import math
import heapq
import logging
import math
import os
import easydict
from argparse import ArgumentParser
from time import time

from keras import backend as K
from keras import optimizers
from keras.layers import Input, Dense, Lambda
from keras.models import Model

Using TensorFlow backend.


Downloading the small dataset

In [2]:
!wget "https://drive.google.com/uc?export=download&id=1bEDAdHZFH3_UukpEpx9WC1P0NJDVRSI8" -O ratings.dat

--2020-05-12 06:28:07--  https://drive.google.com/uc?export=download&id=1bEDAdHZFH3_UukpEpx9WC1P0NJDVRSI8
Resolving drive.google.com (drive.google.com)... 172.217.14.78, 2607:f8b0:4007:80b::200e
Connecting to drive.google.com (drive.google.com)|172.217.14.78|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-14-08-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/f92uro2cv4mmbnhbujc119pq0osomi6m/1589264850000/08871784378030230384/*/1bEDAdHZFH3_UukpEpx9WC1P0NJDVRSI8?e=download [following]
--2020-05-12 06:28:08--  https://doc-14-08-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/f92uro2cv4mmbnhbujc119pq0osomi6m/1589264850000/08871784378030230384/*/1bEDAdHZFH3_UukpEpx9WC1P0NJDVRSI8?e=download
Resolving doc-14-08-docs.googleusercontent.com (doc-14-08-docs.googleusercontent.com)... 172.217.5.193, 2607:f8b0:4007:80d::2001
Connecting to doc-14-08-docs.googleusercontent.com (doc-14-08-docs.goo

Configuration: change the run arguments (hyperparameters) in the cell below

In [0]:
    args = easydict.EasyDict({
    # Run configuration read by the training cell. Values are accessed as
    # attributes (args.epochs, args.lr, ...).
    # Tag that is only used when building the model output file name.
    "dataset": "2k",   
    # Dense layer widths of the user tower, stored as a string and parsed
    # with eval() in the training cell.
    "user_layers": "[264,64]",
    # Dense layer widths of the item tower, parsed the same way.
    "item_layers": "[512,64]",
    # Number of passes of the outer training loop.
    "epochs": 1,
    # Learning rate handed to the Adam optimizer.
    "lr": 0.001,
    # Number of (user, item) pairs per generated training batch.
    "batch_size": 256,
    # Negative items sampled for every observed training rating.
    "num_neg": 7,
    # Length k of the ranked list used for the hit-ratio / NDCG evaluation.
    "topN": 10
})

Reading the file

In [0]:
class DataSet(object):
    """Rating data set with a leave-one-out train/test split.

    Reads ``ratings.dat`` (fields separated by ``::``) from the working
    directory and exposes:

    * ``data_list``      -- raw ``[user, item, rating, timestamp]`` rows,
    * ``num_users`` / ``num_items`` / ``max_rate`` -- maxima seen in the file,
    * ``train`` / ``test`` -- ``(user - 1, item - 1, rating)`` tuples,
    * ``data_matrix``    -- dense ``(num_users, num_items)`` matrix of the
      train ratings (0 where there is no train rating),
    * ``test_ratings`` / ``test_negatives`` -- evaluation instances.

    The code sizes the matrix by the largest id and subtracts 1 from every id,
    i.e. it implicitly treats the ids in the file as 1-based.
    """

    def __init__(self):
        filename = 'ratings.dat'
        data_separator = '::'

        self.data_list, self.num_users, self.num_items, self.max_rate = \
            self.load_rating_file_as_list(filename, separator=data_separator)
        self.train, self.test = self.get_train_test()
        self.data_matrix = self.get_data_matrix()
        self.test_ratings, self.test_negatives = self.get_test_instances()

    @staticmethod
    def load_rating_file_as_list(filename, separator='\t'):
        """Parse a rating file into a list of rows.

        Each non-blank line must hold ``user<sep>item<sep>rating<sep>timestamp``.

        Returns:
            ``(data, num_users, num_items, max_rate)`` where ``data`` is a list
            of ``[user, item, rating, timestamp]`` and the other three values
            are the largest user id, item id and rating seen (0 if no rows).

        Raises:
            ValueError / IndexError: if a non-blank line is malformed.
        """
        print('loading rating file: %s...' % filename)
        data = []
        num_users, num_items, max_rate = 0, 0, 0
        with open(filename, 'r') as file:
            for line in file:
                line = line.strip()
                # Lines read from a file still carry their newline, so blank
                # lines must be detected after stripping; skip them instead of
                # failing in int('').
                if not line:
                    continue
                arr = line.split(separator)
                u, i, rating, timestamp = int(arr[0]), int(arr[1]), float(arr[2]), int(arr[3])
                data.append([u, i, rating, timestamp])
                num_users = max(num_users, u)
                num_items = max(num_items, i)
                max_rate = max(max_rate, rating)
        print('number of users: %d, number of items: %d' % (num_users, num_items))
        return data, num_users, num_items, max_rate

    def get_data_matrix(self):
        """Build the dense ``(num_users, num_items)`` matrix of train ratings.

        If the same (user, item) pair occurs several times in ``self.train``,
        the last occurrence wins.
        """
        mat = np.zeros((self.num_users, self.num_items))
        for user, item, rating in self.train:
            mat[user, item] = rating
        return mat

    def get_train_test(self):
        """Split ``data_list`` leave-one-out style.

        Rows are sorted by (user, timestamp); the last row of every user goes
        to the test set, everything else to the train set. Ids are shifted by
        -1 in both outputs.

        Returns:
            ``(train, test)`` lists of ``(user, item, rating)`` tuples; both
            are empty when there is no data.
        """
        print('splitting train and test data...')
        data = sorted(self.data_list, key=lambda x: (x[0], x[3]))
        if not data:
            return [], []

        train = []
        test = []
        last = len(data) - 1
        for pos, row in enumerate(data):
            user, item, rating = row[0], row[1], row[2]
            # A row is the user's latest one when the next row belongs to
            # somebody else (or there is no next row).
            is_last_of_user = pos == last or data[pos + 1][0] != user
            target = test if is_last_of_user else train
            target.append((user - 1, item - 1, rating))
        return train, test

    def get_train_instances(self, num_negatives):
        """Expand the train set with randomly sampled negatives.

        For every train rating the observed triple is emitted, followed by up
        to ``num_negatives`` distinct items the user has no train rating for,
        each labelled 0. If a user has fewer such items than requested, only
        the available ones are emitted (instead of looping forever).

        Returns:
            ``(user_input, item_input, ratings)`` parallel lists.
        """
        print('getting train instances...')
        user_input = []
        item_input = []
        ratings = []
        # Number of zero cells per user row, computed once so the sampling
        # loop below is guaranteed to terminate.
        free_per_user = self.num_items - np.count_nonzero(self.data_matrix, axis=1)
        for u, item, rating in self.train:
            user_input.append(u)
            item_input.append(item)
            ratings.append(rating)

            # negative samples
            wanted = min(num_negatives, int(free_per_user[u]))
            drawn = set()
            while len(drawn) < wanted:
                j = np.random.randint(self.num_items)
                if self.data_matrix[u, j] == 0 and j not in drawn:
                    drawn.add(j)
                    user_input.append(u)
                    item_input.append(j)
                    ratings.append(0)
        return user_input, item_input, ratings

    def get_test_instances(self, num_negatives=100):
        """Build the evaluation instances.

        Re-seeds NumPy's global generator with 34, then for every test triple
        samples up to ``num_negatives`` distinct items that have no train
        rating from that user and are not the held-out test item itself
        (the test item is absent from ``data_matrix``, so it has to be
        excluded explicitly). Fewer negatives are returned when the user does
        not have enough candidates.

        Returns:
            ``(test_ratings, test_negatives)``: a list of
            ``[user, item, rating]`` and a parallel list of negative-item lists.
        """
        print('getting test instances...')
        np.random.seed(34)
        test_ratings = []
        test_negatives = []
        for u, gt_item, rating in self.test:
            test_ratings.append([u, gt_item, rating])

            row = self.data_matrix[u]
            candidates = self.num_items - int(np.count_nonzero(row))
            if row[gt_item] == 0:
                candidates -= 1  # the held-out item is not a valid negative
            wanted = min(num_negatives, candidates)

            # negative samples
            negative = []
            seen = set()
            while len(negative) < wanted:
                j = np.random.randint(self.num_items)
                if j != gt_item and row[j] == 0 and j not in seen:
                    seen.add(j)
                    negative.append(j)
            test_negatives.append(negative)
        return test_ratings, test_negatives


Evaluation

In [0]:
# Module-level state shared by evaluate_model() and _evaluate_one_rating().
# evaluate_model() overwrites all four names on every call.
_model = None  # object whose predict() scores the candidate items
_test_ratings = None  # per test case: indexable with user at [0], item at [1]
_test_negatives = None  # per test case: list of negative item indices
_data_matrix = None  # 2-D array indexed as [user, item]


def evaluate_model(model, test_ratings, test_negatives, data_matrix, k):
    """Score every test case and collect the per-case top-k metrics.

    The arguments are first published through the module-level ``_model``,
    ``_test_ratings``, ``_test_negatives`` and ``_data_matrix`` names, which
    is where ``_evaluate_one_rating`` reads them from.

    Returns:
        ``(hits, ndcgs)``: two lists with one hit-ratio and one NDCG value per
        entry of ``test_ratings``, in the same order.
    """
    global _model, _test_ratings, _test_negatives, _data_matrix

    _model, _test_ratings = model, test_ratings
    _test_negatives, _data_matrix = test_negatives, data_matrix

    per_case = [_evaluate_one_rating(pos, k=k) for pos in range(len(_test_ratings))]
    hits = [hit for hit, _ in per_case]
    ndcgs = [gain for _, gain in per_case]
    return hits, ndcgs


def _evaluate_one_rating(idx, k):
    """Rank one test item against its negatives and compute HR@k / NDCG@k.

    Reads the module-level ``_model``, ``_test_ratings``, ``_test_negatives``
    and ``_data_matrix`` set by ``evaluate_model``.

    Args:
        idx: position of the test case in ``_test_ratings`` / ``_test_negatives``.
        k: length of the ranked list.

    Returns:
        ``(hr, ndcg)`` for this test case.
    """
    rating = _test_ratings[idx]
    user, gt_item = rating[0], rating[1]

    # Work on a copy: the shared negatives list is no longer appended to and
    # popped from, so it stays intact even if predict() raises.
    candidates = list(_test_negatives[idx]) + [gt_item]

    user_row = _data_matrix[user]
    users_input = np.array([user_row] * len(candidates))
    items_input = np.array([_data_matrix[:, item] for item in candidates])

    # One batch holding every candidate (101 rows with the default 100 negatives).
    predictions = _model.predict([users_input, items_input],
                                 batch_size=len(candidates),
                                 verbose=0)

    # Store plain floats so ranking does not depend on comparing 1-element arrays.
    map_item_score = {
        item: float(np.ravel(score)[0])
        for item, score in zip(candidates, predictions)
    }

    rank_list = heapq.nlargest(k, map_item_score, key=map_item_score.get)
    hr = get_hit_ratio(rank_list, gt_item)
    ndcg = get_ndcg(rank_list, gt_item)
    return hr, ndcg


def get_hit_ratio(rank_list, gt_item):
    """Return 1 when ``gt_item`` occurs in ``rank_list``, otherwise 0."""
    return int(gt_item in rank_list)


def get_ndcg(rank_list, gt_item):
    """Return ``log(2) / log(position + 2)`` for the first 0-based position of
    ``gt_item`` in ``rank_list``, or 0 when it is not in the list."""
    position = next(
        (pos for pos, candidate in enumerate(rank_list) if candidate == gt_item),
        None,
    )
    if position is None:
        return 0
    return math.log(2) / math.log(position + 2)


The model

In [0]:
class DMF(object):
    """Two-tower model: a stack of Dense layers for the user input and one for
    the item input, combined by a clipped cosine similarity.

    Args:
        num_users: width of the item-side input vector.
        num_items: width of the user-side input vector.
        user_layers: Dense layer widths of the user tower, first to last.
        item_layers: Dense layer widths of the item tower, first to last.
        lr: learning rate for the Adam optimizer.
        max_rate: optional value stored on ``self.max_rate``. When omitted,
            the value is taken from a module-level ``dataset`` object if one
            exists (the previous behaviour), else it is ``None``; the class no
            longer fails when that global has not been created yet.
    """

    def __init__(self,
                 num_users,
                 num_items,
                 user_layers,
                 item_layers,
                 lr,
                 max_rate=None):
        if max_rate is None:
            # Backward compatibility with callers that rely on the notebook
            # global `dataset`; a missing global is tolerated.
            max_rate = getattr(globals().get('dataset'), 'max_rate', None)
        self.max_rate = max_rate
        self.num_users = num_users
        self.num_items = num_items
        self.user_layers = user_layers
        self.item_layers = item_layers
        self.lr = lr

    @staticmethod
    def init_normal(shape, dtype=None):
        """Initializer drawing from a normal distribution with stddev 0.01."""
        return K.random_normal(shape=shape, stddev=0.01, dtype=dtype)

    def mse(self, y_true, y_pred):
        """Mean squared error over all elements.

        NOTE(review): predictions from cosine_similarity_relu are clipped from
        below at 1e-6 and a cosine cannot exceed 1, while self.max_rate is
        stored but not used here -- confirm whether the targets are meant to
        be scaled by it.
        """
        loss = (y_true - y_pred)**2
        return K.mean(loss)

    def cosine_similarity_relu(self, inputs):
        """Cosine similarity of the two input tensors, floored at 1e-6.

        NOTE(review): there is no guard on the denominator; an all-zero
        vector on either side gives 0/0 -- confirm this cannot happen.
        """
        x, y = inputs[0], inputs[1]
        vec = K.batch_dot(x, y) / (K.sqrt(K.batch_dot(x, x) * K.batch_dot(y, y)))
        return K.maximum(vec, 1.0e-6)

    def _build_tower(self, tensor, layer_sizes, prefix):
        """Chain one ReLU Dense layer per entry of ``layer_sizes`` onto ``tensor``."""
        for depth, units in enumerate(layer_sizes, start=1):
            tensor = Dense(units,
                           activation='relu',
                           kernel_initializer=self.init_normal,
                           bias_initializer=self.init_normal,
                           name='%s_layer%d' % (prefix, depth))(tensor)
        return tensor

    def get_model(self):
        """Build and compile the Keras model.

        The model takes ``[user_input, item_input]`` (widths ``num_items`` and
        ``num_users`` respectively) and outputs the clipped cosine similarity
        of the two tower outputs. It is compiled with Adam and ``self.mse``.
        """
        user_input = Input(shape=(self.num_items,), dtype='float32', name='user_input')
        item_input = Input(shape=(self.num_users,), dtype='float32', name='item_input')

        user_vector = self._build_tower(user_input, self.user_layers, 'user')
        item_vector = self._build_tower(item_input, self.item_layers, 'item')

        y_predict = Lambda(function=self.cosine_similarity_relu, name='predict')([user_vector, item_vector])
        model = Model(inputs=[user_input, item_input], outputs=y_predict)
        model.compile(optimizer=optimizers.Adam(lr=self.lr), loss=self.mse)
        return model


def generate_user_item_input(users, items, ratings, data_matrix, batch_size):
    """Yield training batches built from rows and columns of ``data_matrix``.

    ``users``, ``items`` and ``ratings`` are parallel sequences. Each yielded
    value is ``([user_block, item_block], targets)`` where ``user_block``
    stacks ``data_matrix[u]`` and ``item_block`` stacks ``data_matrix[:, i]``
    for the pairs of the batch, and ``targets`` is the matching slice of
    ``ratings``. The last batch may be shorter than ``batch_size``.
    """
    total = len(items)
    num_batches = math.ceil(total / batch_size)
    for batch_id in range(num_batches):
        lo = batch_id * batch_size
        hi = min(total, lo + batch_size)
        positions = range(lo, hi)
        item_block = np.array([data_matrix[:, items[pos]] for pos in positions])
        user_block = np.array([data_matrix[users[pos]] for pos in positions])
        yield [user_block, item_block], ratings[lo:hi]

Training and testing

In [7]:
# Unpack the run configuration.
epochs = args.epochs
# The layer widths are stored as strings such as "[264,64]".
# NOTE(review): eval() executes arbitrary code; acceptable only because `args`
# is defined inside this notebook. Prefer ast.literal_eval if the strings ever
# come from outside.
dmf_user_layers = eval(args.user_layers)
dmf_item_layers = eval(args.item_layers)
batch_size = args.batch_size
data_set = args.dataset
lr = args.lr
topN = args.topN
num_train_negatives = args.num_neg

print(args)


if not os.path.exists('model'):
    os.mkdir('model')
# NOTE(review): the file name is prepared here, but nothing in this cell
# writes the model to it.
model_out_file = 'model/%s_u%s_i%s_%d_%d.h5' % (data_set, str(dmf_user_layers),
                                                str(dmf_item_layers), batch_size, time())

# load data set
dataset = DataSet()

# initialize DMF
dmf = DMF(num_users=dataset.num_users,
          num_items=dataset.num_items,
          user_layers=dmf_user_layers,
          item_layers=dmf_item_layers,
          lr=lr)
model = dmf.get_model()
model.summary()

# Baseline: evaluate the freshly initialised model before any training.
(hits, ndcgs) = evaluate_model(model, dataset.test_ratings, dataset.test_negatives, dataset.data_matrix, topN)
hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), -1
best_hr, best_ndcg = hr, ndcg
best_iter = 0  # 0 = untrained model, n = after the n-th epoch
print('Init: HR = %.4f, NDCG = %.4f' % (hr, ndcg))

for epoch in range(epochs):
    start = time()

    # Generate training instances (negatives are re-sampled every epoch)
    users, items, ratings = dataset.get_train_instances(num_train_negatives)

    print('start training...')

    # The generator yields exactly ceil(len(users) / batch_size) batches,
    # which is what steps_per_epoch asks for.
    history = model.fit_generator(generate_user_item_input(users, items, ratings, dataset.data_matrix, batch_size),
                                  steps_per_epoch=math.ceil(len(users) / batch_size),
                                  epochs=1)

    end = time()
    print('Epoch %d Finished. [%.1f s]' % (epoch + 1, end - start))
    (hits, ndcgs) = evaluate_model(model, dataset.test_ratings, dataset.test_negatives, dataset.data_matrix, topN)
    hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(),  history.history['loss'][0]

    # Report each metric under its own name; the training loss is the MSE.
    print('Epoch %d: HR = %.4f, NDCG = %.4f, loss (MSE) = %.4f'
          % (epoch + 1, hr, ndcg, loss))

    if hr > best_hr:
        best_hr, best_ndcg, best_iter = hr, ndcg, epoch + 1

print('Best epoch %d: HR = %.4f, NDCG = %.4f' % (best_iter, best_hr, best_ndcg))


{'dataset': '2k', 'user_layers': '[264,64]', 'item_layers': '[512,64]', 'epochs': 1, 'lr': 0.001, 'batch_size': 256, 'num_neg': 7, 'topN': 10}
loading rating file: ratings.dat...
number of users: 342, number of items: 38897
splitting train and test data...
getting test instances...
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 38897)        0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 342)          0                                            
__________________________________________________________________________________________________
user_layer1 (Dense)             (None, 264)          10269072    user_input[0][0]                 
_______