In [32]:
import gzip
import random
import json
import scipy
import numpy as np
import tensorflow as tf
from collections import defaultdict
import os
import math

# Data.py

In [2]:
# put the "dataset" folder in the root directory
base = 'dataset'
if base not in os.listdir('.'):
    print("dataset folder not found.")
    # break # made available in the final py file
else:
    interactions_Jewelry_train = os.path.join(base, 'interactions_Jewelry_train.json')
    interactions_Jewelry_train_aux = os.path.join(base, 'interactions_Jewelry_train_aux.json')
    interactions_Jewelry_train_record = os.path.join(base, 'interactions_Jewelry_train_record_aux.json')
    interactions_Jewelry_train_time = os.path.join(base, 'interactions_Jewelry_train_time_aux.json')
    interactions_Jewelry_validate = os.path.join(base, 'interactions_Jewelry_validate.json')
    interactions_Jewelry_test = os.path.join(base, 'interactions_Jewelry_test.json')
CNN_AES = "CNN_AES_feature.txt"
id2num_dict = "id2num_dict_Jewelry.json"
if CNN_AES not in os.listdir(os.path.join('.', base, "features")) or id2num_dict not in os.listdir(os.path.join('.', base, "id2num_dict")):
    print("CNN_AES and id2num_dict folders not found.")
    # break
else:
    CNN_AES = os.path.join(base, "features", CNN_AES)
    id2num_dict = os.path.join(base, "id2num_dict", id2num_dict)

Storing User, Item, Time tuples in `interactionTrain`

In [3]:
userIDs = set()
itemIDs = set()
#interactionsTrain = []
user_to_item = {}
user_time_to_item = {}
time_to_item = {}

with open(interactions_Jewelry_train) as json_file:
    data = json.load(json_file)

In [4]:
for d in data[:50]:
    u = d[0]
    i = d[1]
    r = d[2]
    #interactionsTrain.append((u,i,r))
    userIDs.add(u)
    itemIDs.add(i)
    if u in user_to_item:
        user_to_item[u].add(i)
    else:
        user_to_item[u] = {i}
    if (u,r) in user_time_to_item:
        user_time_to_item[(u,r)].add(i)
    else:
        user_time_to_item[(u,r)] = {i}
    if r in time_to_item:
        time_to_item[r].add(i)
    else:
        time_to_item[r] = {i}


Storing CNN-AES Feature in `cnn`

In [5]:
with open(CNN_AES) as cnn_txt:
    cnn = cnn_txt.readlines()

Read id2num_dict in which contains item id and the dictionary index

In [6]:
with open(id2num_dict) as id2num_dict_json:
    id2num = id2num_dict_json.readlines()

*From Library.py*
Functions for:
- Model Evaluation
- Data Reading

In [7]:
import numpy as np
from numpy import *
from xlrd import open_workbook
from xlutils.copy import copy
import json

"""
Evaluation Metrics
"""

def evaluation_F1(order, top_k, positive_item):
    e = 0.00000000000001
    top_k_items = set(order[0: top_k])
    positive_item = set(positive_item)
    precision = len(top_k_items & positive_item) / (len(top_k_items) + e)
    recall = len(top_k_items & positive_item) / (len(positive_item) + e)
    F1 = 2 * precision * recall / (precision + recall + e)
    return F1

def evaluation_NDCG(order, top_k, positive_item):
    top_k_item = order[0: top_k]
    e = 0.0000000001
    Z_u = 0
    temp = 0
    for i in range(0, top_k):
        Z_u += 1 / log2(i + 2)
        if top_k_item[i] in positive_item:
            temp += 1 / log2(i + 2)
    NDCG = temp / (Z_u + e)
    return NDCG

def save_result(intro, F1, NDCG, path):
    rexcel = open_workbook(path)
    rows = rexcel.sheets()[0].nrows
    excel = copy(rexcel)
    table = excel.get_sheet(0)
    row = rows
    table.write(row, 0, intro)
    #table.write(row, 2, 'F1')
    for i in range(len(F1)):
        table.write(row, i + 3, F1[i])
    #table.write(row, len(F1) + 4, 'NDCG')
    for i in range(len(NDCG)):
        table.write(row, i + len(F1) + 5, NDCG[i])
    excel.save(path)

"""
Read Data
"""
def readdata(dataset):
    #file paths
    path_train = interactions_Jewelry_train
    path_train_aux = interactions_Jewelry_train_aux
    path_validate = interactions_Jewelry_validate
    path_test = interactions_Jewelry_test
    # read files
    with open(path_train) as f:
        line = f.readline()
        train_data = json.loads(line)
    f.close()
    P = 0
    Q = 0
    for [u, i, r] in train_data:
        if u > P:
            P = u
        if i > Q:
            Q = i
    with open(path_train_aux) as f:
        line = f.readline()
        train_data_aux = json.loads(line)
    f.close()
    with open(path_validate) as f:
        line = f.readline()
        validate_data = json.loads(line)
    f.close()
    with open(path_test) as f:
        line = f.readline()
        test_data = json.loads(line)
    f.close()
    return train_data, train_data_aux, validate_data, test_data, P + 1, Q + 1 # P: last user_id, last item_id

def readdata_time(dataset):
    #file paths
    path_train_record_aux = interactions_Jewelry_train_record
    path_train_time_aux = interactions_Jewelry_train_time
    # read files
    with open(path_train_record_aux) as f:
        line = f.readline()
        train_record_aux = json.loads(line)
    f.close()
    with open(path_train_time_aux) as f:
        line = f.readline()
        train_time_aux = json.loads(line)
    f.close()
    return train_record_aux, train_time_aux, len(train_time_aux)

def read_feature(feature, dataset, Q):
    path_feature = CNN_AES
    path_dict = id2num_dict
    with open(path_dict) as f:
        line = f.readline()
        item_i2num_dict = json.loads(line)
    f.close()
    f = open(path_feature, 'r')
    line = eval(f.readline())
    feature = line[1]
    K = len(feature)
    F = np.zeros((Q, K))
    for i in range(0, Q):
        F[i] = feature
    for line in f:
        line = eval(line)
        item_id = line[0]
        feature = line[1]
        try:
            item_num = item_i2num_dict[item_id]
            F[item_num] = feature
        except:
            continue
    return F

def get_feature(dataset):
    # to load features
    feat_list = ['CNN', 'AES', 'CH', 'CNN_AES']             # feature list
    F = read_feature(feat_list[feat[0]], dataset, Q)
    for i in range(1, len(feat)):
        F = np.hstack((F, read_feature(feat_list[feat[i]], dataset, Q)))
    return F

In [8]:
# setup
feat = [3]                          # feature selecting, 0 for CNN, 1 for AES, 2 for CH, 3 for CNN+AES
dataset = 5                         # Datasets selecting 0 to 5 for 'All', '_Women', '_Men', '_CLothes', '_Shoes', '_Jewelry' respectively
dataset_list = ['', '_Women', '_Men', '_CLothes', '_Shoes', '_Jewelry']
# load data
train_data, train_data_aux, validate_data, test_data, P, Q = readdata(dataset_list[dataset])
# load data for tensor factorization
train_record_aux, train_time_aux, R = readdata_time(dataset_list[dataset])
# load features
F = get_feature(dataset_list[dataset]) # CNN_AES Features

# Model.py

In [4]:
class DCFA(tf.keras.Model):
    def __init__(self, P, Q, R, I, J, F, reg=1.5, mom=0.1):
        """
        Initialization of the Visually_based Bayesian Personalized Ranking Model with Aesthetic Features

        :param P: Number of Users
        :param Q: Number of Items
        :param R: Number of Time Intervals
        :param I: Dimension of each Latent Items
        :param J: Dimension of each Latent Items
        :param F: The CNN-AES Feature Matrix
        :param reg: regularize lambda
        :param mom: momentum gamma
        """
        super(DCFA, self).__init__()
        # Latent Items
        self.U = tf.Variable(np.array([np.array([(random.random() / math.sqrt(I)) for j in range(I)]) for i in range(P)]))
        print("Dimension of U: ", self.U.shape)
        self.V = tf.Variable(np.array([np.array([(random.random() / math.sqrt(I)) for j in range(I)]) for i in range(Q)]))
        print("Dimension of V: ", self.V.shape)
        self.W = tf.Variable(np.array([np.array([(random.random() / math.sqrt(J)) for j in range(J)]) for i in range(Q)]))
        print("Dimension of W: ", self.W.shape)
        self.T = tf.Variable(np.array([np.array([(random.random() / math.sqrt(J)) for j in range(J)]) for i in range(R)]))
        print("Dimension of T: ", self.T.shape)
        # Extract CNN-AES Features
        self.F, self.K = F, len(F[0])
        print("Dimension of F: ", self.F.shape)
        print("Size of F: ", self.K)
        self.M = tf.Variable(np.array([np.array([(random.random() / math.sqrt(self.K)) for j in range(self.K)]) for i in range(P)]))
        print("Dimension of M: ", self.M.shape)
        self.N = tf.Variable(np.array([np.array([(random.random() / math.sqrt(self.K)) for j in range(self.K)]) for i in range(R)]))
        print("Dimension of N: ", self.N.shape)
        # regularize coefficient and momentum coefficient
        self.reg = reg
        self.mom = mom
        # self.top_k = [5, 10, 20, 50, 100]

    def score(self, u, v, r):
        """
        Given a (user, item, time) tuple, return the score associated with the relevancy of the given item to given user
        at given timestamp

        :param u: user_id
        :param v: item_id
        :param r: timestamp
        :return: BPR score for the (user, item, time) tuple
        """
        # B = (tf.expand_dims(self.U[u], axis=0) @ tf.transpose(self.V) + tf.expand_dims(self.M[u], axis=0) @ tf.transpose(self.F))[0][v]
        # C = (tf.expand_dims(self.T[r], axis=0) @ tf.transpose(self.W) + tf.expand_dims(self.N[r], axis=0) @ tf.transpose(self.F))[0][v]
        #print(B.shape, C.shape)
        #return tf.tensordot(B, C, axes=0), B, C
        # print("score" + str(B*C))
        # return B*C

        B = tf.expand_dims(self.U[u], axis=0) @ tf.transpose(self.V) + tf.expand_dims(self.M[u], axis=0) @ tf.transpose(self.F)
        C = tf.expand_dims(self.T[r], axis=0) @ tf.transpose(self.W) + tf.expand_dims(self.N[r], axis=0) @ tf.transpose(self.F)
        print("B Shape: ", B.shape)
        print("C Shape: ", C.shape)
        print("A Shape: ", (tf.squeeze(B)*tf.squeeze(C)).shape)
        return (tf.squeeze(B)*tf.squeeze(C))[v], tf.squeeze(B)[v], tf.squeeze(C)[v]

    def score_batch(self, u, r):
        """
        Given a (user, time) pair, return an array of scores associated with the given user and given time
        :param u:
        :param r:
        :return:
        """
        B = tf.expand_dims(self.U[u], axis=0) @ tf.transpose(self.V) + tf.expand_dims(self.M[u], axis=0) @ tf.transpose(self.F)
        C = tf.expand_dims(self.T[r], axis=0) @ tf.transpose(self.W) + tf.expand_dims(self.N[r], axis=0) @ tf.transpose(self.F)
        return tf.squeeze(B)*tf.squeeze(C), tf.squeeze(B), tf.squeeze(C)

    def call(self, u, v, v_prime, r):
        """
        Calculate the BPR_OPT distance, which is a metric that measures the scoring distance between positive sample
        and the negative sample of the given (user, item, time) tuple

        :param u: user_id
        :param v: item_id
        :param v_prime: negative sample of item_id
        :param r: timestamp
        :return: loss value for the given tuple
        """
        A_i, B_i, C_i = self.score(u, v, r)
        A_j, B_j, C_j = self.score(u, v_prime, r)

        print("B_ij: ", B_i-B_j)
        print("C_ij: ", C_i-C_j)

        A_loss = np.log(tf.keras.activations.sigmoid(A_i-A_j))
        B_loss = np.log(tf.keras.activations.sigmoid(B_i-B_j))
        C_loss = np.log(tf.keras.activations.sigmoid(C_i-C_j))
        # B_loss = tf.keras.activations.sigmoid(B_i-B_j)
        # C_loss = tf.keras.activations.sigmoid(C_i-C_j)

        print("A loss: ", A_loss)
        print("B loss: ", B_loss)
        print("C loss: ", C_loss)

        return A_loss + self.mom * B_loss + self.mom * C_loss
        # TODO: Fix Lambda regularization term (i.e. self.reg and self.mom)

    def reg(self):
        """
        Return the regularization value for the current latent terms

        :return: regularization term
        """
        return self.lamb * (tf.reduce_sum(self.betaU**2) + tf.reduce_sum(self.betaI**2) +
                            tf.reduce_sum(self.gammaU**2) + tf.reduce_sum(self.gammaI**2))

In [11]:
from random import randrange
# Negative sample
item_size = len(itemIDs)
index = randrange(item_size)
itemIDs_list = list(itemIDs)
negative_item = itemIDs_list[index]

while negative_item in user_to_item[12308]:
    item_size = len(itemIDs)
    index = randrange(item_size)
    negative_item = itemIDs_list[index]
    del itemIDs_list[index]


# Engine.py

In [33]:
class engine():
    def __init__(self, learning_rate=1e-3, batch_size=64, k=5, feature=('CNN')):
        """

        :param learning_rate:
        :param batch_size:
        :param k:
        :param feature: tuple of strings, default: ['CNN'], alternatives: ['CNN', 'AES', 'CH', 'CNN_AES']
        """
        self.optimizer = None
        self.model = None

        # define hyperparameter
        self.k = k
        self.batch_size = batch_size
        self.lr = learning_rate
        self.feat = feature

        # save data
        self.userIDs = set()
        self.itemIDs = set()
        self.user_to_item = {}
        self.user_time_to_item = {}
        self.time_to_item = {}
        self.cnn = None
        self.id2num = None
        self.train_data = None
        self.train_data_aux = None
        self.validate_data = None
        self.test_data = None
        self.train_record_aux = None
        self.train_time_aux = None
        self.R = 0
        self.P = 0
        self.Q = 0
        self.F = None
        self.K = None

        # Fill data structures
        self.interactions_Jewelry_train, self.interactions_Jewelry_train_aux, self.interactions_Jewelry_train_record, self.interactions_Jewelry_train_time, self.interactions_Jewelry_validate, self.interactions_Jewelry_test, self.CNN_AES, self.id2num_dict = self.read_data()

        self.read_feature_data(self.Q)

        # Initialize Dataset
        self.train_data, self.test_data = self.read_interaction_data(self.interactions_Jewelry_train), self.read_interaction_data(self.interactions_Jewelry_test)

        self.create_model()

    def train_batch(self):
        # TODO:
        pass

    def test_batch(self):
        # TODO:
        pass


    def read_data(self, base='dataset'):
        # put the "dataset" folder in the root directory
        base = base
        if base not in os.listdir('.'):
            print("dataset folder not found.")
            # break # made available in the final py file
        else:
            interactions_Jewelry_train = os.path.join(base, 'interactions_Jewelry_train.json')
            interactions_Jewelry_train_aux = os.path.join(base, 'interactions_Jewelry_train_aux.json')
            interactions_Jewelry_train_record = os.path.join(base, 'interactions_Jewelry_train_record_aux.json')
            interactions_Jewelry_train_time = os.path.join(base, 'interactions_Jewelry_train_time_aux.json')
            interactions_Jewelry_validate = os.path.join(base, 'interactions_Jewelry_validate.json')
            interactions_Jewelry_test = os.path.join(base, 'interactions_Jewelry_test.json')
        CNN_AES = "CNN_AES_feature.txt"
        id2num_dict = "id2num_dict_Jewelry.json"

        if CNN_AES not in os.listdir(os.path.join('.', base, "features")) or id2num_dict not in os.listdir(os.path.join('.', base, "id2num_dict")):
            print("CNN_AES and id2num_dict folders not found.")
            # break
        else:
            CNN_AES = os.path.join(base, "features", CNN_AES)
            id2num_dict = os.path.join(base, "id2num_dict", id2num_dict)
        return interactions_Jewelry_train, interactions_Jewelry_train_aux, interactions_Jewelry_train_record, interactions_Jewelry_train_time, interactions_Jewelry_validate, interactions_Jewelry_test, CNN_AES, id2num_dict


    def read_interaction_data(self, filePath, data_size=50):
        print(f"Start reading the interaction data {filePath}...")
        with open(filePath) as json_file:
            data = json.load(json_file)

        for d in data[:data_size]:
            u = d[0]
            i = d[1]
            r = d[2]
            #interactionsTrain.append((u,i,r))

            self.userIDs.add(u)
            if type(i) == list:
                i = i[0]
            self.itemIDs.add(i)
            if type(r) == list:
                r = r[0]
            if u in self.user_to_item:
                self.user_to_item[u].add(i)
            else:
                self.user_to_item[u] = {i}
            if (u,r) in self.user_time_to_item:
                self.user_time_to_item[(u,r)].add(i)
            else:
                self.user_time_to_item[(u,r)] = {i}
            if r in self.time_to_item:
                self.time_to_item[r].add(i)
            else:
                self.time_to_item[r] = {i}

    def read_feature(self, feature, dataset, Q):
        path_feature = self.CNN_AES
        path_dict = self.id2num_dict
        with open(path_dict) as f:
            line = f.readline()
            item_i2num_dict = json.loads(line)
        f.close()
        f = open(path_feature, 'r')
        line = eval(f.readline())
        feature = line[1]
        self.K = len(feature)
        F = np.zeros((Q, self.K))
        for i in range(0, Q):
            F[i] = feature
        for line in f:
            line = eval(line)
            item_id = line[0]
            feature = line[1]
            try:
                item_num = item_i2num_dict[item_id]
                F[item_num] = feature
            except:
                continue
        return F

    def get_feature(self, dataset, Q):

        # feat_list = ['CNN', 'AES', 'CH', 'CNN_AES']             # feature list
        F = self.read_feature(self.feat[0], dataset, Q)
        for i in range(1, len(self.feat)):
            F = np.hstack((F, self.read_feature(self.feat[i], dataset, Q)))
        return F

    def read_feature_data(self, Q):
        """
        Main Feature Loading Function

        :param Q: number of users
        """
        print("Start getting feature data...")
        with open(self.CNN_AES) as cnn_txt:
            self.cnn = cnn_txt.readlines()
        with open(self.id2num_dict) as id2num_dict_json:
            self.id2num = id2num_dict_json.readlines()

        # setup
        feat = [3]                          # feature selecting, 0 for CNN, 1 for AES, 2 for CH, 3 for CNN+AES
        dataset = 5                         # Datasets selecting 0 to 5 for 'All', '_Women', '_Men', '_CLothes', '_Shoes', '_Jewelry' respectively
        dataset_list = ['', '_Women', '_Men', '_Clothes', '_Shoes', '_Jewelry']
        # load data
        self.train_data, self.train_data_aux, self.validate_data, self.test_data = self.readdata(dataset_list[dataset])
        # load data for tensor factorization
        self.train_record_aux, self.train_time_aux = self.readdata_time(dataset_list[dataset])
        # load features
        self.F = self.get_feature(dataset_list[dataset], Q) # CNN_AES Features
        print("Populated the feature data...")

    def readdata(self, dataset):
        #file paths
        path_train = self.interactions_Jewelry_train
        path_train_aux = self.interactions_Jewelry_train_aux
        path_validate = self.interactions_Jewelry_validate
        path_test = self.interactions_Jewelry_test
        # read files
        with open(path_train) as f:
            line = f.readline()
            train_data = json.loads(line)
        f.close()
        self.P = 0
        self.Q = 0
        for [u, i, r] in train_data:
            if u > self.P:
                self.P = u
            if i > self.Q:
                self.Q = i
        with open(path_train_aux) as f:
            line = f.readline()
            train_data_aux = json.loads(line)
        f.close()
        with open(path_validate) as f:
            line = f.readline()
            validate_data = json.loads(line)
        f.close()
        with open(path_test) as f:
            line = f.readline()
            test_data = json.loads(line)
        f.close()
        return train_data, train_data_aux, validate_data, test_data # P: last user_id, last item_id

    def readdata_time(self, dataset):
        #file paths
        path_train_record_aux = self.interactions_Jewelry_train_record
        path_train_time_aux = self.interactions_Jewelry_train_time
        # read files
        with open(path_train_record_aux) as f:
            line = f.readline()
            train_record_aux = json.loads(line)
        f.close()
        with open(path_train_time_aux) as f:
            line = f.readline()
            train_time_aux = json.loads(line)
        f.close()
        self.Q = len(train_time_aux)
        return train_record_aux, train_time_aux

    def create_model(self):
        """
        Generate the model
        """
        self.model = DCFA(P=self.P, Q=self.Q, R=self.R, I=200, J=200, F=self.F, reg=1.5, mom=0.1)

    #def create_negative(self):

In [34]:
unlimited_power = engine()

Start getting feature data...
Start reading the interaction data dataset/interactions_Jewelry_train.json...
Start reading the interaction data dataset/interactions_Jewelry_test.json...
Debug:  [2, 1]
Debug:  [8196, 3]
Debug:  [6, 1]
Debug:  [8206, 2]
Debug:  [19, 1]
Debug:  [22, 2]
Debug:  [30, 2]
Debug:  [8226, 5]
Debug:  [8227, 1]
Debug:  [36, 3]
Debug:  [41, 1]
Debug:  [2935, 1]
Debug:  [15026, 1]
Debug:  [5469, 1]
Debug:  [8241, 2]
Debug:  [8244, 1]
Debug:  [8248, 1]
Debug:  [57, 2]
Debug:  [8251, 2]
Debug:  [2741, 1]
Debug:  [66, 1]
Debug:  [12299, 1]
Debug:  [8261, 2]
Debug:  [8262, 1]
Debug:  [1377, 2]
Debug:  [12300, 1]
Debug:  [8267, 2]
Debug:  [80, 1]
Debug:  [8280, 2]
Debug:  [8281, 1]
Debug:  [90, 3]
Debug:  [8286, 1]
Debug:  [96, 2]
Debug:  [8208, 1]
Debug:  [1382, 3]
Debug:  [8296, 3]
Debug:  [6844, 1]
Debug:  [8300, 1]
Debug:  [8301, 2]
Debug:  [120, 3]
Debug:  [123, 3]
Debug:  [819, 1]
Debug:  [127, 2]
Debug:  [128, 3]
Debug:  [8321, 1]
Debug:  [130, 2]
Debug:  [12310, 

IndexError: index 0 is out of bounds for axis 0 with size 0

In [32]:
unlimited_power.model.score(12308, 7, 182)

B Shape:  (1, 3607)
C Shape:  (1, 3607)
A Shape:  (3607,)


(<tf.Tensor: shape=(), dtype=float64, numpy=0.06838241247835937>,
 <tf.Tensor: shape=(), dtype=float64, numpy=0.25331095412157273>,
 <tf.Tensor: shape=(), dtype=float64, numpy=0.2699544230745753>)

In [33]:
unlimited_power.model.call(12308, 0, negative_item, 182)

B Shape:  (1, 3607)
C Shape:  (1, 3607)
A Shape:  (3607,)
B Shape:  (1, 3607)
C Shape:  (1, 3607)
A Shape:  (3607,)
B_ij:  tf.Tensor(0.041065853788441437, shape=(), dtype=float64)
C_ij:  tf.Tensor(0.040162517684509236, shape=(), dtype=float64)
A loss:  -0.6823751915501121
B loss:  -0.6728250393985218
C loss:  -0.6732675366461449


-0.8169844491545787