## Mounting Google Drive

In [None]:
# Mount Google Drive so the dataset stored there is reachable under /content/drive.
# google.colab is only importable inside Google Colab.

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## **Importing Essential Libraries**

In [None]:
# Importing essential libraries for computation

import pandas as pd
import numpy as np
import tensorflow as tf
import math
import heapq
from tqdm import tqdm

## **Loading and Preprocessing Dataset**

In [None]:
# Sanity check of the raw dump: count its lines. The parsing cell that follows
# treats every line of this file as one user record.

with open('/content/drive/My Drive/colab_notes/datasets/australian_users_items.json') as f:
  number_of_users = sum(1 for _ in f)
print(number_of_users)

88310


In [None]:
# Split every line of the dump on commas and keep the raw text fragments per user:
# fragment 0 is stored as 'user_id', fragment 1 as 'items_count' and fragments
# 5 onwards as 'items'. The fragments are NOT decoded values yet; they still
# carry the surrounding key text, which later cells slice off.
# NOTE(review): splitting on ',' presumably misaligns the fragments whenever a
# value itself contains a comma (e.g. a game title) -- confirm against the data
# and consider a real parser for the record format.

count = 0
with open('/content/drive/My Drive/colab_notes/datasets/australian_users_items.json') as f:
  users = dict()
  for line in f:

    line = line.split(',')

    user = dict()

    user['user_id'] = line[0]
    user['items_count'] = line[1]
    user['items'] = line[5:]
    users[count] = user  # keyed by the 0-based line number
    count+=1

In [None]:
# Turn the {line_number: record} mapping into a two-column DataFrame:
# column 0 holds the line number, column 1 holds the per-user record dict.
df = pd.DataFrame(list(users.items()))

In [None]:
# Pull the three raw fields out of the record dicts (column 1 of df) into
# parallel lists, one entry per row of the dataframe.
records = [df[1][row] for row in range(len(df))]
user_id = [record['user_id'] for record in records]
items_count = [record['items_count'] for record in records]
items = [record['items'] for record in records]

In [None]:
# Strip the key text from each raw fragment: drop the first 11 characters and
# every single quote from the user id; drop the first 16 characters from the
# item count and convert the remainder to int. The lists are updated in place.
user_id[:] = [raw_id[11:].replace("'", "") for raw_id in user_id]
items_count[:] = [int(raw_count[16:]) for raw_count in items_count]

In [None]:
# Attach the cleaned user_id and items_count lists to the dataframe as columns
df['user_id'] = user_id
df['items_count'] = items_count

In [None]:
# Preview the first five rows of the DataFrame
df.head()

Unnamed: 0,0,1,user_id,items_count
0,0,"{'user_id': '{'user_id': '76561197970982479'',...",76561197970982479,277
1,1,"{'user_id': '{'user_id': 'js41637'', 'items_co...",js41637,888
2,2,"{'user_id': '{'user_id': 'evcentric'', 'items_...",evcentric,137
3,3,"{'user_id': '{'user_id': 'Riot-Punch'', 'items...",Riot-Punch,328
4,4,"{'user_id': '{'user_id': 'doctr'', 'items_coun...",doctr,541


In [None]:
# Summary statistics of the numeric columns, including the distribution of items_count
df.describe()

Unnamed: 0,0,items_count
count,88310.0,88310.0
mean,44154.5,58.353629
std,25493.045473,122.312095
min,0.0,0.0
25%,22077.25,3.0
50%,44154.5,26.0
75%,66231.75,73.0
max,88309.0,7762.0


In [None]:
# The interaction matrix is very sparse, so keep only heavy users: those owning
# more than 188 items.
# NOTE(review): the describe() output above puts the 75th percentile at 73
# items, so 188 is a much stricter cut-off than "75th percentile" (5451 of the
# 88310 users survive) -- confirm which threshold was intended.
df = df[df['items_count']>188]

In [None]:
# Distribution of items_count among the users that passed the filter
df.describe()

Unnamed: 0,0,items_count
count,5451.0,5451.0
mean,25270.399376,354.778206
std,17244.453144,344.401795
min,0.0,189.0
25%,11126.0,220.0
50%,22532.0,267.0
75%,37643.0,366.0
max,88303.0,7762.0


In [None]:
# Add a consecutive counter column 0..len(df)-1; a later cell makes it the index
idx = list(range(len(df)))
df['idx'] = idx

In [None]:
# Drop column 0 (the original line number) in place and preview the result
df.drop(0, axis=1, inplace=True)
df.head(10)

Unnamed: 0,1,user_id,items_count,idx
0,"{'user_id': '{'user_id': '76561197970982479'',...",76561197970982479,277,0
1,"{'user_id': '{'user_id': 'js41637'', 'items_co...",js41637,888,1
3,"{'user_id': '{'user_id': 'Riot-Punch'', 'items...",Riot-Punch,328,2
4,"{'user_id': '{'user_id': 'doctr'', 'items_coun...",doctr,541,3
5,{'user_id': '{'user_id': 'MinxIsBetterThanPota...,MinxIsBetterThanPotatoes,371,4
6,"{'user_id': '{'user_id': 'NitemarePK'', 'items...",NitemarePK,304,5
7,"{'user_id': '{'user_id': 'themanwich'', 'items...",themanwich,258,6
8,"{'user_id': '{'user_id': 'maplemage'', 'items_...",maplemage,629,7
13,{'user_id': '{'user_id': 'cadmusthreepointoh''...,cadmusthreepointoh,253,8
21,"{'user_id': '{'user_id': 'thequeenpanda'', 'it...",thequeenpanda,524,9


In [None]:
# Make the consecutive 'idx' column the index so rows can be addressed by the
# labels 0..len(df)-1; the column itself is removed from the frame (drop=True).
df.set_index('idx', inplace=True, drop=True)

In [None]:
# Shape of the dataframe as (rows, columns)
df.shape

(5451, 3)

In [None]:
# Extract, for every user, the distinct game-name fragments found in the raw
# 'items' fragments of the record dict (column 1 of df).
#
# Every 4th fragment is read (positions 0, 4, 8, ...); its first 15 characters
# are dropped and the rest is stripped of surrounding whitespace.
# NOTE(review): j stops at items_count while stepping by 4, so only about a
# quarter of the fragments is visited. If each item really spans 4 fragments
# (presumably id, name and two playtimes) the bound should be the fragment
# count instead -- confirm before changing, because it alters the whole
# downstream dataset.

users = list() # One list of distinct game names per user, in df row order

for record, n_items in zip(df[1], df['items_count']):  ## Looping over every user
  fragments = record['items']
  games = list()
  seen = set()  # Mirrors `games` for O(1) duplicate checks
  for j in range(0, int(n_items), 4):
    name = fragments[j][15:].strip()
    if name not in seen:
      seen.add(name)
      games.append(name)
  users.append(games) # Appending every user's played steam video games to users list

In [None]:
# Initializing a list to hold the unique steam video games played by the users;
# the loop in the next cell fills it in first-seen order
video_list = list()

In [None]:
# Collect every distinct game across all users into video_list, in first-seen
# order. A set mirrors video_list so that each membership test is O(1); testing
# against the list itself scans it once per game occurrence.

known_games = set(video_list)

for played in users:  # looping over every user
  for game in played:  # looping over every steam video game played by the user
    if game not in known_games:
      known_games.add(game)
      video_list.append(game)

In [None]:
# Build the 0/1 interaction matrix as nested lists: one row per game in
# video_list, one entry per user, 1 where that user's list contains the game.

# Convert each user's game list to a set once so every lookup below is O(1)
owned_games = [set(played) for played in users]

vid_vec = [[1 if vid in owned else 0 for owned in owned_games]
           for vid in video_list]

In [None]:
# Wrap the nested 0/1 lists in a DataFrame: one row per game, one column per user

vid_df = pd.DataFrame(vid_vec)

In [None]:
# Layout of vid_df:
#   columns: users
#   rows   : steam video games played by the users

vid_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,5411,5412,5413,5414,5415,5416,5417,5418,5419,5420,5421,5422,5423,5424,5425,5426,5427,5428,5429,5430,5431,5432,5433,5434,5435,5436,5437,5438,5439,5440,5441,5442,5443,5444,5445,5446,5447,5448,5449,5450
0,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,1,1,1,1,0,1,0,1,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,0,1,1,1,0,0,...,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Shape of the interaction frame as (number of games, number of users)
vid_df.shape

(7851, 5451)

In [None]:
# Creating a list of tuples  - (user_id, product_id), one per 1 in vid_df.
#
# vid_df has one row per game and one column per user, so np.argwhere on the
# transposed values yields the pairs ordered by user first and game second --
# the same order as looping over columns and then rows, without one scalar
# pandas lookup per cell (about 43 million of them for a 7851 x 5451 frame).

interactions = np.argwhere(vid_df.values.T == 1)
idx_lst = [(int(user_pos), int(game_pos)) for user_pos, game_pos in interactions]

In [None]:
# Number of (user, game) interaction pairs
len(idx_lst)

409325

In [None]:
# Creating a dataframe from the list of tuples.
# Each row is one interaction between a user (reviewer) and a steam video game:
# 'reviewerID' is the first element of the tuple and 'productID' the second.

df_review = pd.DataFrame(idx_lst, columns=['reviewerID', 'productID'])

In [None]:
# First five rows of the interaction dataframe
df_review.head()

Unnamed: 0,reviewerID,productID
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4


#### Helper Functions

In [None]:
def mask_first(x):
    """
    Return an array shaped like x holding 0 for the first item and 1 for all others.

    Args:
        x (array-like): Input values; only their shape and dtype are used.
    Returns:
        The result of np.ones_like(x) with its first entry set to 0. An empty
        input yields an empty array instead of raising IndexError.
    """
    result = np.ones_like(x)

    # Nothing to mask when the input is empty.
    if result.size:
        result[0] = 0

    return result

In [None]:
def train_test_split(df):
    """
    Splits our original data into one test and one
    training set.
    The test set is made up of one item for each user: the first
    item listed for that user. This is our holdout item used to
    compute Top@K later.
    The training set is the same as our original data but
    without the first row of every user.
    Args:
        df (dataframe): Our original data with the columns
            'reviewerID' and 'productID'. It is not modified.
    Returns:
        df_train (dataframe): All rows of df except the first row of each
            reviewer; the original index labels are kept.
        df_test (dataframe): One row per reviewer, sorted by reviewerID,
            with the columns ['reviewerID', 'productID'] and a fresh
            0..n-1 index.
    """

    # Select the first item of every user as the holdout. reset_index turns
    # the group key back into a regular 'reviewerID' column.
    df_test = (
        df.groupby('reviewerID')
        .first()
        .reset_index()[['reviewerID', 'productID']]
    )

    # cumcount numbers the rows inside each reviewer's group starting at 0,
    # so position 0 marks exactly the rows that went into the test set.
    is_holdout = df.groupby('reviewerID').cumcount() == 0
    df_train = df.loc[~is_holdout]

    return df_train, df_test

In [None]:
# Create training and test sets (the test set holds one holdout interaction per reviewer).
df_train, df_test = train_test_split(df_review)

In [None]:
 # Create sorted lists of all unique reviewer ids and product ids in the full interaction data
reviewers = list(np.sort(df_review.reviewerID.unique()))
products = list(np.sort(df_review.productID.unique()))

In [None]:
# Get the reviewer ids (rows) and product ids (columns) of the training interactions as ints.
rows = df_train.reviewerID.astype(int)
cols = df_train.productID.astype(int)

In [None]:
 # Training reviewer ids and product ids as two parallel numpy arrays.
uids = np.array(rows.tolist())
iids = np.array(cols.tolist())

In [None]:
# Naming conventions in the notebook
#   Items = products
#   Users = reviewers
# From here on `items` and `users` are the sorted id lists; these two names no
# longer refer to the objects they held in the preprocessing cells.

items = products
users = reviewers

In [None]:
def get_negatives(uids, iids, items, df_test, num_negatives=100):
    """Returns a pandas dataframe of negative interactions
    for each user in df_test.

    Item ids are drawn uniformly from range(len(items)). A draw is rejected
    when the user has that item in (uids, iids) or when it is the user's own
    holdout item, so the holdout can never appear among its own negatives.
    Draws are independent, so a row may contain the same id more than once.

    Args:
        uids (np.array): Numpy array of all user ids.
        iids (np.array): Numpy array of all item ids, parallel to uids.
        items (list): List of all unique items; only its length is used.
        df_test (dataframe): Our test set with the columns 'reviewerID'
            and 'productID'.
        num_negatives (int): Number of negatives drawn per test pair.
    Returns:
        df_neg (dataframe): One row per (u, i) pair in df_test. Column 0
            holds the (u, i) tuple, the remaining num_negatives columns
            hold the sampled negative item ids.
    Note:
        Sampling never terminates for a user who has no eligible item left.
    """

    test_u = df_test['reviewerID'].values.tolist()
    test_i = df_test['productID'].values.tolist()
    test_ratings = list(zip(test_u, test_i))

    # Known interactions, used to reject sampled items.
    zipped = set(zip(uids, iids))
    n_items = len(items)

    negativeList = []
    for (u, i) in test_ratings:
        negatives = [(u, i)]
        for _ in range(num_negatives):
            j = np.random.randint(n_items) # Get random item id.
            # Redraw on a known interaction and on the holdout itself: the
            # holdout is absent from (uids, iids) when those come from the
            # training split, so it has to be excluded explicitly.
            while (u, j) in zipped or j == i:
                j = np.random.randint(n_items)
            negatives.append(j)
        negativeList.append(negatives)

    df_neg = pd.DataFrame(negativeList)

    return df_neg

In [None]:
# Sample 100 negative interactions for each user in our test data;
# column 0 of df_neg holds the holdout (user, item) pair itself
df_neg = get_negatives(uids, iids, products, df_test)

In [None]:
from random import shuffle

def get_train_instances():
    """Builds the training examples from the training interactions.

    Every positive (user, item) pair in the notebook-level arrays uids/iids
    is followed by num_neg randomly drawn items the user has no training
    interaction with. Reads the notebook-level names uids, iids, products
    and num_neg.
    Returns:
        user_input (list): The user id of every example.
        item_input (list): The item id of every example,
            both positive and negative interactions.
        labels (list): The label of every example: 1 for a positive
            interaction, 0 for a sampled negative.
    """

    observed = set(zip(uids, iids))
    n_products = len(products)

    user_input, item_input, labels = [], [], []

    for u, i in zip(uids, iids):
        # The observed interaction itself
        user_input.append(u)
        item_input.append(i)
        labels.append(1)

        # num_neg sampled items this user has no observed interaction with
        for _ in range(num_neg):
            while True:
                j = np.random.randint(n_products)
                if (u, j) not in observed:
                    break
            user_input.append(u)
            item_input.append(j)
            labels.append(0)

    return user_input, item_input, labels

In [None]:
def random_mini_batches(shuffled_U, shuffled_I, shuffled_L, mini_batch_size=256):
    """Cuts three parallel sequences into consecutive mini batches.

    No shuffling happens here: the batches follow the order of the inputs,
    so pass already-shuffled sequences if random batches are wanted.
    Args:
        shuffled_U (list): All users for every interaction
        shuffled_I (list): All items for every interaction
        shuffled_L (list): All labels for every interaction.
        mini_batch_size (int): Number of interactions per batch.

    Returns:
        mini_batches (list): A list of minibatches containing slices
            of batch users, batch items and batch labels
            [(u, i, l), (u, i, l) ...]. The last batch is shorter when
            the input length is not a multiple of mini_batch_size.
    """

    total = len(shuffled_U)
    full_batches = int(total // mini_batch_size)

    # Start/stop offsets of every complete batch ...
    bounds = [(k * mini_batch_size, k * mini_batch_size + mini_batch_size)
              for k in range(full_batches)]

    # ... plus one shorter batch for whatever is left over.
    if total % mini_batch_size != 0:
        bounds.append((full_batches * mini_batch_size, total))

    return [(shuffled_U[start:stop], shuffled_I[start:stop], shuffled_L[start:stop])
            for start, stop in bounds]

In [None]:
def get_hits(k_ranked, holdout):
    """Return 1 if any element of k_ranked equals holdout, otherwise 0."""

    return 1 if any(item == holdout for item in k_ranked) else 0

In [None]:

def eval_rating(idx, test_ratings, test_negatives, K):
    """Generate ratings for one user in our test set and
    check if the holdout item is among the top K highest scores.

    Reads the notebook-level names session, output_layer, user and item,
    i.e. it scores with whichever graph was built most recently.
    Args:
        idx (int): Current index
        test_ratings (list): Our test set user-item pairs
        test_negatives (list): The negative items sampled for each
            user in our test set. It is not modified.
        K (int): number of top recommendations
    Returns:
        hits (int): 1 if the holdout appeared in our
            top K predicted items. 0 if not.
    """

    map_item_score = {}

    # Get the negative interactions of our user. Work on a copy so that the
    # holdout appended below does not leak into the caller's list.
    items = list(test_negatives[idx])

    # Get the user idx.
    user_idx = test_ratings[idx][0]

    # Get the item idx, i.e. our holdout item.
    holdout = test_ratings[idx][1]

    # Add the holdout to the end of the negative interactions list.
    items.append(holdout)

    # Prepare our user and item arrays for tensorflow.
    predict_user = np.full(len(items), user_idx, dtype='int32').reshape(-1,1)
    np_items = np.array(items).reshape(-1,1)

    # Feed user and items into the TF graph .
    predictions = session.run([output_layer], feed_dict={user: predict_user, item: np_items})

    # Get the predicted scores as a list
    predictions = predictions[0].flatten().tolist()

    # Map predicted score to item id. An item id that occurs more than once
    # in the list ends up with a single entry.
    for i in range(len(items)):
        current_item = items[i]
        map_item_score[current_item] = predictions[i]

    # Get the K highest ranked items as a list
    k_ranked = heapq.nlargest(K, map_item_score, key=map_item_score.get)

    # 1 if the holdout is among them, 0 otherwise.
    hits = get_hits(k_ranked, holdout)

    return hits

In [None]:
def evaluate(df_neg, K=10):
    """Collect the top@K hit indicators for our recommendations.

    The holdout pairs are read from the notebook-level dataframe df_test.
    Args:
        df_neg (dataframe): dataframe containing our holdout items
            and the randomly sampled negative interactions for each
            (user, item) holdout pair. Its first column is dropped
            before the negatives are read.
        K (int): The 'K' number of ranked predictions we want
            our holdout item to be present in.
    Returns:
        hits (list): list of "hits". 1 if the holdout was present in
            the K highest ranked predictions. 0 if not.
    """

    holdout_pairs = list(zip(df_test['reviewerID'].values.tolist(),
                             df_test['productID'].values.tolist()))

    # Everything after the first column is a sampled negative item id.
    negatives_only = df_neg.drop(df_neg.columns[0], axis=1)
    test_negatives = negatives_only.values.tolist()

    # One eval_rating call per holdout pair.
    return [eval_rating(position, holdout_pairs, test_negatives, K)
            for position in range(len(holdout_pairs))]

## **Generalized Collaborative Filtering (GCF)**

In [None]:
#-------------
# HYPERPARAMS
#-------------

num_neg = 4              # negatives sampled per positive by get_train_instances
latent_features = 8      # width of the user and item embeddings
epochs = 20              # number of passes made by the training loop
batch_size = 256
learning_rate = 0.001    # step size of the Adam optimizer



#-------------------------
# TENSORFLOW GRAPH
#-------------------------

# The names user, item, label, output_layer, loss, step and session defined in
# this cell are read as globals by the training loop and by eval_rating.
graph = tf.Graph()

with graph.as_default():

    # Define input placeholders for user, item and label.
    # Each is a column of int32 ids/labels with one row per example.
    user = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))
    item = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))
    label = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))

    # User feature embedding: one row of latent_features values per user
    u_var = tf.Variable(tf.compat.v1.random_normal([len(users), latent_features],
                                         stddev=0.05), name='user_embedding')
    user_embedding = tf.nn.embedding_lookup(u_var, user)

    # Item feature embedding: one row of latent_features values per item
    i_var = tf.Variable(tf.compat.v1.random_normal([len(items), latent_features],
                                         stddev=0.05), name='item_embedding')
    item_embedding = tf.nn.embedding_lookup(i_var, item)

    # Flatten our user and item embeddings.
    user_embedding = tf.keras.layers.Flatten()(user_embedding)
    item_embedding = tf.keras.layers.Flatten()(item_embedding)

    # Multiplying our user and item latent space vectors together (element-wise)
    prediction_matrix = tf.multiply(user_embedding, item_embedding)

    # Our single neuron output layer. No activation is set here; the sigmoid is
    # applied inside the loss below.
    output_layer = tf.keras.layers.Dense(1, 
            kernel_initializer="lecun_uniform",
            name='output_layer')(prediction_matrix)

    # Our loss function as a binary cross entropy. 
    loss = tf.compat.v1.losses.sigmoid_cross_entropy(label, output_layer)

    # Train using the Adam optimizer to minimize our loss.
    opt = tf.compat.v1.train.AdamOptimizer(learning_rate = learning_rate)
    step = opt.minimize(loss)

    # Initialize all tensorflow variables.
    init = tf.compat.v1.global_variables_initializer()

# Open a session on this graph and run the variable initializer.
session = tf.compat.v1.Session(config=None, graph=graph)
session.run(init)

In [None]:

for epoch in range(epochs):

    # Get our training input. Negatives are sampled anew on every call,
    # so each epoch sees a different set of negative examples.
    user_input, item_input, labels = get_train_instances()

    # Generate a list of minibatches of the configured batch size.
    minibatches = random_mini_batches(user_input, item_input, labels,
                                      mini_batch_size=batch_size)

    # This has nothing to do with tensorflow but gives
    # us a nice progress bar for the training. The context manager
    # closes the bar even when a training step raises.
    with tqdm(total=len(minibatches)) as progress:

        # Loop over each batch and feed our users, items and labels
        # into our graph.
        for minibatch in minibatches:
            feed_dict = {user: np.array(minibatch[0]).reshape(-1,1),
                        item: np.array(minibatch[1]).reshape(-1,1),
                        label: np.array(minibatch[2]).reshape(-1,1)}

            # Execute one optimizer step and fetch the batch loss.
            _, l = session.run([step, loss], feed_dict)

            # Update the progress
            progress.update(1)
            progress.set_description('Epoch: %d - Loss: %.3f' % (epoch+1, l))


# Calculate top@K: the mean of the per-user hit indicators
hits = evaluate(df_neg)
print(np.array(hits).mean())

Epoch: 1 - Loss: 0.488: 100%|██████████| 7889/7889 [00:35<00:00, 223.76it/s]
Epoch: 2 - Loss: 0.579: 100%|██████████| 7889/7889 [00:35<00:00, 223.10it/s]
Epoch: 3 - Loss: 0.467: 100%|██████████| 7889/7889 [00:35<00:00, 224.07it/s]
Epoch: 4 - Loss: 0.454: 100%|██████████| 7889/7889 [00:35<00:00, 223.42it/s]
Epoch: 5 - Loss: 0.499: 100%|██████████| 7889/7889 [00:35<00:00, 222.40it/s]
Epoch: 6 - Loss: 0.420: 100%|██████████| 7889/7889 [00:34<00:00, 225.87it/s]
Epoch: 7 - Loss: 0.362: 100%|██████████| 7889/7889 [00:34<00:00, 227.38it/s]
Epoch: 8 - Loss: 0.473: 100%|██████████| 7889/7889 [00:35<00:00, 220.22it/s]
Epoch: 9 - Loss: 0.280: 100%|██████████| 7889/7889 [00:35<00:00, 221.90it/s]
Epoch: 10 - Loss: 0.406: 100%|██████████| 7889/7889 [00:35<00:00, 224.93it/s]
Epoch: 11 - Loss: 0.201: 100%|██████████| 7889/7889 [00:35<00:00, 222.48it/s]
Epoch: 12 - Loss: 0.506: 100%|██████████| 7889/7889 [00:34<00:00, 231.06it/s]
Epoch: 13 - Loss: 0.196: 100%|██████████| 7889/7889 [00:35<00:00, 221.47i

0.7475692533480095


# **Multi-Layer Perceptron (MLP)**

In [None]:

#-------------
# HYPERPARAMS
#-------------

num_neg = 4              # negatives sampled per positive by get_train_instances
epochs = 20              # number of passes made by the training loop
batch_size = 256
learning_rate = 0.001    # step size of the Adam optimizer


#-------------------------
# TENSORFLOW GRAPH
#-------------------------

# Set up our Tensorflow graph.
# The names user, item, label, output_layer, loss, step and session defined in
# this cell are read as globals by the training loop and by eval_rating.
graph = tf.Graph()

with graph.as_default():

    # Define input placeholders for user, item and label.
    user = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))
    item = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))
    label = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))

    # User feature embedding
    u_var = tf.Variable(tf.compat.v1.random_normal([len(users), 32], stddev=0.05), name='user_embedding')
    user_embedding = tf.nn.embedding_lookup(u_var, user)

    # Item feature embedding
    i_var = tf.Variable(tf.compat.v1.random_normal([len(items), 32], stddev=0.05), name='item_embedding')
    item_embedding = tf.nn.embedding_lookup(i_var, item)

    # Flatten our user and item embeddings.
    user_embedding = tf.keras.layers.Flatten()(user_embedding)
    item_embedding = tf.keras.layers.Flatten()(item_embedding)

    # Concatenate our two embedding vectors together
    concatenated = tf.keras.layers.concatenate([user_embedding, item_embedding])

    # Add a first dropout layer.
    dropout = tf.keras.layers.Dropout(0.2)(concatenated)

    # Below we add our four hidden layers along with batch
    # normalization and dropouts. We use relu as the activation function.
    # Each block feeds the next one through its dropout output, exactly as in
    # the MLP branch of the combined model; wiring layer_2/layer_3 straight to
    # the previous Dense output would leave the normalization and dropout
    # layers disconnected from the prediction.
    # NOTE(review): no training flag is passed to the Dropout and
    # BatchNormalization layers, so how they behave under session.run depends
    # on the Keras learning phase -- confirm this is the intended mode.
    layer_1 = tf.keras.layers.Dense(64, activation='relu', name='layer1')(dropout)
    batch_norm1 = tf.keras.layers.BatchNormalization(name='batch_norm1')(layer_1)
    dropout1 = tf.keras.layers.Dropout(0.2, name='dropout1')(batch_norm1)

    layer_2 = tf.keras.layers.Dense(32, activation='relu', name='layer2')(dropout1)
    batch_norm2 = tf.keras.layers.BatchNormalization(name='batch_norm2')(layer_2)
    dropout2 = tf.keras.layers.Dropout(0.2, name='dropout2')(batch_norm2)

    layer_3 = tf.keras.layers.Dense(16, activation='relu', name='layer3')(dropout2)
    layer_4 = tf.keras.layers.Dense(8, activation='relu', name='layer4')(layer_3)

    # Our final single neuron output layer. No activation is set here; the
    # sigmoid is applied inside the loss below.
    output_layer = tf.keras.layers.Dense(1,
            kernel_initializer="lecun_uniform",
            name='output_layer')(layer_4)

    # Define our loss function as binary cross entropy, averaged over the batch.
    # The int32 labels are cast to float first.
    labels = tf.cast(label, tf.float32)
    logits = output_layer
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                labels=labels,
                logits=logits))

    # Train using the Adam optimizer to minimize our loss.
    opt = tf.compat.v1.train.AdamOptimizer(learning_rate = learning_rate)
    step = opt.minimize(loss)

    # Initialize all tensorflow variables.
    init = tf.compat.v1.global_variables_initializer()

# Open a session on this graph and run the variable initializer.
session = tf.compat.v1.Session(config=None, graph=graph)
session.run(init)

In [None]:
for epoch in range(epochs):

    # Get our training input. Negatives are sampled anew on every call,
    # so each epoch sees a different set of negative examples.
    user_input, item_input, labels = get_train_instances()

    # Generate a list of minibatches of the configured batch size.
    minibatches = random_mini_batches(user_input, item_input, labels,
                                      mini_batch_size=batch_size)

    # This has nothing to do with tensorflow but gives
    # us a nice progress bar for the training. The context manager
    # closes the bar even when a training step raises.
    with tqdm(total=len(minibatches)) as progress:

        # Loop over each batch and feed our users, items and labels
        # into our graph.
        for minibatch in minibatches:
            feed_dict = {user: np.array(minibatch[0]).reshape(-1,1),
                        item: np.array(minibatch[1]).reshape(-1,1),
                        label: np.array(minibatch[2]).reshape(-1,1)}

            # Execute one optimizer step and fetch the batch loss.
            _, l = session.run([step, loss], feed_dict)

            # Update the progress
            progress.update(1)
            progress.set_description('Epoch: %d - Loss: %.3f' % (epoch+1, l))


# Calculate top@K: the mean of the per-user hit indicators
hits = evaluate(df_neg)
print(np.array(hits).mean())

Epoch: 1 - Loss: 0.565: 100%|██████████| 7889/7889 [01:24<00:00, 93.17it/s]
Epoch: 2 - Loss: 0.582: 100%|██████████| 7889/7889 [01:25<00:00, 92.42it/s]
Epoch: 3 - Loss: 0.612: 100%|██████████| 7889/7889 [01:23<00:00, 94.27it/s]
Epoch: 4 - Loss: 0.328: 100%|██████████| 7889/7889 [01:24<00:00, 93.51it/s]
Epoch: 5 - Loss: 0.359: 100%|██████████| 7889/7889 [01:25<00:00, 92.45it/s]
Epoch: 6 - Loss: 0.318: 100%|██████████| 7889/7889 [01:22<00:00, 95.61it/s]
Epoch: 7 - Loss: 0.452: 100%|██████████| 7889/7889 [01:23<00:00, 94.31it/s]
Epoch: 8 - Loss: 0.281: 100%|██████████| 7889/7889 [01:23<00:00, 94.41it/s]
Epoch: 9 - Loss: 0.154: 100%|██████████| 7889/7889 [01:22<00:00, 95.09it/s]
Epoch: 10 - Loss: 0.125: 100%|██████████| 7889/7889 [01:22<00:00, 95.31it/s]
Epoch: 11 - Loss: 0.087: 100%|██████████| 7889/7889 [01:23<00:00, 94.21it/s]
Epoch: 12 - Loss: 0.397: 100%|██████████| 7889/7889 [01:25<00:00, 92.63it/s]
Epoch: 13 - Loss: 0.071: 100%|██████████| 7889/7889 [01:23<00:00, 94.86it/s]
Epoch: 1

0.7387635296275913


# **Combined (Generalized Collaborative Filtering and Multi-Layer Perceptron)**

In [None]:
#-------------
# HYPERPARAMS
#-------------

num_neg = 6              # negatives sampled per positive by get_train_instances
latent_features = 8      # width of the GMF user and item embeddings
epochs = 20              # number of passes made by the training loop
batch_size = 256
learning_rate = 0.01     # step size of the Adam optimizer



#-------------------------
# TENSORFLOW GRAPH
#-------------------------

# The names user, item, label, output_layer, loss, step and session defined in
# this cell are read as globals by the training loop and by eval_rating.
graph = tf.Graph()

with graph.as_default():

    # Define input placeholders for user, item and label.
    user = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))
    item = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))
    label = tf.compat.v1.placeholder(tf.int32, shape=(None, 1))

    # User embedding for MLP
    mlp_u_var = tf.Variable(tf.compat.v1.random_normal([len(users), 32], stddev=0.05),
            name='mlp_user_embedding')
    mlp_user_embedding = tf.nn.embedding_lookup(mlp_u_var, user)

    # Item embedding for MLP
    mlp_i_var = tf.Variable(tf.compat.v1.random_normal([len(items), 32], stddev=0.05),
            name='mlp_item_embedding')
    mlp_item_embedding = tf.nn.embedding_lookup(mlp_i_var, item)

    # User embedding for GMF
    gmf_u_var = tf.Variable(tf.compat.v1.random_normal([len(users), latent_features],
        stddev=0.05), name='gmf_user_embedding')
    gmf_user_embedding = tf.nn.embedding_lookup(gmf_u_var, user)

    # Item embedding for GMF
    gmf_i_var = tf.Variable(tf.compat.v1.random_normal([len(items), latent_features],
        stddev=0.05), name='gmf_item_embedding')
    gmf_item_embedding = tf.nn.embedding_lookup(gmf_i_var, item)

    # Our GMF layers: element-wise product of the flattened embeddings
    gmf_user_embed = tf.keras.layers.Flatten()(gmf_user_embedding)
    gmf_item_embed = tf.keras.layers.Flatten()(gmf_item_embedding)
    gmf_matrix = tf.multiply(gmf_user_embed, gmf_item_embed)

    # Our MLP layers: concatenated embeddings followed by four dense layers
    mlp_user_embed = tf.keras.layers.Flatten()(mlp_user_embedding)
    mlp_item_embed = tf.keras.layers.Flatten()(mlp_item_embedding)
    mlp_concat = tf.keras.layers.concatenate([mlp_user_embed, mlp_item_embed])

    mlp_dropout = tf.keras.layers.Dropout(0.2)(mlp_concat)

    mlp_layer_1 = tf.keras.layers.Dense(64, activation='relu', name='layer1')(mlp_dropout)
    mlp_batch_norm1 = tf.keras.layers.BatchNormalization(name='batch_norm1')(mlp_layer_1)
    mlp_dropout1 = tf.keras.layers.Dropout(0.2, name='dropout1')(mlp_batch_norm1)

    # The second normalization/dropout pair gets its own layer names instead
    # of reusing 'batch_norm1' and 'dropout1'.
    mlp_layer_2 = tf.keras.layers.Dense(32, activation='relu', name='layer2')(mlp_dropout1)
    mlp_batch_norm2 = tf.keras.layers.BatchNormalization(name='batch_norm2')(mlp_layer_2)
    mlp_dropout2 = tf.keras.layers.Dropout(0.2, name='dropout2')(mlp_batch_norm2)

    mlp_layer_3 = tf.keras.layers.Dense(16, activation='relu', name='layer3')(mlp_dropout2)
    mlp_layer_4 = tf.keras.layers.Dense(8, activation='relu', name='layer4')(mlp_layer_3)

    # We merge the two networks together
    merged_vector = tf.keras.layers.concatenate([gmf_matrix, mlp_layer_4])

    # Our final single neuron output layer. No activation is set here; the
    # sigmoid is applied inside the loss below.
    output_layer = tf.keras.layers.Dense(1,
            kernel_initializer="lecun_uniform",
            name='output_layer')(merged_vector)

    # Our loss function as a binary cross entropy. 
    loss = tf.compat.v1.losses.sigmoid_cross_entropy(label, output_layer)

    # Train using the Adam optimizer to minimize our loss.
    opt = tf.compat.v1.train.AdamOptimizer(learning_rate = learning_rate)
    step = opt.minimize(loss)

    # Initialize all tensorflow variables.
    init = tf.compat.v1.global_variables_initializer()


# Open a session on this graph and run the variable initializer.
session = tf.compat.v1.Session(config=None, graph=graph)
session.run(init)

In [None]:
for epoch in range(epochs):

    # Get our training input. Negatives are sampled anew on every call,
    # so each epoch sees a different set of negative examples.
    user_input, item_input, labels = get_train_instances()

    # Generate a list of minibatches of the configured batch size.
    minibatches = random_mini_batches(user_input, item_input, labels,
                                      mini_batch_size=batch_size)

    # This has nothing to do with tensorflow but gives
    # us a nice progress bar for the training. The context manager
    # closes the bar even when a training step raises.
    with tqdm(total=len(minibatches)) as progress:

        # Loop over each batch and feed our users, items and labels
        # into our graph.
        for minibatch in minibatches:
            feed_dict = {user: np.array(minibatch[0]).reshape(-1,1),
                        item: np.array(minibatch[1]).reshape(-1,1),
                        label: np.array(minibatch[2]).reshape(-1,1)}

            # Execute one optimizer step and fetch the batch loss.
            _, l = session.run([step, loss], feed_dict)

            # Update the progress
            progress.update(1)
            progress.set_description('Epoch: %d - Loss: %.3f' % (epoch+1, l))


# Calculate top@K: the mean of the per-user hit indicators
hits = evaluate(df_neg)
print(np.array(hits).mean())

Epoch: 1 - Loss: 0.356: 100%|██████████| 11044/11044 [02:11<00:00, 84.18it/s]
Epoch: 2 - Loss: 0.455: 100%|██████████| 11044/11044 [02:08<00:00, 86.14it/s]
Epoch: 3 - Loss: 0.440: 100%|██████████| 11044/11044 [02:06<00:00, 87.01it/s]
Epoch: 4 - Loss: 0.328: 100%|██████████| 11044/11044 [02:07<00:00, 86.82it/s]
Epoch: 5 - Loss: 0.296: 100%|██████████| 11044/11044 [02:05<00:00, 88.18it/s]
Epoch: 6 - Loss: 0.279: 100%|██████████| 11044/11044 [02:07<00:00, 86.35it/s]
Epoch: 7 - Loss: 0.353: 100%|██████████| 11044/11044 [02:08<00:00, 85.99it/s]
Epoch: 8 - Loss: 0.224: 100%|██████████| 11044/11044 [02:06<00:00, 87.60it/s]
Epoch: 9 - Loss: 0.282: 100%|██████████| 11044/11044 [02:05<00:00, 87.77it/s]
Epoch: 10 - Loss: 0.171: 100%|██████████| 11044/11044 [02:05<00:00, 87.80it/s]
Epoch: 11 - Loss: 0.232: 100%|██████████| 11044/11044 [02:08<00:00, 86.07it/s]
Epoch: 12 - Loss: 0.113: 100%|██████████| 11044/11044 [02:05<00:00, 88.21it/s]
Epoch: 13 - Loss: 0.251: 100%|██████████| 11044/11044 [02:05<

0.7189506512566501
