# Workshop 5: A simple deep-learning (DL) recommendation architecture that learns user and item embeddings and combines them with a dot product

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that the data files
# referenced later (under "/content/drive/My Drive/recsys/") can be read.
# The google.colab package is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow.keras as ks
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [None]:
#%cd /content/drive/MyDrive/recsys
#from demolib import mapdata

In [None]:
# Load the MovieLens ratings file and keep only the user, item and rating columns.
file = "/content/drive/My Drive/recsys/u_data.csv"
ratings_df = pd.read_csv(file)
# the file's four columns are renamed positionally
ratings_df.columns = ['user_id', 'item_id', 'rating', 'datetime']
# drop the timestamp and store item ids as strings
ratings_df = ratings_df.drop(columns=['datetime']).astype({'item_id': str})
print(ratings_df.shape)

In [22]:
# we re-use some code from workshop2 to map the user_id's and item_id's in the raw ratings data to integer indexes
def mapdata(ratings_df):
  """Map raw user and item ids to contiguous integer indexes.

  The ids are cast to str and sorted (so the ordering is lexicographic,
  e.g. "10" sorts before "2"); each distinct id is then numbered from 0 in
  that order. Ratings are converted to float32.

  The input frame is left untouched: all work is done on a copy, so the
  caller keeps the raw ids and calling this again on the same raw frame
  yields the same mapping.

  Args:
    ratings_df: DataFrame with "user_id", "item_id" and "rating" columns.

  Returns:
    A tuple (mapped_df, umap, imap) where mapped_df is a copy of the input
    whose "user_id"/"item_id" columns hold integer indexes, umap maps each
    user id (as str) to its index and imap maps each item id (as str) to
    its index.
  """
  # work on a copy so the caller's frame is not modified as a side effect
  mapped = ratings_df.copy()
  mapped["item_id"] = mapped["item_id"].astype(str)
  mapped["user_id"] = mapped["user_id"].astype(str)
  mapped["rating"] = mapped["rating"].values.astype(np.float32)

  # number the distinct ids in sorted (string) order
  umap = {uid: idx for idx, uid in enumerate(sorted(mapped["user_id"].unique()))}
  imap = {iid: idx for idx, iid in enumerate(sorted(mapped["item_id"].unique()))}

  mapped["user_id"] = mapped["user_id"].map(umap)  # swap userid for user index
  mapped["item_id"] = mapped["item_id"].map(imap)  # swap itemid for item index
  return mapped, umap, imap

In [24]:
# Replace the raw ids in ratings_df with integer indexes.
# umap / imap map each raw user / item id (as a string) to its index.
# NOTE: run this cell once per data load. ratings_df is rebound to the mapped frame, so
# re-running it would index the indexes again (in string sort order) and umap/imap would
# no longer be keyed by the raw ids.
ratings_df, umap, imap = mapdata(ratings_df)

In [25]:
# train/test creation: hold out 20% of the ratings for testing.
# random_state is fixed so the same split is produced on every run.
# (train_test_split is imported in the notebook's import cell.)
train, test = train_test_split(ratings_df, test_size=0.2, random_state=1)
# sanity check: every rating lands in exactly one of the two partitions
assert len(train) + len(test) == len(ratings_df)

In [26]:
# define the model
def definemodel(nusers, nitems, embeddingsdim):
    """Build a dot-product recommender over learned user and item embeddings.

    Each of the two inputs is a single integer index that is looked up in its
    own embedding table; the model output is the dot product of the two
    looked-up vectors, used as the predicted rating.

    Args:
        nusers: number of user indexes; the user table gets nusers + 1 rows.
        nitems: number of item indexes; the item table gets nitems + 1 rows.
        embeddingsdim: length of every embedding vector.

    Returns:
        An uncompiled Keras model whose inputs are [item index, user index]
        (in that order).
    """
    def embedding_branch(index_count, input_name, embedding_name):
        # index -> embedding row -> flat vector of length embeddingsdim.
        # The table is given one row more than index_count, intended as a
        # spare row for ids that do not appear in the training data.
        index_input = ks.layers.Input(shape=[1], name=input_name)
        looked_up = ks.layers.Embedding(index_count + 1, embeddingsdim, name=embedding_name)(index_input)
        return index_input, ks.layers.Flatten()(looked_up)

    # the item branch is built first, then the user branch
    input_items, items_flat = embedding_branch(nitems, "item_input", "item_embedding")
    input_users, users_flat = embedding_branch(nusers, "user_input", "user_embedding")

    # predicted rating = dot product of the item and user vectors
    rating = ks.layers.Dot(name="dot-product", axes=1)([items_flat, users_flat])
    return ks.Model([input_items, input_users], rating)

In [None]:
# Build and compile the model.
# The available metrics are listed at: https://www.tensorflow.org/api_docs/python/tf/keras/metrics
embeddingsdim = 15
model = definemodel(nusers=len(umap), nitems=len(imap), embeddingsdim=embeddingsdim)
# optimise squared error, and additionally report the mean absolute error
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=[ks.metrics.MeanAbsoluteError()],
)
model.summary()

In [None]:
# Train the model, scoring the held-out test split after every epoch.
# For documentation on Keras models (e.g. the model.fit() parameters)
# see https://www.tensorflow.org/api_docs/python/tf/keras/Model

# inputs are passed as [items, users], the order in which definemodel() declares them
hist = model.fit(
    x=[train.item_id, train.user_id],
    y=train.rating,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=([test.item_id, test.user_id], test.rating),
)

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit().
train_loss = hist.history['loss']
val_loss = hist.history['val_loss']
for losses, colour, legend_label in (
    (train_loss, 'r', 'Train Loss'),
    (val_loss, 'b', 'Validation Loss'),
):
    plt.plot(losses, color=colour, label=legend_label)
plt.title("Train and Validation Loss Curve")
plt.legend()
plt.show()

In [None]:
# Predict a rating for every (item, user) pair in the held-out test split.
predictions = model.predict([test.item_id, test.user_id])

# show the first five predictions next to the actual ratings
for i in range(5):
  actual = test.rating.iloc[i]
  predicted = predictions[i][0]
  print("actual=",actual,"pred=",predicted, "abserr=",abs(actual-predicted))

# mean absolute error over the whole test split
print("MAE=",mean_absolute_error(test.rating, predictions))

In [None]:
# Alternative test: let Keras score the held-out split itself,
# using the loss and metrics chosen when the model was compiled.
model.evaluate([test.item_id, test.user_id], test.rating)

In [None]:
# Show the names of the metrics defined for the model; use them to label
# the values returned by model.evaluate().
model.metrics_names

In [None]:
# FYI (optional): repeat the train/test cycle to get a steadier MAE estimate,
# since each retraining may generate a (slightly) different model.
embeddingsdim = 15
maes = []
for j in range(5):
    # start every outer round from a freshly built and compiled model
    ks.backend.clear_session()
    model = definemodel(len(umap), len(imap), embeddingsdim)
    model.compile(optimizer='adam', loss='mean_squared_error')
    weights = model.get_weights()
    reps = 2
    res = 0
    for i in range(reps):
        # each repetition restarts from this round's initial weights
        model.set_weights(weights)
        hist = model.fit([train.item_id, train.user_id], train.rating, batch_size=64, epochs=5, verbose=0)
        predictions = model.predict([test.item_id, test.user_id])
        mae = mean_absolute_error(test.rating, predictions)
        res += mae
    avg_mae = res / reps
    print(j, "avg mae=",avg_mae)
    maes.append(avg_mae)
# one point per outer round: the average MAE of its repetitions
plt.plot(maes)

# Visualizing the Item Embeddings with TensorFlow Embedding Projector

In [15]:
# Load the movie names and build a lookup from item index to movie title.
file = "/content/drive/My Drive/recsys/u_item.csv"
titles = pd.read_csv(file, dtype=str)  # read every column as str so "movie id" matches the str keys of imap
item_indexes = titles["movie id"].map(imap)
titles["itemidx"] = item_indexes
titlelookup = {idx: name for idx, name in zip(titles["itemidx"], titles["movie name"])}

In [None]:
# Pull the learned item-embedding matrix out of the trained model.
# The layer is found by the name it was given in definemodel().
item_em = model.get_layer(name='item_embedding')
# the first weight array of the layer is used as the embedding table
item_em_weights = item_em.get_weights()[0]
item_em_weights.shape

In [17]:
# Save two TSV files for the Embedding Projector: vecs.tsv holds one embedding vector
# per line (tab-separated values) and meta.tsv holds the matching item title on the
# same line number. Both are written to the current working directory.
iids = list(ratings_df.item_id.unique())  # item indexes, in order of first appearance
# `with` closes both files even if a title lookup or a write fails part-way through
with open('vecs.tsv', 'w') as out_v, open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for i in iids:
        title = titlelookup[i]
        embeddings = item_em_weights[i]
        out_m.write(title + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
# List the current working directory to check that vecs.tsv and meta.tsv were written.
# They were saved with relative paths, so they are only in Google Drive if the working
# directory was changed to a Drive folder beforehand.
%ls

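Now download the two files (vecs.tsv and meta.tsv) to your computer. They are written to the notebook's current working directory, so they are only in Google Drive if you changed into a Drive folder first; otherwise download them from the Colab file browser.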

Then go to the TensorFlow Embedding Projector page, wait for the default 
embedding to load, and then click Load to upload your tsv files.

http://projector.tensorflow.org/


# Making recommendations for a given user

In [None]:
# Pick a target user and predict a rating for every item.
# targetuser is a user index (the values held in ratings_df.user_id), and
# pred[k] is the prediction for item index iids[k].
targetuser = 100
user = np.array([targetuser] * len(iids))  # the same user index repeated once per item
pred = model.predict([np.array(iids), user])
pred

In [None]:
# Sort the rating predictions and retrieve the five highest-scoring items.
pred = pred.reshape(-1)  # reshape to a single dimension
top_positions = (-pred).argsort()[0:5]
# argsort returns positions within `pred`, and pred[k] was computed for item index iids[k].
# iids is in order of first appearance (not sorted), so a position is not itself an item
# index and has to be translated back through iids.
preditem_idxs = [iids[pos] for pos in top_positions]
#print(preditem_idxs)

# display the recommendations
for idx in preditem_idxs:
  print(titles[titles.itemidx == idx][["movie id","movie name"]])

In [None]:
# For comparison, list five items the target user has rated most highly in the past.
print("\nItems with high ratings from the target user\n","="*40)
target_ratings = ratings_df[ratings_df.user_id == targetuser]
best_first = target_ratings.sort_values(by=['rating'], ascending=False)
toprated_itemids = best_first.head(5).item_id.values
for i in toprated_itemids:
  print(titlelookup[i])

ASIDE: Another way to use the embeddings is to build a recommendation engine based on similar items (like content-based filtering). In this case the embeddings would be used instead of the content features. We would use a similarity measure to find the items nearest to those the user already likes.


# WORKSHOP EXTENSION  (Optional if time)
Try alternative model architectures and see which gives the best MAE on the test data

In [None]:
# E.g. one alternative model is to concatenate the user and item embeddings and use this as input
# into a 2-layer dense network whose single output unit is the predicted rating.
# The lines below are a fragment, not a runnable cell: items_flat and users_flat only exist inside
# definemodel(), so insert these lines there in place of the Dot layer line that assigns `out`,
# then repeat the model build and test to see if it gives better results!
# NOTE(review): the output layer uses relu, so predictions cannot be negative - confirm this is
# intended rather than a linear output.

out = ks.layers.Concatenate()([items_flat, users_flat])
out = ks.layers.Dense(128, activation='relu')(out)
out = ks.layers.Dense(1, activation='relu')(out)