# Recommendation engine with content based filtering

<a name="1"></a>
## 1 - Packages

In [1]:
import numpy as np
from numpy import genfromtxt
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
from recsysNN_utils import *
pd.set_option("display.precision", 1)
import mlflow

<a name="2"></a>
## 2 - Movie ratings dataset

The data set is derived from the [MovieLens ml-latest-small](https://grouplens.org/datasets/movielens/latest/) dataset.
The dataset has over 9000 movies rated by  600 users with ratings on a scale of 0.5 to 5 in 0.5 step increments. The dataset has $n_u = 610$ users, $n_m= 9742$ movies and 100836 ratings.

<a name="3"></a>
## 3 - Content-based filtering with a neural network
Content-based filtering uses item features to recommend other items similar to what the user likes, based on their previous actions or explicit feedback.

<a name="3.1"></a>
### 3.1 Training Data

The original features are the year the movie was released and the movie's genre's presented as a one-hot vector. There are 18 genres. The engineered feature is an average rating derived from the user ratings. 

The user content is composed of engineered features. A per genre average rating is computed per user.  

The target, y, is the movie rating given by the user. 

In [2]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment("Recommender system")

<Experiment: artifact_location='mlflow-artifacts:/634769499380944910', creation_time=1692300898967, experiment_id='634769499380944910', last_update_time=1692300898967, lifecycle_stage='active', name='Recommender system', tags={}>

In [3]:
mlflow.tensorflow.autolog()

In [2]:
item_train = genfromtxt('./data/ml-latest-small/content_item_train.csv',delimiter=',')
user_train = genfromtxt('./data/ml-latest-small/content_user_train.csv',delimiter=',')
y_train = genfromtxt('./data/ml-latest-small/content_y_train.csv',delimiter=',')
item_vecs = genfromtxt('./data/ml-latest-small/item_vecs.csv',delimiter=',')

num_user_features = user_train.shape[1] - 1  # remove userid during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time

print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 100836


In [22]:
user_train.shape

(80668, 19)

In [3]:
# scale training data
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

scalerItem = StandardScaler()
item_train = scalerItem.fit_transform(item_train)

scalerUser = StandardScaler()
user_train =  scalerUser.fit_transform(user_train)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))
#ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))



In [4]:
print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(item_train)))
print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train)))

True
True


In [5]:
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (80668, 21)
movie/item test data shape: (20168, 21)


<a name="4"></a>
## 4 - Neural Network for content-based filtering

In [6]:
# GRADED_CELL
# UNQ_C1

mlflow.tensorflow.autolog()

num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    ### START CODE HERE ###     
    tf.keras.layers.Dense(256, activation = 'relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
  
  
    ### END CODE HERE ###  
])

item_NN = tf.keras.models.Sequential([
    ### START CODE HERE ###     
  
    tf.keras.layers.Dense(256, activation = 'relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)  
  
    ### END CODE HERE ###  
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Metal device set to: Apple M1
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 18)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           41888       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           42400       ['input_2[0][0]']                
                                                                

In [7]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
# lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=0.01, decay_steps=1000, decay_rate=0.96)

opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [8]:
# with mlflow.start_run() as run:
tf.random.set_seed(1)
model.fit([user_train[:, 1:], item_train[:, 1:]], y_train, epochs=10,batch_size=128)

2023/09/24 23:05:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '09c62b33d4d7445a85cbc1a6b0c25d1a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow


Epoch 1/10


2023-09-24 23:05:16.182082: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: /var/folders/lm/pf6bl2rd3vs5gtjdk6tsv7mh0000gn/T/tmpmvmkbt2n/model/data/model/assets


INFO:tensorflow:Assets written to: /var/folders/lm/pf6bl2rd3vs5gtjdk6tsv7mh0000gn/T/tmpmvmkbt2n/model/data/model/assets


<keras.callbacks.History at 0x295b58a30>

In [13]:
with mlflow.start_run() as run:
    model.evaluate([user_test[:, 1:], item_test[:, 1:]], y_test)



In [9]:
new_user_id = 50000
new_ave_rating = 5.0
new_year = 2002
new_action = 0.0
new_adventure = 5.0
new_animation = 5.0
new_childrens = 5.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 0.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_Film_Noir = 0.0
new_Musical = 0.0
new_War = 0.0
new_Western = 0.0

user_vec = np.array([[new_user_id,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller,
                      new_Film_Noir,new_Musical, new_War,new_Western]])

In [10]:
user_vec.shape

(1, 19)

In [11]:
# generate and replicate the user vector to match the number movies in the data set.
user_vecs = gen_user_vecs(user_vec,len(item_vecs))

# scale our user and item vectors
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# make a prediction
 
y_p = model.predict([suser_vecs[:, 1:], sitem_vecs[:, 1:]])

# unscale y prediction 
# y_pu = scalerTarget.inverse_transform(y_p)

# # sort the results, highest prediction first
# sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
# sorted_ypu   = y_pu[sorted_index]
# sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display

# print_pred_movies(y_p= sorted_ypu,user=1, item= sorted_items, movie_dict= movie_dict, maxcount = 10)
y_p.shape



(9724, 1)

In [12]:
y_p

array([[ 0.21416663],
       [-0.1850358 ],
       [-0.16582665],
       ...,
       [-0.5327512 ],
       [-0.25379378],
       [-0.25379378]], dtype=float32)

In [16]:
movie_dict = defaultdict(dict)
count = 0

In [17]:
with open('./data/ml-latest-small/content_movie_list.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for line in reader:
        if count == 0:
            count += 1  # skip header
            # print(line)
        else:
            count += 1
            movie_id = int(line[0])
            movie_dict[movie_id]["title"] = line[1]
            movie_dict[movie_id]["genres"] = line[2]

In [18]:
def print_pred_movies(y_p, user, item, movie_dict, maxcount=10):
    """ print results of prediction of a new user. inputs are expected to be in
        sorted order, unscaled. """
    count = 0
    movies_listed = defaultdict(int)
    disp = [["y_p", "movie id", "rating ave", "title", "genres"]]

    for i in range(0, y_p.shape[0]):
        if count == maxcount:
            break
        count += 1
        movie_id = item[i, 0].astype(int)
        if movie_id in movies_listed:
            continue
        movies_listed[movie_id] = 1
        disp.append([y_p[i, 0], item[i, 0].astype(int), item[i, 2].astype(float),
                    movie_dict[movie_id]['title'], movie_dict[movie_id]['genres']])

    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
    return(table)

In [19]:
print_pred_movies(y_p= sorted_ypu,user=1, item= sorted_items, movie_dict= movie_dict, maxcount = 10)

y_p,movie id,rating ave,title,genres
3.9562,1631,5.0,"Assignment, The (1997)",Action|Thriller
3.95451,2075,5.0,Mephisto (1981),Drama|War
3.83733,1151,5.0,Lesson Faust (1994),Animation|Comedy|Drama|Fantasy
3.8232,5008,4.38889,Witness for the Prosecution (1957),Drama|Mystery|Thriller
3.72979,1797,4.25,Everest (1998),Documentary|IMAX
3.70849,5747,4.28571,Gallipoli (1981),Drama|War
3.70544,136834,5.0,The Eye: Infinity (2005),Horror
3.70544,130970,5.0,George Carlin: Life Is Worth Losing (2005),Comedy
3.70544,121781,5.0,Stuart Little 3: Call of the Wild (2005),Animation|Children|Comedy|Fantasy
3.70544,122092,5.0,Guy X (2005),Comedy|War


In [20]:
# GRADED_FUNCTION: sq_dist
# UNQ_C2
def sq_dist(a,b):
    """
    Returns the squared distance between two vectors
    Args:
      a (ndarray (n,)): vector with n features
      b (ndarray (n,)): vector with n features
    Returns:
      d (float) : distance
    """
    ### START CODE HERE ###     
    d = np.sum(np.square(a-b))
    ### END CODE HERE ###     
    return d

In [21]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features))    # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # incorporate normalization as was done in the original model
model_m = tf.keras.Model(input_item_m, vm_m)                                
model_m.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 20)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 32)                42400     
                                                                 
 tf.math.l2_normalize_2 (TFO  (None, 32)               0         
 pLambda)                                                        
                                                                 
Total params: 42,400
Trainable params: 42,400
Non-trainable params: 0
_________________________________________________________________


In [22]:
scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs[:,1:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

size of all predicted movie feature vectors: (9724, 32)


In [19]:
count = 0 # number of movies to display
dim = len(vms)
dist = np.zeros((dim,dim))

for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])
        
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))  # mask the diagonal

disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(item_vecs[i,0])
    movie2_id = int(item_vecs[min_idx,0])
    disp.append( [movie_dict[movie1_id]['title'], movie_dict[movie1_id]['genres'],
                  movie_dict[movie2_id]['title'], movie_dict[movie1_id]['genres']]
               )
table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
table

KeyboardInterrupt: 

In [20]:
# GRADED_FUNCTION: sq_dist
# UNQ_C2
def sq_dist(a,b):
    """
    Returns the squared distance between two vectors
    Args:
      a (ndarray (n,)): vector with n features
      b (ndarray (n,)): vector with n features
    Returns:
      d (float) : distance
    """
    ### START CODE HERE ###     
    d = np.sum(np.square(a-b))
    ### END CODE HERE ###     
    return d

In [21]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features))    # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # incorporate normalization as was done in the original model
model_m = tf.keras.Model(input_item_m, vm_m)                                
model_m.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 20)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 32)                42400     
                                                                 
 tf.math.l2_normalize_3 (TFO  (None, 32)               0         
 pLambda)                                                        
                                                                 
Total params: 42,400
Trainable params: 42,400
Non-trainable params: 0
_________________________________________________________________


In [22]:
scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs[:,1:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

size of all predicted movie feature vectors: (9724, 32)


In [23]:
count = 5  # number of movies to display
dim = len(vms)
dist = np.zeros((dim,dim))

for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])


In [44]:
count = 10

In [45]:
        
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))  # mask the diagonal

disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(item_vecs[i,0])
    movie2_id = int(item_vecs[min_idx,0])
    disp.append( [movie_dict[movie1_id]['title'], movie_dict[movie1_id]['genres'],
                  movie_dict[movie2_id]['title'], movie_dict[movie1_id]['genres']]
               )
table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
table

movie1,genres,movie2,genres.1
Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Howl's Moving Castle (Hauru no ugoku shiro) (2004),Adventure|Animation|Children|Comedy|Fantasy
Grumpier Old Men (1995),Comedy|Romance,"Addams Family, The (1991)",Comedy|Romance
Heat (1995),Action|Crime|Thriller,Eat Drink Man Woman (Yin shi nan nu) (1994),Action|Crime|Thriller
Seven (a.k.a. Se7en) (1995),Mystery|Thriller,Léon: The Professional (a.k.a. The Professional) (Léon) (1994),Mystery|Thriller
"Usual Suspects, The (1995)",Crime|Mystery|Thriller,"Sweet Hereafter, The (1997)",Crime|Mystery|Thriller
From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller,Buffalo '66 (a.k.a. Buffalo 66) (1998),Action|Comedy|Horror|Thriller
Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,"Madness of King George, The (1994)",Adventure|Comedy|Crime|Romance
Braveheart (1995),Action|Drama|War,In the Mood For Love (Fa yeung nin wa) (2000),Action|Drama|War
Rob Roy (1995),Action|Drama|Romance|War,"Browning Version, The (1994)",Action|Drama|Romance|War
Canadian Bacon (1995),Comedy|War,Bushwhacked (1995),Comedy|War


In [None]:
import mlflow
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"

client = MlflowClient(tracking_uri = MLFLOW_TRACKING_URI)

In [52]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids = '634769499380944910',
    filter_string = "metrics.loss < 0.2",
    run_view_type= ViewType.ACTIVE_ONLY,
    max_results = 2,
    order_by = ['metrics.loss ASC']

)


In [55]:
for run in runs:
    print(f"Run id: {run.info.run_id}, Loss: {run.data.metrics['loss']:.4f}")

Run id: 668624f5da394664bcc5285dc750d6e6, Loss: 0.1414
Run id: 1e7ebf9306a046b585e001eb5c66c07d, Loss: 0.1944


In [56]:
run_id = "668624f5da394664bcc5285dc750d6e6"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri,name='recommender_system')

Registered model 'recommender_system' already exists. Creating a new version of this model...
2023/08/20 18:47:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: recommender_system, version 3
Created version '3' of model 'recommender_system'.


<ModelVersion: aliases=[], creation_timestamp=1692553624804, current_stage='None', description='', last_updated_timestamp=1692553624804, name='recommender_system', run_id='668624f5da394664bcc5285dc750d6e6', run_link='', source='mlflow-artifacts:/634769499380944910/668624f5da394664bcc5285dc750d6e6/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='3'>