<a href="https://colab.research.google.com/github/CharistaSC/oop/blob/main/Week13_DL_based_RS_and_MLOps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 13 Lab: DL based RS and MLOps

## Section 1: Building a DL based RS

In [None]:
!pip install scikit-surprise

Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data: https://surpriselib.com/

In [None]:
from surprise import Dataset
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils import shuffle

### We first need to define a data loader

In [None]:
class Loader():
    """Iterate over paired NumPy arrays in mini-batches of torch tensors.

    ``x`` and ``y`` must be NumPy arrays with the same number of rows. Every
    row is served exactly once per epoch; the final batch is smaller than
    ``batch_size`` when the row count is not an exact multiple of it.

    Args:
        x: Feature array, indexed along its first axis.
        y: Target array aligned row-for-row with ``x``.
        batch_size: Maximum number of rows per batch (must be positive).
        is_shuffle: When true, rows are permuted at construction and again
            at the start of every epoch (each ``iter()`` call). When false,
            rows are always served in their original order.

    Raises:
        ValueError: If ``batch_size`` is not positive or the arrays differ
            in length.
    """

    def __init__(self, x, y, batch_size=2048, is_shuffle=True):
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if len(x) != len(y):
            raise ValueError("x and y must contain the same number of rows")

        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.is_shuffle = is_shuffle
        self.current = 0
        # Ceiling division so a trailing partial batch is counted too.
        self.batches = -(-len(y) // batch_size)
        # Fixed seed keeps runs reproducible; the generator state advances, so
        # successive epochs still see different orders.
        self._rng = np.random.RandomState(0)

        # If is_shuffle, we re-shuffle on every epoch (see __iter__)
        if is_shuffle:
            self._reshuffle()

    def _reshuffle(self):
        """Apply one random permutation to ``x`` and ``y`` together."""
        order = self._rng.permutation(len(self.y))
        # Fancy indexing copies, so the caller's arrays are never reordered.
        self.x = self.x[order]
        self.y = self.y[order]

    def __iter__(self):
        # Reset and return a new iterator
        if self.is_shuffle:
            self._reshuffle()
        self.current = 0
        return self

    def __len__(self):
        # Return the number of batches
        return self.batches

    def __next__(self):
        start = self.current

        # Stop only once every row has been served, so the last (possibly
        # shorter) batch is not dropped.
        if start >= len(self.y):
            raise StopIteration

        stop = start + self.batch_size
        xs = torch.from_numpy(self.x[start:stop])
        ys = torch.from_numpy(self.y[start:stop])

        self.current = stop

        return (xs, ys)

### Let us build a Matrix Factorization (MF) model using PyTorch

(Optional) Homework: You can try other models, such as a Multi-Layer Perceptron (MLP) or Neural Collaborative Filtering (NCF/NeuMF).

In [None]:
class MF(nn.Module):
    """Matrix factorization with user/item bias terms and L2 regularization.

    The predicted rating for a (user, item) pair is the dot product of their
    ``k``-dimensional embeddings plus a per-user bias, a per-item bias and a
    single global bias.

    Args:
        n_user: Number of distinct users (size of the user embedding tables).
        n_item: Number of distinct items (size of the item embedding tables).
        k: Embedding size (latent dimension).
        c_vector: L2 penalty weight applied to the user/item embeddings.
        c_bias: L2 penalty weight applied to the user/item biases.
    """

    def __init__(self, n_user, n_item, k=18, c_vector=1.0, c_bias=1.0):
        super().__init__()
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_bias = c_bias
        self.c_vector = c_vector

        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)

        # We add new terms here: one scalar bias per user and per item
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)

        # Global bias shared by every prediction
        self.bias = nn.Parameter(torch.ones(1))

    def forward(self, train_x):
        """Predict one rating per row of ``train_x``.

        Args:
            train_x: Integer tensor of shape (batch, 2); column 0 holds user
                indices and column 1 holds item indices.

        Returns:
            Tensor of shape (batch,) with the predicted ratings.
        """
        user_id = train_x[:, 0]
        item_id = train_x[:, 1]

        vector_user = self.user(user_id)
        vector_item = self.item(item_id)

        # Pull out biases. Only the trailing size-1 axis is squeezed so that a
        # single-row batch keeps its batch dimension.
        bias_user = self.bias_user(user_id).squeeze(-1)
        bias_item = self.bias_item(item_id).squeeze(-1)

        biases = self.bias + bias_user + bias_item

        # Row-wise dot product of the user and item embeddings
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)

        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction

    def loss(self, prediction, target):
        """Return the MSE between prediction and target plus the L2 penalties.

        Args:
            prediction: Output of ``forward``.
            target: Ground-truth ratings with the same number of elements as
                ``prediction`` (a trailing size-1 axis is accepted).

        Returns:
            Scalar tensor: MSE + c_bias * (bias penalties) + c_vector *
            (embedding penalties).
        """

        def l2_regularize(array):
            return torch.sum(array ** 2)

        # Align shapes explicitly so mse_loss never broadcasts by accident.
        loss_mse = F.mse_loss(prediction, target.reshape(prediction.shape))

        # Add new regularization to the biases
        prior_bias_user = l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias

        prior_user = l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector

        total = loss_mse + prior_bias_user + prior_bias_item + prior_user + prior_item
        return total

### Finally, we define the main() function

In [None]:
def main():
    """Train the MF model on MovieLens-1M for one pass and save its weights.

    The full built-in 'ml-1m' dataset is used as the training set. The learned
    parameters are written to ``./recommendation_model_pytorch.pkl`` as a
    state dict.
    """
    # Get data
    data = Dataset.load_builtin('ml-1m')
    trainset = data.build_full_trainset()
    # One row per rating: (user id, item id, rating) as yielded by the trainset
    uir = np.array(list(trainset.all_ratings()))

    train_x = uir[:, :2].astype(np.int64)
    train_y = uir[:, 2].astype(np.float32)

    # Define parameters
    lr = 1e-1  # learning rate
    k = 10  # embedding size or latent dimension
    c_bias = 1e-6
    c_vector = 1e-6
    batch_size = 2048

    model = MF(trainset.n_users, trainset.n_items, k=k, c_bias=c_bias, c_vector=c_vector)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    dataloader = Loader(train_x, train_y, batch_size=batch_size)

    model.train()
    for itr, (batch_x, batch_y) in enumerate(dataloader, start=1):
        prediction = model(batch_x)
        loss = model.loss(prediction, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() prints the plain number instead of the tensor repr
        print(f"iteration: {itr}. training loss: {loss.item()}")

    torch.save(model.state_dict(), "./recommendation_model_pytorch.pkl")

if __name__=='__main__':
    main()

### Now we're ready to do inference

For inference, we will use most of the code from before (especially the model definition).

In [None]:
from surprise import Dataset
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import pprint

In [None]:
# This function aims to get the top-N recommendations

def get_top_n(model, testset, trainset, uid_input, movies_df, n=10):
    """Return the ``n`` highest-scored candidate movies for one user.

    Args:
        model: Callable that maps an integer tensor of shape (batch, 2) holding
            (inner user id, inner item id) rows to one score per row.
        testset: Iterable of (raw user id, raw item id, rating) triples that
            lists the candidate pairs to score.
        trainset: Object providing ``to_inner_uid`` and ``to_inner_iid``.
        uid_input: Raw id of the user to recommend for.
        movies_df: DataFrame indexed by integer movie id with a 'name' column.
        n: Maximum number of recommendations to return.

    Returns:
        List of (raw item id, movie name, predicted rating) tuples sorted by
        predicted rating, highest first. Empty if the user is unknown or has
        no candidate items.
    """
    # Surprise signals an unknown raw id with ValueError; KeyError is kept for
    # trainset-like objects that use plain dict lookups.
    unknown_id = (KeyError, ValueError)

    try:
        uid_input = int(trainset.to_inner_uid(uid_input))
    except unknown_id:
        return []

    # We first collect the candidate items that belong to the requested user.
    raw_iids, names, pairs = [], [], []
    for uid, iid, _ in testset:
        try:
            if int(trainset.to_inner_uid(uid)) != uid_input:
                continue
            iid_internal = int(trainset.to_inner_iid(iid))
        except unknown_id:
            continue

        try:
            movie_name = movies_df.loc[int(iid), 'name']
        except KeyError:
            # No title available for this item: skip it
            continue

        raw_iids.append(iid)
        names.append(movie_name)
        pairs.append([uid_input, iid_internal])

    if not pairs:
        return []

    # Score every candidate in one forward pass; no gradients are needed.
    with torch.no_grad():
        scores = model(torch.tensor(pairs, dtype=torch.long)).reshape(-1).tolist()

    preds = list(zip(raw_iids, names, scores))

    # Then, we sort by predicted rating (not by title) and keep the n highest
    # ones (i.e., top-N recommendations)
    preds.sort(key=lambda pred: pred[2], reverse=True)
    return preds[:n]

In [None]:
# This function aims to get previously seen items (i.e., items that the user has already rated in the past)

def get_previously_seen(trainset, uid, movies_df, n=10):
    """Return the names of up to ``n`` movies the user has already rated.

    Args:
        trainset: Object providing ``to_inner_uid``, ``to_raw_iid`` and the
            ``ur`` mapping of inner user id -> list of (inner item id, rating).
        uid: Raw id of the user, i.e. the same kind of id ``get_top_n`` takes.
        movies_df: DataFrame indexed by integer movie id with a 'name' column.
        n: Maximum number of titles to return.

    Returns:
        List of movie names; empty when the user is unknown.
    """
    # Surprise signals an unknown id with ValueError; KeyError is kept for
    # trainset-like objects that use plain dict lookups.
    unknown_id = (KeyError, ValueError)

    try:
        # Translate the raw id first: trainset.ur is keyed by inner ids.
        inner_uid = int(trainset.to_inner_uid(uid))
    except unknown_id:
        return []

    seen = []
    for (iid, _) in trainset.ur[inner_uid]:
        if len(seen) >= n:
            break
        try:
            raw_iid = trainset.to_raw_iid(iid)
        except unknown_id:
            continue
        try:
            seen.append(movies_df.loc[int(raw_iid), 'name'])
        except KeyError:
            # No title available for this item: skip it
            pass
    return seen

In [None]:
def main():
    """Load the trained MF model and print recommendations for sample users.

    For every sample user, prints some movies the user has already rated
    followed by the names of the recommended movies.
    """
    data = Dataset.load_builtin('ml-1m')

    import os
    # '~' resolves to the current user's home directory (/root on Colab).
    files_dir = os.path.expanduser("~/.surprise_data/ml-1m/ml-1m/")

    movies_df = pd.read_csv(os.path.join(files_dir, 'movies.dat'), sep="::", header=None, engine='python', encoding='latin-1')
    movies_df.columns = ['iid', 'name', 'genre']
    movies_df.set_index('iid', inplace=True)

    trainset = data.build_full_trainset()
    # Every (user, item) pair that has no rating in the trainset
    testset = trainset.build_anti_testset()

    # These must match the values used when the model was trained
    k = 10  # embedding size or latent dimension
    c_bias = 1e-6
    c_vector = 1e-6

    model = MF(trainset.n_users, trainset.n_items, k=k, c_bias=c_bias, c_vector=c_vector)
    model.load_state_dict(torch.load('./recommendation_model_pytorch.pkl'))
    model.eval()

    # Let us print the recommended items for some sample users
    sample_users = ['100', '196', '200']

    for uid in sample_users:

        print('User:', uid)
        print('\n')

        print('\tSeen:')
        seen = get_previously_seen(trainset, uid, movies_df)
        pprint.pprint(seen)
        print('\n')

        print('\tRecommendations:')
        recommended = get_top_n(model, testset, trainset, uid, movies_df, n=10)
        # Each entry's second field is the movie name
        pprint.pprint([x[1] for x in recommended])
        print('\n')



if __name__=="__main__":
    main()

## Section 2: Model Serving and MLOps

Flask is a micro web framework written in Python. It uses function decorators to map URL routes to the Python functions that handle them. We first show how a simple service works, and then show how to load a model (e.g., one built with PyTorch) and serve it as well.

In [None]:
 # load Flask 
import flask
import time
import os

# Create the Flask application object; routes are registered on it with @app.route.
app = flask.Flask(__name__)

Unlike running Flask on a local machine, Google Colab runs your code on a virtual machine (VM), so the Flask server's default address and port, localhost:5000, cannot be accessed directly. The solution is to expose this address through a public URL. To do this, a short piece of code must be run before starting your server.

In [None]:
# Evaluate Colab's proxyPort(5000) in the notebook front end and print the result,
# i.e. the public address to use for the server listening on port 5000.
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(5000)"))

In [None]:
# Everything below runs once at start-up so that requests only pay for scoring.
start_time = time.time()

# Load data
data = Dataset.load_builtin('ml-1m')
trainset = data.build_full_trainset()
# Every (user, item) pair that has no rating in the trainset
testset = trainset.build_anti_testset()

# '~' resolves to the current user's home directory (/root on Colab).
files_dir = os.path.expanduser("~/.surprise_data/ml-1m/ml-1m/")

movies_df = pd.read_csv(os.path.join(files_dir, 'movies.dat'), sep="::", header=None, engine='python', encoding='latin-1')
movies_df.columns = ['iid', 'name', 'genre']
movies_df.set_index('iid', inplace=True)

# Load model (hyper-parameters must match the ones used for training)
k = 10
c_bias = 1e-6
c_vector = 1e-6

model = MF(trainset.n_users, trainset.n_items, k=k, c_bias=c_bias, c_vector=c_vector)
model.load_state_dict(torch.load('./recommendation_model_pytorch.pkl'))
model.eval()

print('Model and data preloading completed in ', time.time()-start_time)

@app.route("/", methods=["GET"])
def recommend():
    """Serve recommendations for the user given by the ``uid`` query parameter.

    Returns:
        JSON object with ``success``. On success it also carries ``uid``,
        ``seen`` (movies already rated) and ``recommended`` (movie names).
        On failure it carries an ``error`` message instead.
    """
    data = {"success": False}

    if "uid" not in flask.request.args:
        data["error"] = "missing required query parameter 'uid'"
        return flask.jsonify(data)

    data['uid'] = str(flask.request.args['uid'])

    try:
        seen = get_previously_seen(trainset, data['uid'], movies_df)
        recommended = get_top_n(model, testset, trainset, data['uid'], movies_df, n=10)
    except Exception:
        # Request boundary: keep answering with JSON, but record the traceback
        # instead of silently discarding it.
        app.logger.exception("Recommendation failed for uid=%s", data['uid'])
        data["error"] = "could not compute recommendations for this uid"
        return flask.jsonify(data)

    data['seen'] = seen
    # Each entry's second field is the movie name
    data['recommended'] = [x[1] for x in recommended]
    data["success"] = True

    return flask.jsonify(data)
    
# Start the Flask app when this code runs as the main module.
# host='0.0.0.0' allows remote connections; no port is passed, so Flask's
# default port is used.
if __name__ == '__main__':
    app.run(host='0.0.0.0')

Now that we have launched a Flask web app, how can we get the recommendation results for user IDs 100, 196, 200, etc.?

(Optional) Homework: try to build another similar recommender system using Flask with the Jester dataset: https://eigentaste.berkeley.edu/dataset/. You can also use other built-in datasets available in Surprise (https://surpriselib.com/).

# End of Lab!

Reference:

[1] University of Illinois Chicago. MLOps 2020.